src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "argv-util.h"
  43 #include "async.h"
  44 #include "barrier.h"
  45 #include "bpf-lsm.h"
  46 #include "cap-list.h"
  47 #include "capability-util.h"
  48 #include "cgroup-setup.h"
  49 #include "chase-symlinks.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "creds-util.h"
  53 #include "data-fd-util.h"
  54 #include "def.h"
  55 #include "env-file.h"
  56 #include "env-util.h"
  57 #include "errno-list.h"
  58 #include "escape.h"
  59 #include "execute.h"
  60 #include "exit-status.h"
  61 #include "fd-util.h"
  62 #include "fileio.h"
  63 #include "format-util.h"
  64 #include "glob-util.h"
  65 #include "hexdecoct.h"
  66 #include "io-util.h"
  67 #include "ioprio-util.h"
  68 #include "label.h"
  69 #include "log.h"
  70 #include "macro.h"
  71 #include "manager.h"
  72 #include "manager-dump.h"
  73 #include "memory-util.h"
  74 #include "missing_fs.h"
  75 #include "missing_ioprio.h"
  76 #include "mkdir-label.h"
  77 #include "mount-util.h"
  78 #include "mountpoint-util.h"
  79 #include "namespace.h"
  80 #include "parse-util.h"
  81 #include "path-util.h"
  82 #include "process-util.h"
  83 #include "random-util.h"
  84 #include "recurse-dir.h"
  85 #include "rlimit-util.h"
  86 #include "rm-rf.h"
  87 #if HAVE_SECCOMP
  88 #include "seccomp-util.h"
  89 #endif
  90 #include "securebits-util.h"
  91 #include "selinux-util.h"
  92 #include "signal-util.h"
  93 #include "smack-util.h"
  94 #include "socket-util.h"
  95 #include "sort-util.h"
  96 #include "special.h"
  97 #include "stat-util.h"
  98 #include "string-table.h"
  99 #include "string-util.h"
 100 #include "strv.h"
 101 #include "syslog-util.h"
 102 #include "terminal-util.h"
 103 #include "tmpfile-util.h"
 104 #include "umask-util.h"
 105 #include "unit-serialize.h"
 106 #include "user-util.h"
 107 #include "utmp-wtmp.h"
 108
/* Two idle timeouts, 5s and 1s, expressed in microseconds. Their users are not in the portion of the
 * file that precedes the confirmation helpers. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) that connect_logger_as() asks for on the journal stream socket. */
#define SNDBUF_SIZE (8*1024*1024)
 113
 114 static int shift_fds(int fds[], size_t n_fds) {
 115         if (n_fds <= 0)
 116                 return 0;
 117
 118         /* Modifies the fds array! (sorts it) */
 119
 120         assert(fds);
 121
 122         for (int start = 0;;) {
 123                 int restart_from = -1;
 124
 125                 for (int i = start; i < (int) n_fds; i++) {
 126                         int nfd;
 127
 128                         /* Already at right index? */
 129                         if (fds[i] == i+3)
 130                                 continue;
 131
 132                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 133                         if (nfd < 0)
 134                                 return -errno;
 135
 136                         safe_close(fds[i]);
 137                         fds[i] = nfd;
 138
 139                         /* Hmm, the fd we wanted isn't free? Then
 140                          * let's remember that and try again from here */
 141                         if (nfd != i+3 && restart_from < 0)
 142                                 restart_from = i;
 143                 }
 144
 145                 if (restart_from < 0)
 146                         break;
 147
 148                 start = restart_from;
 149         }
 150
 151         return 0;
 152 }
 153
 154 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 155         size_t n_fds;
 156         int r;
 157
 158         n_fds = n_socket_fds + n_storage_fds;
 159         if (n_fds <= 0)
 160                 return 0;
 161
 162         assert(fds);
 163
 164         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 165          * O_NONBLOCK only applies to socket activation though. */
 166
 167         for (size_t i = 0; i < n_fds; i++) {
 168
 169                 if (i < n_socket_fds) {
 170                         r = fd_nonblock(fds[i], nonblock);
 171                         if (r < 0)
 172                                 return r;
 173                 }
 174
 175                 /* We unconditionally drop FD_CLOEXEC from the fds,
 176                  * since after all we want to pass these fds to our
 177                  * children */
 178
 179                 r = fd_cloexec(fds[i], false);
 180                 if (r < 0)
 181                         return r;
 182         }
 183
 184         return 0;
 185 }
 186
 187 static const char *exec_context_tty_path(const ExecContext *context) {
 188         assert(context);
 189
 190         if (context->stdio_as_fds)
 191                 return NULL;
 192
 193         if (context->tty_path)
 194                 return context->tty_path;
 195
 196         return "/dev/console";
 197 }
 198
 199 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 200         const char *path;
 201
 202         assert(context);
 203
 204         path = exec_context_tty_path(context);
 205
 206         if (context->tty_vhangup) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) terminal_vhangup_fd(p->stdin_fd);
 209                 else if (path)
 210                         (void) terminal_vhangup(path);
 211         }
 212
 213         if (context->tty_reset) {
 214                 if (p && p->stdin_fd >= 0)
 215                         (void) reset_terminal_fd(p->stdin_fd, true);
 216                 else if (path)
 217                         (void) reset_terminal(path);
 218         }
 219
 220         if (p && p->stdin_fd >= 0)
 221                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
 222
 223         if (context->tty_vt_disallocate && path)
 224                 (void) vt_disallocate(path);
 225 }
 226
 227 static bool is_terminal_input(ExecInput i) {
 228         return IN_SET(i,
 229                       EXEC_INPUT_TTY,
 230                       EXEC_INPUT_TTY_FORCE,
 231                       EXEC_INPUT_TTY_FAIL);
 232 }
 233
 234 static bool is_terminal_output(ExecOutput o) {
 235         return IN_SET(o,
 236                       EXEC_OUTPUT_TTY,
 237                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 238                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 239 }
 240
 241 static bool is_kmsg_output(ExecOutput o) {
 242         return IN_SET(o,
 243                       EXEC_OUTPUT_KMSG,
 244                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 245 }
 246
 247 static bool exec_context_needs_term(const ExecContext *c) {
 248         assert(c);
 249
 250         /* Return true if the execution context suggests we should set $TERM to something useful. */
 251
 252         if (is_terminal_input(c->std_input))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_output))
 256                 return true;
 257
 258         if (is_terminal_output(c->std_error))
 259                 return true;
 260
 261         return !!c->tty_path;
 262 }
 263
 264 static int open_null_as(int flags, int nfd) {
 265         int fd;
 266
 267         assert(nfd >= 0);
 268
 269         fd = open("/dev/null", flags|O_NOCTTY);
 270         if (fd < 0)
 271                 return -errno;
 272
 273         return move_fd(fd, nfd, false);
 274 }
 275
 276 static int connect_journal_socket(
 277                 int fd,
 278                 const char *log_namespace,
 279                 uid_t uid,
 280                 gid_t gid) {
 281
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         const char *j;
 285         int r;
 286
 287         j = log_namespace ?
 288                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 289                 "/run/systemd/journal/stdout";
 290
 291         if (gid_is_valid(gid)) {
 292                 oldgid = getgid();
 293
 294                 if (setegid(gid) < 0)
 295                         return -errno;
 296         }
 297
 298         if (uid_is_valid(uid)) {
 299                 olduid = getuid();
 300
 301                 if (seteuid(uid) < 0) {
 302                         r = -errno;
 303                         goto restore_gid;
 304                 }
 305         }
 306
 307         r = connect_unix_path(fd, AT_FDCWD, j);
 308
 309         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 310            an LSM interferes. */
 311
 312         if (uid_is_valid(uid))
 313                 (void) seteuid(olduid);
 314
 315  restore_gid:
 316         if (gid_is_valid(gid))
 317                 (void) setegid(oldgid);
 318
 319         return r;
 320 }
 321
 322 static int connect_logger_as(
 323                 const Unit *unit,
 324                 const ExecContext *context,
 325                 const ExecParameters *params,
 326                 ExecOutput output,
 327                 const char *ident,
 328                 int nfd,
 329                 uid_t uid,
 330                 gid_t gid) {
 331
 332         _cleanup_close_ int fd = -1;
 333         int r;
 334
 335         assert(context);
 336         assert(params);
 337         assert(output < _EXEC_OUTPUT_MAX);
 338         assert(ident);
 339         assert(nfd >= 0);
 340
 341         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 342         if (fd < 0)
 343                 return -errno;
 344
 345         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 346         if (r < 0)
 347                 return r;
 348
 349         if (shutdown(fd, SHUT_RD) < 0)
 350                 return -errno;
 351
 352         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 353
 354         if (dprintf(fd,
 355                 "%s\n"
 356                 "%s\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n"
 360                 "%i\n"
 361                 "%i\n",
 362                 context->syslog_identifier ?: ident,
 363                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 364                 context->syslog_priority,
 365                 !!context->syslog_level_prefix,
 366                 false,
 367                 is_kmsg_output(output),
 368                 is_terminal_output(output)) < 0)
 369                 return -errno;
 370
 371         return move_fd(TAKE_FD(fd), nfd, false);
 372 }
 373
 374 static int open_terminal_as(const char *path, int flags, int nfd) {
 375         int fd;
 376
 377         assert(path);
 378         assert(nfd >= 0);
 379
 380         fd = open_terminal(path, flags | O_NOCTTY);
 381         if (fd < 0)
 382                 return fd;
 383
 384         return move_fd(fd, nfd, false);
 385 }
 386
 387 static int acquire_path(const char *path, int flags, mode_t mode) {
 388         _cleanup_close_ int fd = -1;
 389         int r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return TAKE_FD(fd);
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402
 403         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 404
 405         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 406         if (fd < 0)
 407                 return -errno;
 408
 409         r = connect_unix_path(fd, AT_FDCWD, path);
 410         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 411                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 412                  * wasn't an AF_UNIX socket after all */
 413                 return -ENXIO;
 414         if (r < 0)
 415                 return r;
 416
 417         if ((flags & O_ACCMODE) == O_RDONLY)
 418                 r = shutdown(fd, SHUT_WR);
 419         else if ((flags & O_ACCMODE) == O_WRONLY)
 420                 r = shutdown(fd, SHUT_RD);
 421         else
 422                 r = 0;
 423         if (r < 0)
 424                 return -errno;
 425
 426         return TAKE_FD(fd);
 427 }
 428
 429 static int fixup_input(
 430                 const ExecContext *context,
 431                 int socket_fd,
 432                 bool apply_tty_stdin) {
 433
 434         ExecInput std_input;
 435
 436         assert(context);
 437
 438         std_input = context->std_input;
 439
 440         if (is_terminal_input(std_input) && !apply_tty_stdin)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 447                 return EXEC_INPUT_NULL;
 448
 449         return std_input;
 450 }
 451
 452 static int fixup_output(ExecOutput output, int socket_fd) {
 453
 454         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 455                 return EXEC_OUTPUT_INHERIT;
 456
 457         return output;
 458 }
 459
 460 static int setup_input(
 461                 const ExecContext *context,
 462                 const ExecParameters *params,
 463                 int socket_fd,
 464                 const int named_iofds[static 3]) {
 465
 466         ExecInput i;
 467         int r;
 468
 469         assert(context);
 470         assert(params);
 471         assert(named_iofds);
 472
 473         if (params->stdin_fd >= 0) {
 474                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 475                         return -errno;
 476
 477                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 478                 if (isatty(STDIN_FILENO)) {
 479                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 480                         (void) reset_terminal_fd(STDIN_FILENO, true);
 481                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
 482                 }
 483
 484                 return STDIN_FILENO;
 485         }
 486
 487         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 488
 489         switch (i) {
 490
 491         case EXEC_INPUT_NULL:
 492                 return open_null_as(O_RDONLY, STDIN_FILENO);
 493
 494         case EXEC_INPUT_TTY:
 495         case EXEC_INPUT_TTY_FORCE:
 496         case EXEC_INPUT_TTY_FAIL: {
 497                 int fd;
 498
 499                 fd = acquire_terminal(exec_context_tty_path(context),
 500                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 501                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 502                                                                   ACQUIRE_TERMINAL_WAIT,
 503                                       USEC_INFINITY);
 504                 if (fd < 0)
 505                         return fd;
 506
 507                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
 508                 if (r < 0)
 509                         return r;
 510
 511                 return move_fd(fd, STDIN_FILENO, false);
 512         }
 513
 514         case EXEC_INPUT_SOCKET:
 515                 assert(socket_fd >= 0);
 516
 517                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 518
 519         case EXEC_INPUT_NAMED_FD:
 520                 assert(named_iofds[STDIN_FILENO] >= 0);
 521
 522                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 523                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 524
 525         case EXEC_INPUT_DATA: {
 526                 int fd;
 527
 528                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 529                 if (fd < 0)
 530                         return fd;
 531
 532                 return move_fd(fd, STDIN_FILENO, false);
 533         }
 534
 535         case EXEC_INPUT_FILE: {
 536                 bool rw;
 537                 int fd;
 538
 539                 assert(context->stdio_file[STDIN_FILENO]);
 540
 541                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 542                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 543
 544                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 545                 if (fd < 0)
 546                         return fd;
 547
 548                 return move_fd(fd, STDIN_FILENO, false);
 549         }
 550
 551         default:
 552                 assert_not_reached();
 553         }
 554 }
 555
 556 static bool can_inherit_stderr_from_stdout(
 557                 const ExecContext *context,
 558                 ExecOutput o,
 559                 ExecOutput e) {
 560
 561         assert(context);
 562
 563         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 564          * stderr fd */
 565
 566         if (e == EXEC_OUTPUT_INHERIT)
 567                 return true;
 568         if (e != o)
 569                 return false;
 570
 571         if (e == EXEC_OUTPUT_NAMED_FD)
 572                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 573
 574         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 575                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 576
 577         return true;
 578 }
 579
/* Sets up stdout or stderr, selected by 'fileno', for the process about to be executed.
 *
 * An explicit params->stdout_fd / params->stderr_fd takes precedence and is simply duplicated onto
 * 'fileno'. Otherwise the output mode from the context (after fixup_output()) decides. For stderr the
 * code first checks whether it can be left alone or duplicated from stdout; for an inheriting stdout it
 * follows stdin where that makes sense.
 *
 * When the fd gets connected to the journal/kmsg logger stream, the device and inode of the resulting fd
 * are stored in *journal_stream_dev / *journal_stream_ino (stderr overrides a value stored for stdout).
 *
 * Returns a negative errno-style value on failure. On the paths that return directly the success value
 * is 'fileno'; the other paths return whatever the helper that installed the descriptor returns. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds override whatever the context configures. */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Effective stdin and stdout modes; 'o' is replaced by the stderr mode further down if we are
         * setting up stderr and cannot take a shortcut. */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid() != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));

                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        /* From here on 'o' is the mode to realize for 'fileno'. */
        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* Share the terminal that was already opened for stdin, if there is one. */
                if (is_terminal_input(i))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: log a warning and fall back to /dev/null. */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
                                               fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return RET_NERRNO(dup2(socket_fd, fileno));

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Best effort: hand the fd over in blocking mode. */
                (void) fd_nonblock(named_iofds[fileno], false);
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* Same file as stdin? Then reuse the descriptor that was set up for stdin. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
                        flags |= O_TRUNC;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached();
        }
}
 741
 742 static int chown_terminal(int fd, uid_t uid) {
 743         int r;
 744
 745         assert(fd >= 0);
 746
 747         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 748         if (isatty(fd) < 1) {
 749                 if (IN_SET(errno, EINVAL, ENOTTY))
 750                         return 0; /* not a tty */
 751
 752                 return -errno;
 753         }
 754
 755         /* This might fail. What matters are the results. */
 756         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 757         if (r < 0)
 758                 return r;
 759
 760         return 1;
 761 }
 762
 763 static int setup_confirm_stdio(
 764                 const ExecContext *context,
 765                 const char *vc,
 766                 int *ret_saved_stdin,
 767                 int *ret_saved_stdout) {
 768
 769         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 770         int r;
 771
 772         assert(ret_saved_stdin);
 773         assert(ret_saved_stdout);
 774
 775         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 776         if (saved_stdin < 0)
 777                 return -errno;
 778
 779         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 780         if (saved_stdout < 0)
 781                 return -errno;
 782
 783         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 784         if (fd < 0)
 785                 return fd;
 786
 787         r = chown_terminal(fd, getuid());
 788         if (r < 0)
 789                 return r;
 790
 791         r = reset_terminal_fd(fd, true);
 792         if (r < 0)
 793                 return r;
 794
 795         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
 796         if (r < 0)
 797                 return r;
 798
 799         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 800         TAKE_FD(fd);
 801         if (r < 0)
 802                 return r;
 803
 804         *ret_saved_stdin = TAKE_FD(saved_stdin);
 805         *ret_saved_stdout = TAKE_FD(saved_stdout);
 806         return 0;
 807 }
 808
 809 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 810         assert(err < 0);
 811
 812         if (err == -ETIMEDOUT)
 813                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 814         else {
 815                 errno = -err;
 816                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 817         }
 818 }
 819
 820 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 821         _cleanup_close_ int fd = -1;
 822
 823         assert(vc);
 824
 825         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 826         if (fd < 0)
 827                 return;
 828
 829         write_confirm_error_fd(err, fd, u);
 830 }
 831
 832 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 833         int r = 0;
 834
 835         assert(saved_stdin);
 836         assert(saved_stdout);
 837
 838         release_terminal();
 839
 840         if (*saved_stdin >= 0)
 841                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 842                         r = -errno;
 843
 844         if (*saved_stdout >= 0)
 845                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 846                         r = -errno;
 847
 848         *saved_stdin = safe_close(*saved_stdin);
 849         *saved_stdout = safe_close(*saved_stdout);
 850
 851         return r;
 852 }
 853
 854 enum {
 855         CONFIRM_PRETEND_FAILURE = -1,
 856         CONFIRM_PRETEND_SUCCESS =  0,
 857         CONFIRM_EXECUTE = 1,
 858 };
 859
 860 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 861         int saved_stdout = -1, saved_stdin = -1, r;
 862         _cleanup_free_ char *e = NULL;
 863         char c;
 864
 865         /* For any internal errors, assume a positive response. */
 866         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 867         if (r < 0) {
 868                 write_confirm_error(r, vc, u);
 869                 return CONFIRM_EXECUTE;
 870         }
 871
 872         /* confirm_spawn might have been disabled while we were sleeping. */
 873         if (manager_is_confirm_spawn_disabled(u->manager)) {
 874                 r = 1;
 875                 goto restore_stdio;
 876         }
 877
 878         e = ellipsize(cmdline, 60, 100);
 879         if (!e) {
 880                 log_oom();
 881                 r = CONFIRM_EXECUTE;
 882                 goto restore_stdio;
 883         }
 884
 885         for (;;) {
 886                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 887                 if (r < 0) {
 888                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 889                         r = CONFIRM_EXECUTE;
 890                         goto restore_stdio;
 891                 }
 892
 893                 switch (c) {
 894                 case 'c':
 895                         printf("Resuming normal execution.\n");
 896                         manager_disable_confirm_spawn();
 897                         r = 1;
 898                         break;
 899                 case 'D':
 900                         unit_dump(u, stdout, "  ");
 901                         continue; /* ask again */
 902                 case 'f':
 903                         printf("Failing execution.\n");
 904                         r = CONFIRM_PRETEND_FAILURE;
 905                         break;
 906                 case 'h':
 907                         printf("  c - continue, proceed without asking anymore\n"
 908                                "  D - dump, show the state of the unit\n"
 909                                "  f - fail, don't execute the command and pretend it failed\n"
 910                                "  h - help\n"
 911                                "  i - info, show a short summary of the unit\n"
 912                                "  j - jobs, show jobs that are in progress\n"
 913                                "  s - skip, don't execute the command and pretend it succeeded\n"
 914                                "  y - yes, execute the command\n");
 915                         continue; /* ask again */
 916                 case 'i':
 917                         printf("  Description: %s\n"
 918                                "  Unit:        %s\n"
 919                                "  Command:     %s\n",
 920                                u->id, u->description, cmdline);
 921                         continue; /* ask again */
 922                 case 'j':
 923                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 924                         continue; /* ask again */
 925                 case 'n':
 926                         /* 'n' was removed in favor of 'f'. */
 927                         printf("Didn't understand 'n', did you mean 'f'?\n");
 928                         continue; /* ask again */
 929                 case 's':
 930                         printf("Skipping execution.\n");
 931                         r = CONFIRM_PRETEND_SUCCESS;
 932                         break;
 933                 case 'y':
 934                         r = CONFIRM_EXECUTE;
 935                         break;
 936                 default:
 937                         assert_not_reached();
 938                 }
 939                 break;
 940         }
 941
 942 restore_stdio:
 943         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 944         return r;
 945 }
 946
 947 static int get_fixed_user(const ExecContext *c, const char **user,
 948                           uid_t *uid, gid_t *gid,
 949                           const char **home, const char **shell) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->user)
 956                 return 0;
 957
 958         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 959          * (i.e. are "/" or "/bin/nologin"). */
 960
 961         name = c->user;
 962         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 963         if (r < 0)
 964                 return r;
 965
 966         *user = name;
 967         return 0;
 968 }
 969
 970 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 971         int r;
 972         const char *name;
 973
 974         assert(c);
 975
 976         if (!c->group)
 977                 return 0;
 978
 979         name = c->group;
 980         r = get_group_creds(&name, gid, 0);
 981         if (r < 0)
 982                 return r;
 983
 984         *group = name;
 985         return 0;
 986 }
 987
 988 static int get_supplementary_groups(const ExecContext *c, const char *user,
 989                                     const char *group, gid_t gid,
 990                                     gid_t **supplementary_gids, int *ngids) {
 991         int r, k = 0;
 992         int ngroups_max;
 993         bool keep_groups = false;
 994         gid_t *groups = NULL;
 995         _cleanup_free_ gid_t *l_gids = NULL;
 996
 997         assert(c);
 998
 999         /*
1000          * If user is given, then lookup GID and supplementary groups list.
1001          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1002          * here and as early as possible so we keep the list of supplementary
1003          * groups of the caller.
1004          */
1005         if (user && gid_is_valid(gid) && gid != 0) {
1006                 /* First step, initialize groups from /etc/groups */
1007                 if (initgroups(user, gid) < 0)
1008                         return -errno;
1009
1010                 keep_groups = true;
1011         }
1012
1013         if (strv_isempty(c->supplementary_groups))
1014                 return 0;
1015
1016         /*
1017          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1018          * be positive, otherwise fail.
1019          */
1020         errno = 0;
1021         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1022         if (ngroups_max <= 0)
1023                 return errno_or_else(EOPNOTSUPP);
1024
1025         l_gids = new(gid_t, ngroups_max);
1026         if (!l_gids)
1027                 return -ENOMEM;
1028
1029         if (keep_groups) {
1030                 /*
1031                  * Lookup the list of groups that the user belongs to, we
1032                  * avoid NSS lookups here too for gid=0.
1033                  */
1034                 k = ngroups_max;
1035                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1036                         return -EINVAL;
1037         } else
1038                 k = 0;
1039
1040         STRV_FOREACH(i, c->supplementary_groups) {
1041                 const char *g;
1042
1043                 if (k >= ngroups_max)
1044                         return -E2BIG;
1045
1046                 g = *i;
1047                 r = get_group_creds(&g, l_gids+k, 0);
1048                 if (r < 0)
1049                         return r;
1050
1051                 k++;
1052         }
1053
1054         /*
1055          * Sets ngids to zero to drop all supplementary groups, happens
1056          * when we are under root and SupplementaryGroups= is empty.
1057          */
1058         if (k == 0) {
1059                 *ngids = 0;
1060                 return 0;
1061         }
1062
1063         /* Otherwise get the final list of supplementary groups */
1064         groups = memdup(l_gids, sizeof(gid_t) * k);
1065         if (!groups)
1066                 return -ENOMEM;
1067
1068         *supplementary_gids = groups;
1069         *ngids = k;
1070
1071         groups = NULL;
1072
1073         return 0;
1074 }
1075
/* Applies the group identity of the process: first the supplementary groups (only if ngids is
 * positive), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1094
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written in that case), 1 if the
 * new value was applied, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1108
1109 static int enforce_user(const ExecContext *context, uid_t uid) {
1110         assert(context);
1111         int r;
1112
1113         if (!uid_is_valid(uid))
1114                 return 0;
1115
1116         /* Sets (but doesn't look up) the uid and make sure we keep the
1117          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1118          * required, so we also need keep-caps in this case.
1119          */
1120
1121         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1122
1123                 /* First step: If we need to keep capabilities but
1124                  * drop privileges we need to make sure we keep our
1125                  * caps, while we drop privileges. */
1126                 if (uid != 0) {
1127                         /* Add KEEP_CAPS to the securebits */
1128                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1129                         if (r < 0)
1130                                 return r;
1131                 }
1132         }
1133
1134         /* Second step: actually set the uids */
1135         if (setresuid(uid, uid, uid) < 0)
1136                 return -errno;
1137
1138         /* At this point we should have all necessary capabilities but
1139            are otherwise a normal user. However, the caps might got
1140            corrupted due to the setresuid() so we need clean them up
1141            later. This is done outside of this call. */
1142
1143         return 0;
1144 }
1145
1146 #if HAVE_PAM
1147
1148 static int null_conv(
1149                 int num_msg,
1150                 const struct pam_message **msg,
1151                 struct pam_response **resp,
1152                 void *appdata_ptr) {
1153
1154         /* We don't support conversations */
1155
1156         return PAM_CONV_ERR;
1157 }
1158
1159 #endif
1160
/* Opens a PAM session for service 'name' and user 'user' and forks off a helper that tears it down again.
 *
 * PAM is set up in the calling process, which then forks. The "(sd-pam)" child closes the passed 'fds',
 * tries to drop privileges to 'uid'/'gid', and then waits for SIGTERM (requested as parent death signal)
 * before deleting the credentials and, if the parent is gone, closing the session. The caller goes on to
 * exec() the actual daemon; things are done this way to ensure that the main PID of the daemon is the one
 * that was initially fork()ed.
 *
 * If 'tty' is NULL, the TTY name is read off stdin if that is one. On success *env is replaced by the
 * environment list reported by PAM and the result of strv_free_and_replace() is returned. On failure
 * -EPERM is returned for PAM errors (they do not map to errno), otherwise the negative error of the
 * failing call; *env is left alone in that case.
 *
 * If built without PAM support this does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Make the environment assembled so far visible to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence look at the
                                 * return value itself, and only trust 'sig' if the call succeeded. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set) or a regular call did (r carries the error) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1377
/* Renames the calling process after the file name part of 'path', wrapped in parentheses.
 *
 * The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
 * /bin/ps, hence at most the last 8 characters of the file name are used. If the file name is empty the
 * process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *name;
        size_t n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                name += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1408
1409 static bool context_has_address_families(const ExecContext *c) {
1410         assert(c);
1411
1412         return c->address_families_allow_list ||
1413                 !set_isempty(c->address_families);
1414 }
1415
1416 static bool context_has_syscall_filters(const ExecContext *c) {
1417         assert(c);
1418
1419         return c->syscall_allow_list ||
1420                 !hashmap_isempty(c->syscall_filter);
1421 }
1422
1423 static bool context_has_syscall_logs(const ExecContext *c) {
1424         assert(c);
1425
1426         return c->syscall_log_allow_list ||
1427                 !hashmap_isempty(c->syscall_log);
1428 }
1429
1430 static bool context_has_no_new_privileges(const ExecContext *c) {
1431         assert(c);
1432
1433         if (c->no_new_privileges)
1434                 return true;
1435
1436         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1437                 return false;
1438
1439         /* We need NNP if we have any form of seccomp and are unprivileged */
1440         return c->lock_personality ||
1441                 c->memory_deny_write_execute ||
1442                 c->private_devices ||
1443                 c->protect_clock ||
1444                 c->protect_hostname ||
1445                 c->protect_kernel_tunables ||
1446                 c->protect_kernel_modules ||
1447                 c->protect_kernel_logs ||
1448                 context_has_address_families(c) ||
1449                 exec_context_restrict_namespaces_set(c) ||
1450                 c->restrict_realtime ||
1451                 c->restrict_suid_sgid ||
1452                 !set_isempty(c->syscall_archs) ||
1453                 context_has_syscall_filters(c) ||
1454                 context_has_syscall_logs(c);
1455 }
1456
1457 static bool exec_context_has_credentials(const ExecContext *context) {
1458
1459         assert(context);
1460
1461         return !hashmap_isempty(context->set_credentials) ||
1462                 !hashmap_isempty(context->load_credentials);
1463 }
1464
1465 #if HAVE_SECCOMP
1466
1467 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1468
1469         if (is_seccomp_available())
1470                 return false;
1471
1472         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1473         return true;
1474 }
1475
1476 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1477         uint32_t negative_action, default_action, action;
1478         int r;
1479
1480         assert(u);
1481         assert(c);
1482
1483         if (!context_has_syscall_filters(c))
1484                 return 0;
1485
1486         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1487                 return 0;
1488
1489         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1490
1491         if (c->syscall_allow_list) {
1492                 default_action = negative_action;
1493                 action = SCMP_ACT_ALLOW;
1494         } else {
1495                 default_action = SCMP_ACT_ALLOW;
1496                 action = negative_action;
1497         }
1498
1499         if (needs_ambient_hack) {
1500                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1501                 if (r < 0)
1502                         return r;
1503         }
1504
1505         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1506 }
1507
1508 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1509 #ifdef SCMP_ACT_LOG
1510         uint32_t default_action, action;
1511 #endif
1512
1513         assert(u);
1514         assert(c);
1515
1516         if (!context_has_syscall_logs(c))
1517                 return 0;
1518
1519 #ifdef SCMP_ACT_LOG
1520         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1521                 return 0;
1522
1523         if (c->syscall_log_allow_list) {
1524                 /* Log nothing but the ones listed */
1525                 default_action = SCMP_ACT_ALLOW;
1526                 action = SCMP_ACT_LOG;
1527         } else {
1528                 /* Log everything but the ones listed */
1529                 default_action = SCMP_ACT_LOG;
1530                 action = SCMP_ACT_ALLOW;
1531         }
1532
1533         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1534 #else
1535         /* old libseccomp */
1536         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1537         return 0;
1538 #endif
1539 }
1540
1541 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1542         assert(u);
1543         assert(c);
1544
1545         if (set_isempty(c->syscall_archs))
1546                 return 0;
1547
1548         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1549                 return 0;
1550
1551         return seccomp_restrict_archs(c->syscall_archs);
1552 }
1553
1554 static int apply_address_families(const Unit* u, const ExecContext *c) {
1555         assert(u);
1556         assert(c);
1557
1558         if (!context_has_address_families(c))
1559                 return 0;
1560
1561         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1562                 return 0;
1563
1564         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1565 }
1566
1567 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1568         assert(u);
1569         assert(c);
1570
1571         if (!c->memory_deny_write_execute)
1572                 return 0;
1573
1574         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1575                 return 0;
1576
1577         return seccomp_memory_deny_write_execute();
1578 }
1579
1580 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1581         assert(u);
1582         assert(c);
1583
1584         if (!c->restrict_realtime)
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1588                 return 0;
1589
1590         return seccomp_restrict_realtime();
1591 }
1592
1593 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1594         assert(u);
1595         assert(c);
1596
1597         if (!c->restrict_suid_sgid)
1598                 return 0;
1599
1600         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1601                 return 0;
1602
1603         return seccomp_restrict_suid_sgid();
1604 }
1605
1606 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1607         assert(u);
1608         assert(c);
1609
1610         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1611          * let's protect even those systems where this is left on in the kernel. */
1612
1613         if (!c->protect_kernel_tunables)
1614                 return 0;
1615
1616         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1617                 return 0;
1618
1619         return seccomp_protect_sysctl();
1620 }
1621
1622 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1623         assert(u);
1624         assert(c);
1625
1626         /* Turn off module syscalls on ProtectKernelModules=yes */
1627
1628         if (!c->protect_kernel_modules)
1629                 return 0;
1630
1631         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1632                 return 0;
1633
1634         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1635 }
1636
1637 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1638         assert(u);
1639         assert(c);
1640
1641         if (!c->protect_kernel_logs)
1642                 return 0;
1643
1644         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1645                 return 0;
1646
1647         return seccomp_protect_syslog();
1648 }
1649
1650 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1651         assert(u);
1652         assert(c);
1653
1654         if (!c->protect_clock)
1655                 return 0;
1656
1657         if (skip_seccomp_unavailable(u, "ProtectClock="))
1658                 return 0;
1659
1660         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1661 }
1662
1663 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1664         assert(u);
1665         assert(c);
1666
1667         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1668
1669         if (!c->private_devices)
1670                 return 0;
1671
1672         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1673                 return 0;
1674
1675         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1676 }
1677
1678 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1679         assert(u);
1680         assert(c);
1681
1682         if (!exec_context_restrict_namespaces_set(c))
1683                 return 0;
1684
1685         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1686                 return 0;
1687
1688         return seccomp_restrict_namespaces(c->restrict_namespaces);
1689 }
1690
1691 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1692         unsigned long personality;
1693         int r;
1694
1695         assert(u);
1696         assert(c);
1697
1698         if (!c->lock_personality)
1699                 return 0;
1700
1701         if (skip_seccomp_unavailable(u, "LockPersonality="))
1702                 return 0;
1703
1704         personality = c->personality;
1705
1706         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1707         if (personality == PERSONALITY_INVALID) {
1708
1709                 r = opinionated_personality(&personality);
1710                 if (r < 0)
1711                         return r;
1712         }
1713
1714         return seccomp_lock_personality(personality);
1715 }
1716
1717 #endif
1718
1719 #if HAVE_LIBBPF
1720 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1721         assert(u);
1722         assert(c);
1723
1724         if (!exec_context_restrict_filesystems_set(c))
1725                 return 0;
1726
1727         if (!u->manager->restrict_fs) {
1728                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1729                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1730                 return 0;
1731         }
1732
1733         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1734 }
1735 #endif
1736
1737 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1738         assert(u);
1739         assert(c);
1740
1741         if (!c->protect_hostname)
1742                 return 0;
1743
1744         if (ns_type_supported(NAMESPACE_UTS)) {
1745                 if (unshare(CLONE_NEWUTS) < 0) {
1746                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1747                                 *ret_exit_status = EXIT_NAMESPACE;
1748                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1749                         }
1750
1751                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1752                 }
1753         } else
1754                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1755
1756 #if HAVE_SECCOMP
1757         int r;
1758
1759         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1760                 return 0;
1761
1762         r = seccomp_protect_hostname();
1763         if (r < 0) {
1764                 *ret_exit_status = EXIT_SECCOMP;
1765                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1766         }
1767 #endif
1768
1769         return 0;
1770 }
1771
1772 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1773         assert(idle_pipe);
1774
1775         idle_pipe[1] = safe_close(idle_pipe[1]);
1776         idle_pipe[2] = safe_close(idle_pipe[2]);
1777
1778         if (idle_pipe[0] >= 0) {
1779                 int r;
1780
1781                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1782
1783                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1784                         ssize_t n;
1785
1786                         /* Signal systemd that we are bored and want to continue. */
1787                         n = write(idle_pipe[3], "x", 1);
1788                         if (n > 0)
1789                                 /* Wait for systemd to react to the signal above. */
1790                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1791                 }
1792
1793                 idle_pipe[0] = safe_close(idle_pipe[0]);
1794
1795         }
1796
1797         idle_pipe[3] = safe_close(idle_pipe[3]);
1798 }
1799
1800 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1801
1802 static int build_environment(
1803                 const Unit *u,
1804                 const ExecContext *c,
1805                 const ExecParameters *p,
1806                 size_t n_fds,
1807                 const char *home,
1808                 const char *username,
1809                 const char *shell,
1810                 dev_t journal_stream_dev,
1811                 ino_t journal_stream_ino,
1812                 char ***ret) {
1813
1814         _cleanup_strv_free_ char **our_env = NULL;
1815         size_t n_env = 0;
1816         char *x;
1817
1818         assert(u);
1819         assert(c);
1820         assert(p);
1821         assert(ret);
1822
1823 #define N_ENV_VARS 17
1824         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1825         if (!our_env)
1826                 return -ENOMEM;
1827
1828         if (n_fds > 0) {
1829                 _cleanup_free_ char *joined = NULL;
1830
1831                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1832                         return -ENOMEM;
1833                 our_env[n_env++] = x;
1834
1835                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1836                         return -ENOMEM;
1837                 our_env[n_env++] = x;
1838
1839                 joined = strv_join(p->fd_names, ":");
1840                 if (!joined)
1841                         return -ENOMEM;
1842
1843                 x = strjoin("LISTEN_FDNAMES=", joined);
1844                 if (!x)
1845                         return -ENOMEM;
1846                 our_env[n_env++] = x;
1847         }
1848
1849         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1850                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1851                         return -ENOMEM;
1852                 our_env[n_env++] = x;
1853
1854                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1855                         return -ENOMEM;
1856                 our_env[n_env++] = x;
1857         }
1858
1859         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1860          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1861          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1862         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1863                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1864                 if (!x)
1865                         return -ENOMEM;
1866                 our_env[n_env++] = x;
1867         }
1868
1869         if (home) {
1870                 x = strjoin("HOME=", home);
1871                 if (!x)
1872                         return -ENOMEM;
1873
1874                 path_simplify(x + 5);
1875                 our_env[n_env++] = x;
1876         }
1877
1878         if (username) {
1879                 x = strjoin("LOGNAME=", username);
1880                 if (!x)
1881                         return -ENOMEM;
1882                 our_env[n_env++] = x;
1883
1884                 x = strjoin("USER=", username);
1885                 if (!x)
1886                         return -ENOMEM;
1887                 our_env[n_env++] = x;
1888         }
1889
1890         if (shell) {
1891                 x = strjoin("SHELL=", shell);
1892                 if (!x)
1893                         return -ENOMEM;
1894
1895                 path_simplify(x + 6);
1896                 our_env[n_env++] = x;
1897         }
1898
1899         if (!sd_id128_is_null(u->invocation_id)) {
1900                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1901                         return -ENOMEM;
1902
1903                 our_env[n_env++] = x;
1904         }
1905
1906         if (exec_context_needs_term(c)) {
1907                 const char *tty_path, *term = NULL;
1908
1909                 tty_path = exec_context_tty_path(c);
1910
1911                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1912                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1913                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1914
1915                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1916                         term = getenv("TERM");
1917
1918                 if (!term)
1919                         term = default_term_for_tty(tty_path);
1920
1921                 x = strjoin("TERM=", term);
1922                 if (!x)
1923                         return -ENOMEM;
1924                 our_env[n_env++] = x;
1925         }
1926
1927         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1928                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1929                         return -ENOMEM;
1930
1931                 our_env[n_env++] = x;
1932         }
1933
1934         if (c->log_namespace) {
1935                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1936                 if (!x)
1937                         return -ENOMEM;
1938
1939                 our_env[n_env++] = x;
1940         }
1941
1942         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1943                 _cleanup_free_ char *joined = NULL;
1944                 const char *n;
1945
1946                 if (!p->prefix[t])
1947                         continue;
1948
1949                 if (c->directories[t].n_items == 0)
1950                         continue;
1951
1952                 n = exec_directory_env_name_to_string(t);
1953                 if (!n)
1954                         continue;
1955
1956                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1957                         _cleanup_free_ char *prefixed = NULL;
1958
1959                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1960                         if (!prefixed)
1961                                 return -ENOMEM;
1962
1963                         if (!strextend_with_separator(&joined, ":", prefixed))
1964                                 return -ENOMEM;
1965                 }
1966
1967                 x = strjoin(n, "=", joined);
1968                 if (!x)
1969                         return -ENOMEM;
1970
1971                 our_env[n_env++] = x;
1972         }
1973
1974         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1975                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1976                 if (!x)
1977                         return -ENOMEM;
1978
1979                 our_env[n_env++] = x;
1980         }
1981
1982         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1983                 return -ENOMEM;
1984
1985         our_env[n_env++] = x;
1986
1987         our_env[n_env++] = NULL;
1988         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1989 #undef N_ENV_VARS
1990
1991         *ret = TAKE_PTR(our_env);
1992
1993         return 0;
1994 }
1995
1996 static int build_pass_environment(const ExecContext *c, char ***ret) {
1997         _cleanup_strv_free_ char **pass_env = NULL;
1998         size_t n_env = 0;
1999
2000         STRV_FOREACH(i, c->pass_environment) {
2001                 _cleanup_free_ char *x = NULL;
2002                 char *v;
2003
2004                 v = getenv(*i);
2005                 if (!v)
2006                         continue;
2007                 x = strjoin(*i, "=", v);
2008                 if (!x)
2009                         return -ENOMEM;
2010
2011                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2012                         return -ENOMEM;
2013
2014                 pass_env[n_env++] = TAKE_PTR(x);
2015                 pass_env[n_env] = NULL;
2016         }
2017
2018         *ret = TAKE_PTR(pass_env);
2019
2020         return 0;
2021 }
2022
2023 bool exec_needs_mount_namespace(
2024                 const ExecContext *context,
2025                 const ExecParameters *params,
2026                 const ExecRuntime *runtime) {
2027
2028         assert(context);
2029
2030         if (context->root_image)
2031                 return true;
2032
2033         if (!strv_isempty(context->read_write_paths) ||
2034             !strv_isempty(context->read_only_paths) ||
2035             !strv_isempty(context->inaccessible_paths) ||
2036             !strv_isempty(context->exec_paths) ||
2037             !strv_isempty(context->no_exec_paths))
2038                 return true;
2039
2040         if (context->n_bind_mounts > 0)
2041                 return true;
2042
2043         if (context->n_temporary_filesystems > 0)
2044                 return true;
2045
2046         if (context->n_mount_images > 0)
2047                 return true;
2048
2049         if (context->n_extension_images > 0)
2050                 return true;
2051
2052         if (!strv_isempty(context->extension_directories))
2053                 return true;
2054
2055         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2056                 return true;
2057
2058         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2059                 return true;
2060
2061         if (context->private_devices ||
2062             context->private_mounts ||
2063             context->protect_system != PROTECT_SYSTEM_NO ||
2064             context->protect_home != PROTECT_HOME_NO ||
2065             context->protect_kernel_tunables ||
2066             context->protect_kernel_modules ||
2067             context->protect_kernel_logs ||
2068             context->protect_control_groups ||
2069             context->protect_proc != PROTECT_PROC_DEFAULT ||
2070             context->proc_subset != PROC_SUBSET_ALL ||
2071             context->private_ipc ||
2072             context->ipc_namespace_path)
2073                 return true;
2074
2075         if (context->root_directory) {
2076                 if (exec_context_get_effective_mount_apivfs(context))
2077                         return true;
2078
2079                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2080                         if (params && !params->prefix[t])
2081                                 continue;
2082
2083                         if (context->directories[t].n_items > 0)
2084                                 return true;
2085                 }
2086         }
2087
2088         if (context->dynamic_user &&
2089             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2090              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2091              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2092                 return true;
2093
2094         if (context->log_namespace)
2095                 return true;
2096
2097         return false;
2098 }
2099
2100 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2101         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2102         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2103         _cleanup_close_ int unshare_ready_fd = -1;
2104         _cleanup_(sigkill_waitp) pid_t pid = 0;
2105         uint64_t c = 1;
2106         ssize_t n;
2107         int r;
2108
2109         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2110          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2111          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2112          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2113          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2114          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2115          * continues execution normally.
2116          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2117          * does not need CAP_SETUID to write the single line mapping to itself. */
2118
2119         /* Can only set up multiple mappings with CAP_SETUID. */
2120         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2121                 r = asprintf(&uid_map,
2122                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2123                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2124                              ouid, ouid, uid, uid);
2125         else
2126                 r = asprintf(&uid_map,
2127                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2128                              ouid, ouid);
2129
2130         if (r < 0)
2131                 return -ENOMEM;
2132
2133         /* Can only set up multiple mappings with CAP_SETGID. */
2134         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2135                 r = asprintf(&gid_map,
2136                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2137                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2138                              ogid, ogid, gid, gid);
2139         else
2140                 r = asprintf(&gid_map,
2141                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2142                              ogid, ogid);
2143
2144         if (r < 0)
2145                 return -ENOMEM;
2146
2147         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2148          * namespace. */
2149         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2150         if (unshare_ready_fd < 0)
2151                 return -errno;
2152
2153         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2154          * failed. */
2155         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2156                 return -errno;
2157
2158         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2159         if (r < 0)
2160                 return r;
2161         if (r == 0) {
2162                 _cleanup_close_ int fd = -1;
2163                 const char *a;
2164                 pid_t ppid;
2165
2166                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2167                  * here, after the parent opened its own user namespace. */
2168
2169                 ppid = getppid();
2170                 errno_pipe[0] = safe_close(errno_pipe[0]);
2171
2172                 /* Wait until the parent unshared the user namespace */
2173                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2174                         r = -errno;
2175                         goto child_fail;
2176                 }
2177
2178                 /* Disable the setgroups() system call in the child user namespace, for good. */
2179                 a = procfs_file_alloca(ppid, "setgroups");
2180                 fd = open(a, O_WRONLY|O_CLOEXEC);
2181                 if (fd < 0) {
2182                         if (errno != ENOENT) {
2183                                 r = -errno;
2184                                 goto child_fail;
2185                         }
2186
2187                         /* If the file is missing the kernel is too old, let's continue anyway. */
2188                 } else {
2189                         if (write(fd, "deny\n", 5) < 0) {
2190                                 r = -errno;
2191                                 goto child_fail;
2192                         }
2193
2194                         fd = safe_close(fd);
2195                 }
2196
2197                 /* First write the GID map */
2198                 a = procfs_file_alloca(ppid, "gid_map");
2199                 fd = open(a, O_WRONLY|O_CLOEXEC);
2200                 if (fd < 0) {
2201                         r = -errno;
2202                         goto child_fail;
2203                 }
2204                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2205                         r = -errno;
2206                         goto child_fail;
2207                 }
2208                 fd = safe_close(fd);
2209
2210                 /* The write the UID map */
2211                 a = procfs_file_alloca(ppid, "uid_map");
2212                 fd = open(a, O_WRONLY|O_CLOEXEC);
2213                 if (fd < 0) {
2214                         r = -errno;
2215                         goto child_fail;
2216                 }
2217                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2218                         r = -errno;
2219                         goto child_fail;
2220                 }
2221
2222                 _exit(EXIT_SUCCESS);
2223
2224         child_fail:
2225                 (void) write(errno_pipe[1], &r, sizeof(r));
2226                 _exit(EXIT_FAILURE);
2227         }
2228
2229         errno_pipe[1] = safe_close(errno_pipe[1]);
2230
2231         if (unshare(CLONE_NEWUSER) < 0)
2232                 return -errno;
2233
2234         /* Let the child know that the namespace is ready now */
2235         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2236                 return -errno;
2237
2238         /* Try to read an error code from the child */
2239         n = read(errno_pipe[0], &r, sizeof(r));
2240         if (n < 0)
2241                 return -errno;
2242         if (n == sizeof(r)) { /* an error code was sent to us */
2243                 if (r < 0)
2244                         return r;
2245                 return -EIO;
2246         }
2247         if (n != 0) /* on success we should have read 0 bytes */
2248                 return -EIO;
2249
2250         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2251         if (r < 0)
2252                 return r;
2253         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2254                 return -EIO;
2255
2256         return 0;
2257 }
2258
2259 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2260         if (!context->dynamic_user)
2261                 return false;
2262
2263         if (type == EXEC_DIRECTORY_CONFIGURATION)
2264                 return false;
2265
2266         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2267                 return false;
2268
2269         return true;
2270 }
2271
2272 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2273         _cleanup_free_ char *src_abs = NULL;
2274         int r;
2275
2276         assert(source);
2277
2278         src_abs = path_join(root, source);
2279         if (!src_abs)
2280                 return -ENOMEM;
2281
2282         STRV_FOREACH(dst, symlinks) {
2283                 _cleanup_free_ char *dst_abs = NULL;
2284
2285                 dst_abs = path_join(root, *dst);
2286                 if (!dst_abs)
2287                         return -ENOMEM;
2288
2289                 r = mkdir_parents_label(dst_abs, 0755);
2290                 if (r < 0)
2291                         return r;
2292
2293                 r = symlink_idempotent(src_abs, dst_abs, true);
2294                 if (r < 0)
2295                         return r;
2296         }
2297
2298         return 0;
2299 }
2300
2301 static int setup_exec_directory(
2302                 const ExecContext *context,
2303                 const ExecParameters *params,
2304                 uid_t uid,
2305                 gid_t gid,
2306                 ExecDirectoryType type,
2307                 bool needs_mount_namespace,
2308                 int *exit_status) {
2309
2310         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2311                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2312                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2313                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2314                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2315                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2316         };
2317         int r;
2318
2319         assert(context);
2320         assert(params);
2321         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2322         assert(exit_status);
2323
2324         if (!params->prefix[type])
2325                 return 0;
2326
2327         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2328                 if (!uid_is_valid(uid))
2329                         uid = 0;
2330                 if (!gid_is_valid(gid))
2331                         gid = 0;
2332         }
2333
2334         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2335                 _cleanup_free_ char *p = NULL, *pp = NULL;
2336
2337                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2338                 if (!p) {
2339                         r = -ENOMEM;
2340                         goto fail;
2341                 }
2342
2343                 r = mkdir_parents_label(p, 0755);
2344                 if (r < 0)
2345                         goto fail;
2346
2347                 if (exec_directory_is_private(context, type)) {
2348                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2349                          * case we want to avoid leaving a directory around fully accessible that is owned by
2350                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2351                          * trick used by container managers to prohibit host users to get access to files of
2352                          * the same UID in containers: we place everything inside a directory that has an
2353                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2354                          * for unprivileged host code. We then use fs namespacing to make this directory
2355                          * permeable for the service itself.
2356                          *
2357                          * Specifically: for a service which wants a special directory "foo/" we first create
2358                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2359                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2360                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2361                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2362                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2363                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2364                          * for the service and making sure it only gets access to the dirs it needs but no
2365                          * others. Tricky? Yes, absolutely, but it works!
2366                          *
2367                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2368                          * to be owned by the service itself.
2369                          *
2370                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2371                          * for sharing files or sockets with other services. */
2372
2373                         pp = path_join(params->prefix[type], "private");
2374                         if (!pp) {
2375                                 r = -ENOMEM;
2376                                 goto fail;
2377                         }
2378
2379                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2380                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2381                         if (r < 0)
2382                                 goto fail;
2383
2384                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2385                                 r = -ENOMEM;
2386                                 goto fail;
2387                         }
2388
2389                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2390                         r = mkdir_parents_label(pp, 0755);
2391                         if (r < 0)
2392                                 goto fail;
2393
2394                         if (is_dir(p, false) > 0 &&
2395                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2396
2397                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2398                                  * it over. Most likely the service has been upgraded from one that didn't use
2399                                  * DynamicUser=1, to one that does. */
2400
2401                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2402                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2403                                          exec_directory_type_to_string(type), p, pp);
2404
2405                                 if (rename(p, pp) < 0) {
2406                                         r = -errno;
2407                                         goto fail;
2408                                 }
2409                         } else {
2410                                 /* Otherwise, create the actual directory for the service */
2411
2412                                 r = mkdir_label(pp, context->directories[type].mode);
2413                                 if (r < 0 && r != -EEXIST)
2414                                         goto fail;
2415                         }
2416
2417                         if (!context->directories[type].items[i].only_create) {
2418                                 /* And link it up from the original place.
2419                                  * Notes
2420                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2421                                  *    the host, and a new one for the child namespace will be created later.
2422                                  * 2) It is not necessary to create this symlink when one of its parent
2423                                  *    directories is specified and already created. E.g.
2424                                  *        StateDirectory=foo foo/bar
2425                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2426                                  *        pp = "/var/lib/private/foo/bar"
2427                                  *        p = "/var/lib/foo/bar"
2428                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2429                                  *    we do not need to create the symlink, but we cannot create the symlink.
2430                                  *    See issue #24783. */
2431                                 r = symlink_idempotent(pp, p, true);
2432                                 if (r < 0)
2433                                         goto fail;
2434                         }
2435
2436                 } else {
2437                         _cleanup_free_ char *target = NULL;
2438
2439                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2440                             readlink_and_make_absolute(p, &target) >= 0) {
2441                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2442
2443                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2444                                  * by DynamicUser=1 (see above)?
2445                                  *
2446                                  * We do this for all directory types except for ConfigurationDirectory=,
2447                                  * since they all support the private/ symlink logic at least in some
2448                                  * configurations, see above. */
2449
2450                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2451                                 if (r < 0)
2452                                         goto fail;
2453
2454                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2455                                 if (!q) {
2456                                         r = -ENOMEM;
2457                                         goto fail;
2458                                 }
2459
2460                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2461                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2462                                 if (r < 0)
2463                                         goto fail;
2464
2465                                 if (path_equal(q_resolved, target_resolved)) {
2466
2467                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2468                                          * but is no longer. Let's move the directory back up. */
2469
2470                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2471                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2472                                                  exec_directory_type_to_string(type), q, p);
2473
2474                                         if (unlink(p) < 0) {
2475                                                 r = -errno;
2476                                                 goto fail;
2477                                         }
2478
2479                                         if (rename(q, p) < 0) {
2480                                                 r = -errno;
2481                                                 goto fail;
2482                                         }
2483                                 }
2484                         }
2485
2486                         r = mkdir_label(p, context->directories[type].mode);
2487                         if (r < 0) {
2488                                 if (r != -EEXIST)
2489                                         goto fail;
2490
2491                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2492                                         struct stat st;
2493
2494                                         /* Don't change the owner/access mode of the configuration directory,
2495                                          * as in the common case it is not written to by a service, and shall
2496                                          * not be writable. */
2497
2498                                         if (stat(p, &st) < 0) {
2499                                                 r = -errno;
2500                                                 goto fail;
2501                                         }
2502
2503                                         /* Still complain if the access mode doesn't match */
2504                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2505                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2506                                                             "(File system: %o %sMode: %o)",
2507                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2508                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2509
2510                                         continue;
2511                                 }
2512                         }
2513                 }
2514
2515                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2516                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2517                  * current UID/GID ownership.) */
2518                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2519                 if (r < 0)
2520                         goto fail;
2521
2522                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2523                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2524                  * assignments to exist. */
2525                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2526                 if (r < 0)
2527                         goto fail;
2528         }
2529
2530         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2531          * they are set up later, to allow configuring empty var/run/etc. */
2532         if (!needs_mount_namespace)
2533                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2534                         r = create_many_symlinks(params->prefix[type],
2535                                                  context->directories[type].items[i].path,
2536                                                  context->directories[type].items[i].symlinks);
2537                         if (r < 0)
2538                                 goto fail;
2539                 }
2540
2541         return 0;
2542
2543 fail:
2544         *exit_status = exit_status_table[type];
2545         return r;
2546 }
2547
2548 static int write_credential(
2549                 int dfd,
2550                 const char *id,
2551                 const void *data,
2552                 size_t size,
2553                 uid_t uid,
2554                 bool ownership_ok) {
2555
2556         _cleanup_(unlink_and_freep) char *tmp = NULL;
2557         _cleanup_close_ int fd = -1;
2558         int r;
2559
2560         r = tempfn_random_child("", "cred", &tmp);
2561         if (r < 0)
2562                 return r;
2563
2564         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2565         if (fd < 0) {
2566                 tmp = mfree(tmp);
2567                 return -errno;
2568         }
2569
2570         r = loop_write(fd, data, size, /* do_poll = */ false);
2571         if (r < 0)
2572                 return r;
2573
2574         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2575                 return -errno;
2576
2577         if (uid_is_valid(uid) && uid != getuid()) {
2578                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2579                 if (r < 0) {
2580                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2581                                 return r;
2582
2583                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2584                                             * to express: that the user gets read access and nothing
2585                                             * else. But if the backing fs can't support that (e.g. ramfs)
2586                                             * then we can use file ownership instead. But that's only safe if
2587                                             * we can then re-mount the whole thing read-only, so that the
2588                                             * user can no longer chmod() the file to gain write access. */
2589                                 return r;
2590
2591                         if (fchown(fd, uid, GID_INVALID) < 0)
2592                                 return -errno;
2593                 }
2594         }
2595
2596         if (renameat(dfd, tmp, dfd, id) < 0)
2597                 return -errno;
2598
2599         tmp = mfree(tmp);
2600         return 0;
2601 }
2602
2603 static char **credential_search_path(
2604                 const ExecParameters *params,
2605                 bool encrypted) {
2606
2607         _cleanup_strv_free_ char **l = NULL;
2608
2609         assert(params);
2610
2611         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2612          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2613          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2614
2615         if (encrypted) {
2616                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2617                         return NULL;
2618
2619                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2620                         return NULL;
2621         }
2622
2623         if (params->received_credentials_directory)
2624                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2625                         return NULL;
2626
2627         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2628                 return NULL;
2629
2630         if (DEBUG_LOGGING) {
2631                 _cleanup_free_ char *t = strv_join(l, ":");
2632
2633                 log_debug("Credential search path is: %s", t);
2634         }
2635
2636         return TAKE_PTR(l);
2637 }
2638
2639 static int load_credential(
2640                 const ExecContext *context,
2641                 const ExecParameters *params,
2642                 const char *id,
2643                 const char *path,
2644                 bool encrypted,
2645                 const char *unit,
2646                 int read_dfd,
2647                 int write_dfd,
2648                 uid_t uid,
2649                 bool ownership_ok,
2650                 uint64_t *left) {
2651
2652         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2653         _cleanup_strv_free_ char **search_path = NULL;
2654         _cleanup_(erase_and_freep) char *data = NULL;
2655         _cleanup_free_ char *bindname = NULL;
2656         const char *source = NULL;
2657         bool missing_ok = true;
2658         size_t size, add, maxsz;
2659         int r;
2660
2661         assert(context);
2662         assert(params);
2663         assert(id);
2664         assert(path);
2665         assert(unit);
2666         assert(write_dfd >= 0);
2667         assert(left);
2668
2669         if (read_dfd >= 0) {
2670                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2671                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2672                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2673                  * open it. */
2674
2675                 if (!filename_is_valid(path)) /* safety check */
2676                         return -EINVAL;
2677
2678                 missing_ok = true;
2679                 source = path;
2680
2681         } else if (path_is_absolute(path)) {
2682                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2683                  * sockets */
2684
2685                 if (!path_is_valid(path)) /* safety check */
2686                         return -EINVAL;
2687
2688                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2689
2690                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2691                  * via the source socket address in case we read off an AF_UNIX socket. */
2692                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2693                         return -ENOMEM;
2694
2695                 missing_ok = false;
2696                 source = path;
2697
2698         } else if (credential_name_valid(path)) {
2699                 /* If this is a relative path, take it as credential name relative to the credentials
2700                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2701                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2702
2703                 search_path = credential_search_path(params, encrypted);
2704                 if (!search_path)
2705                         return -ENOMEM;
2706
2707                 missing_ok = true;
2708         } else
2709                 source = NULL;
2710
2711         if (encrypted)
2712                 flags |= READ_FULL_FILE_UNBASE64;
2713
2714         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2715
2716         if (search_path) {
2717                 STRV_FOREACH(d, search_path) {
2718                         _cleanup_free_ char *j = NULL;
2719
2720                         j = path_join(*d, path);
2721                         if (!j)
2722                                 return -ENOMEM;
2723
2724                         r = read_full_file_full(
2725                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2726                                         UINT64_MAX,
2727                                         maxsz,
2728                                         flags,
2729                                         NULL,
2730                                         &data, &size);
2731                         if (r != -ENOENT)
2732                                 break;
2733                 }
2734         } else if (source)
2735                 r = read_full_file_full(
2736                                 read_dfd, source,
2737                                 UINT64_MAX,
2738                                 maxsz,
2739                                 flags,
2740                                 bindname,
2741                                 &data, &size);
2742         else
2743                 r = -ENOENT;
2744
2745         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2746                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2747                  * will get clear errors if we don't pass such a missing credential on as they
2748                  * themselves will get ENOENT when trying to read them, which should not be much
2749                  * worse than when we handle the error here and make it fatal.
2750                  *
2751                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2752                  * we are fine, too. */
2753                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2754                 return 0;
2755         }
2756         if (r < 0)
2757                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2758
2759         if (encrypted) {
2760                 _cleanup_free_ void *plaintext = NULL;
2761                 size_t plaintext_size = 0;
2762
2763                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2764                 if (r < 0)
2765                         return r;
2766
2767                 free_and_replace(data, plaintext);
2768                 size = plaintext_size;
2769         }
2770
2771         add = strlen(id) + size;
2772         if (add > *left)
2773                 return -E2BIG;
2774
2775         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2776         if (r < 0)
2777                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2778
2779         *left -= add;
2780         return 0;
2781 }
2782
/* Bundle of arguments that acquire_credentials() hands to load_cred_recurse_dir_cb() as recurse_dir()
 * userdata when a whole directory of credentials is imported. The callback passes all fields on to
 * load_credential() unmodified. */
struct load_cred_args {
        const ExecContext *context;     /* execution context, passed through to load_credential() */
        const ExecParameters *params;   /* execution parameters, passed through to load_credential() */
        bool encrypted;                 /* copied from the ExecLoadCredential entry being processed */
        const char *unit;               /* unit name, passed through to load_credential() */
        int dfd;                        /* fd of the destination credential directory; the callback also
                                         * uses it to check whether a credential ID already exists */
        uid_t uid;                      /* passed through to load_credential() */
        bool ownership_ok;              /* passed through to load_credential() */
        uint64_t *left;                 /* points to the caller's remaining size budget, shared by all
                                         * credentials loaded in one acquire_credentials() call */
};
2793
2794 static int load_cred_recurse_dir_cb(
2795                 RecurseDirEvent event,
2796                 const char *path,
2797                 int dir_fd,
2798                 int inode_fd,
2799                 const struct dirent *de,
2800                 const struct statx *sx,
2801                 void *userdata) {
2802
2803         struct load_cred_args *args = ASSERT_PTR(userdata);
2804         _cleanup_free_ char *sub_id = NULL;
2805         int r;
2806
2807         if (event != RECURSE_DIR_ENTRY)
2808                 return RECURSE_DIR_CONTINUE;
2809
2810         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2811                 return RECURSE_DIR_CONTINUE;
2812
2813         sub_id = strreplace(path, "/", "_");
2814         if (!sub_id)
2815                 return -ENOMEM;
2816
2817         if (!credential_name_valid(sub_id))
2818                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2819
2820         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2821                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2822                 return RECURSE_DIR_CONTINUE;
2823         }
2824         if (errno != ENOENT)
2825                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2826
2827         r = load_credential(
2828                         args->context,
2829                         args->params,
2830                         sub_id,
2831                         de->d_name,
2832                         args->encrypted,
2833                         args->unit,
2834                         dir_fd,
2835                         args->dfd,
2836                         args->uid,
2837                         args->ownership_ok,
2838                         args->left);
2839         if (r < 0)
2840                 return r;
2841
2842         return RECURSE_DIR_CONTINUE;
2843 }
2844
2845 static int acquire_credentials(
2846                 const ExecContext *context,
2847                 const ExecParameters *params,
2848                 const char *unit,
2849                 const char *p,
2850                 uid_t uid,
2851                 bool ownership_ok) {
2852
2853         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2854         _cleanup_close_ int dfd = -1;
2855         ExecLoadCredential *lc;
2856         ExecSetCredential *sc;
2857         int r;
2858
2859         assert(context);
2860         assert(p);
2861
2862         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2863         if (dfd < 0)
2864                 return -errno;
2865
2866         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2867         HASHMAP_FOREACH(lc, context->load_credentials) {
2868                 _cleanup_close_ int sub_fd = -1;
2869
2870                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2871                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2872                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
2873                  * propagate a credential passed to us from further up. */
2874
2875                 if (path_is_absolute(lc->path)) {
2876                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2877                         if (sub_fd < 0 && !IN_SET(errno,
2878                                                   ENOTDIR,  /* Not a directory */
2879                                                   ENOENT))  /* Doesn't exist? */
2880                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2881                 }
2882
2883                 if (sub_fd < 0)
2884                         /* Regular file (incl. a credential passed in from higher up) */
2885                         r = load_credential(
2886                                         context,
2887                                         params,
2888                                         lc->id,
2889                                         lc->path,
2890                                         lc->encrypted,
2891                                         unit,
2892                                         -1,
2893                                         dfd,
2894                                         uid,
2895                                         ownership_ok,
2896                                         &left);
2897                 else
2898                         /* Directory */
2899                         r = recurse_dir(
2900                                         sub_fd,
2901                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2902                                         /* statx_mask= */ 0,
2903                                         /* n_depth_max= */ UINT_MAX,
2904                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2905                                         load_cred_recurse_dir_cb,
2906                                         &(struct load_cred_args) {
2907                                                 .context = context,
2908                                                 .params = params,
2909                                                 .encrypted = lc->encrypted,
2910                                                 .unit = unit,
2911                                                 .dfd = dfd,
2912                                                 .uid = uid,
2913                                                 .ownership_ok = ownership_ok,
2914                                                 .left = &left,
2915                                         });
2916                 if (r < 0)
2917                         return r;
2918         }
2919
2920         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2921          * them, so that they can act as a "default" if the same credential is specified multiple times. */
2922         HASHMAP_FOREACH(sc, context->set_credentials) {
2923                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2924                 const char *data;
2925                 size_t size, add;
2926
2927                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2928                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2929                  * slow and involved, hence it's nice to be able to skip that if the credential already
2930                  * exists anyway. */
2931                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2932                         continue;
2933                 if (errno != ENOENT)
2934                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2935
2936                 if (sc->encrypted) {
2937                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2938                         if (r < 0)
2939                                 return r;
2940
2941                         data = plaintext;
2942                 } else {
2943                         data = sc->data;
2944                         size = sc->size;
2945                 }
2946
2947                 add = strlen(sc->id) + size;
2948                 if (add > left)
2949                         return -E2BIG;
2950
2951                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2952                 if (r < 0)
2953                         return r;
2954
2955                 left -= add;
2956         }
2957
2958         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2959                 return -errno;
2960
2961         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2962          * accessible */
2963
2964         if (uid_is_valid(uid) && uid != getuid()) {
2965                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2966                 if (r < 0) {
2967                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2968                                 return r;
2969
2970                         if (!ownership_ok)
2971                                 return r;
2972
2973                         if (fchown(dfd, uid, GID_INVALID) < 0)
2974                                 return -errno;
2975                 }
2976         }
2977
2978         return 0;
2979 }
2980
2981 static int setup_credentials_internal(
2982                 const ExecContext *context,
2983                 const ExecParameters *params,
2984                 const char *unit,
2985                 const char *final,        /* This is where the credential store shall eventually end up at */
2986                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2987                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2988                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2989                 uid_t uid) {
2990
2991         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2992                                    * if we mounted something; false if we definitely can't mount anything */
2993         bool final_mounted;
2994         const char *where;
2995
2996         assert(context);
2997         assert(final);
2998         assert(workspace);
2999
3000         if (reuse_workspace) {
3001                 r = path_is_mount_point(workspace, NULL, 0);
3002                 if (r < 0)
3003                         return r;
3004                 if (r > 0)
3005                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3006                 else
3007                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3008         } else
3009                 workspace_mounted = -1; /* ditto */
3010
3011         r = path_is_mount_point(final, NULL, 0);
3012         if (r < 0)
3013                 return r;
3014         if (r > 0) {
3015                 /* If the final place already has something mounted, we use that. If the workspace also has
3016                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3017                  * different). */
3018                 final_mounted = true;
3019
3020                 if (workspace_mounted < 0) {
3021                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3022                          * the final version to the workspace, and make it writable, so that we can make
3023                          * changes */
3024
3025                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3026                         if (r < 0)
3027                                 return r;
3028
3029                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3030                         if (r < 0)
3031                                 return r;
3032
3033                         workspace_mounted = true;
3034                 }
3035         } else
3036                 final_mounted = false;
3037
3038         if (workspace_mounted < 0) {
3039                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3040                 for (int try = 0;; try++) {
3041
3042                         if (try == 0) {
3043                                 /* Try "ramfs" first, since it's not swap backed */
3044                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3045                                 if (r >= 0) {
3046                                         workspace_mounted = true;
3047                                         break;
3048                                 }
3049
3050                         } else if (try == 1) {
3051                                 _cleanup_free_ char *opts = NULL;
3052
3053                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3054                                         return -ENOMEM;
3055
3056                                 /* Fall back to "tmpfs" otherwise */
3057                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3058                                 if (r >= 0) {
3059                                         workspace_mounted = true;
3060                                         break;
3061                                 }
3062
3063                         } else {
3064                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3065                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3066                                 if (r < 0) {
3067                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3068                                                 return r;
3069
3070                                         if (must_mount) /* If we it's not OK to use the plain directory
3071                                                          * fallback, propagate all errors too */
3072                                                 return r;
3073
3074                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3075                                          * proceed for compat with container envs, and just use the final dir
3076                                          * as is. */
3077
3078                                         workspace_mounted = false;
3079                                         break;
3080                                 }
3081
3082                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3083                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3084                                 if (r < 0)
3085                                         return r;
3086
3087                                 workspace_mounted = true;
3088                                 break;
3089                         }
3090                 }
3091         }
3092
3093         assert(!must_mount || workspace_mounted > 0);
3094         where = workspace_mounted ? workspace : final;
3095
3096         (void) label_fix_full(AT_FDCWD, where, final, 0);
3097
3098         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3099         if (r < 0)
3100                 return r;
3101
3102         if (workspace_mounted) {
3103                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3104                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3105                 if (r < 0)
3106                         return r;
3107
3108                 /* And mount it to the final place, read-only */
3109                 if (final_mounted)
3110                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3111                 else
3112                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3113                 if (r < 0)
3114                         return r;
3115         } else {
3116                 _cleanup_free_ char *parent = NULL;
3117
3118                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3119                  * open access to the top-level credential directory and the per-service directory now */
3120
3121                 r = path_extract_directory(final, &parent);
3122                 if (r < 0)
3123                         return r;
3124                 if (chmod(parent, 0755) < 0)
3125                         return -errno;
3126         }
3127
3128         return 0;
3129 }
3130
3131 static int setup_credentials(
3132                 const ExecContext *context,
3133                 const ExecParameters *params,
3134                 const char *unit,
3135                 uid_t uid) {
3136
3137         _cleanup_free_ char *p = NULL, *q = NULL;
3138         int r;
3139
3140         assert(context);
3141         assert(params);
3142
3143         if (!exec_context_has_credentials(context))
3144                 return 0;
3145
3146         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3147                 return -EINVAL;
3148
3149         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3150          * and the subdir we mount over with a read-only file system readable by the service's user */
3151         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3152         if (!q)
3153                 return -ENOMEM;
3154
3155         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3156         if (r < 0 && r != -EEXIST)
3157                 return r;
3158
3159         p = path_join(q, unit);
3160         if (!p)
3161                 return -ENOMEM;
3162
3163         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3164         if (r < 0 && r != -EEXIST)
3165                 return r;
3166
3167         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3168         if (r < 0) {
3169                 _cleanup_free_ char *t = NULL, *u = NULL;
3170
3171                 /* If this is not a privilege or support issue then propagate the error */
3172                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3173                         return r;
3174
3175                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3176                  * it into place, so that users can't access half-initialized credential stores. */
3177                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3178                 if (!t)
3179                         return -ENOMEM;
3180
3181                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3182                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3183                  * after it is fully set up */
3184                 u = path_join(t, unit);
3185                 if (!u)
3186                         return -ENOMEM;
3187
3188                 FOREACH_STRING(i, t, u) {
3189                         r = mkdir_label(i, 0700);
3190                         if (r < 0 && r != -EEXIST)
3191                                 return r;
3192                 }
3193
3194                 r = setup_credentials_internal(
3195                                 context,
3196                                 params,
3197                                 unit,
3198                                 p,       /* final mount point */
3199                                 u,       /* temporary workspace to overmount */
3200                                 true,    /* reuse the workspace if it is already a mount */
3201                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3202                                 uid);
3203
3204                 (void) rmdir(u); /* remove the workspace again if we can. */
3205
3206                 if (r < 0)
3207                         return r;
3208
3209         } else if (r == 0) {
3210
3211                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3212                  * we can use the same directory for all cases, after turning off propagation. Question
3213                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3214                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3215                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3216                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3217                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3218                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3219                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3220                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3221                  * propagation on the former, and then overmount the latter.
3222                  *
3223                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3224                  * for this purpose, but there are few other candidates that work equally well for us, and
3225                  * given that the we do this in a privately namespaced short-lived single-threaded process
3226                  * that no one else sees this should be OK to do. */
3227
3228                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3229                 if (r < 0)
3230                         goto child_fail;
3231
3232                 r = setup_credentials_internal(
3233                                 context,
3234                                 params,
3235                                 unit,
3236                                 p,           /* final mount point */
3237                                 "/dev/shm",  /* temporary workspace to overmount */
3238                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3239                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3240                                 uid);
3241                 if (r < 0)
3242                         goto child_fail;
3243
3244                 _exit(EXIT_SUCCESS);
3245
3246         child_fail:
3247                 _exit(EXIT_FAILURE);
3248         }
3249
3250         return 0;
3251 }
3252
3253 #if ENABLE_SMACK
3254 static int setup_smack(
3255                 const Manager *manager,
3256                 const ExecContext *context,
3257                 int executable_fd) {
3258         int r;
3259
3260         assert(context);
3261         assert(executable_fd >= 0);
3262
3263         if (context->smack_process_label) {
3264                 r = mac_smack_apply_pid(0, context->smack_process_label);
3265                 if (r < 0)
3266                         return r;
3267         } else if (manager->default_smack_process_label) {
3268                 _cleanup_free_ char *exec_label = NULL;
3269
3270                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3271                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3272                         return r;
3273
3274                 r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
3275                 if (r < 0)
3276                         return r;
3277         }
3278
3279         return 0;
3280 }
3281 #endif
3282
3283 static int compile_bind_mounts(
3284                 const ExecContext *context,
3285                 const ExecParameters *params,
3286                 BindMount **ret_bind_mounts,
3287                 size_t *ret_n_bind_mounts,
3288                 char ***ret_empty_directories) {
3289
3290         _cleanup_strv_free_ char **empty_directories = NULL;
3291         BindMount *bind_mounts;
3292         size_t n, h = 0;
3293         int r;
3294
3295         assert(context);
3296         assert(params);
3297         assert(ret_bind_mounts);
3298         assert(ret_n_bind_mounts);
3299         assert(ret_empty_directories);
3300
3301         n = context->n_bind_mounts;
3302         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3303                 if (!params->prefix[t])
3304                         continue;
3305
3306                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3307                         n += !context->directories[t].items[i].only_create;
3308         }
3309
3310         if (n <= 0) {
3311                 *ret_bind_mounts = NULL;
3312                 *ret_n_bind_mounts = 0;
3313                 *ret_empty_directories = NULL;
3314                 return 0;
3315         }
3316
3317         bind_mounts = new(BindMount, n);
3318         if (!bind_mounts)
3319                 return -ENOMEM;
3320
3321         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3322                 BindMount *item = context->bind_mounts + i;
3323                 char *s, *d;
3324
3325                 s = strdup(item->source);
3326                 if (!s) {
3327                         r = -ENOMEM;
3328                         goto finish;
3329                 }
3330
3331                 d = strdup(item->destination);
3332                 if (!d) {
3333                         free(s);
3334                         r = -ENOMEM;
3335                         goto finish;
3336                 }
3337
3338                 bind_mounts[h++] = (BindMount) {
3339                         .source = s,
3340                         .destination = d,
3341                         .read_only = item->read_only,
3342                         .recursive = item->recursive,
3343                         .ignore_enoent = item->ignore_enoent,
3344                 };
3345         }
3346
3347         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3348                 if (!params->prefix[t])
3349                         continue;
3350
3351                 if (context->directories[t].n_items == 0)
3352                         continue;
3353
3354                 if (exec_directory_is_private(context, t) &&
3355                     !exec_context_with_rootfs(context)) {
3356                         char *private_root;
3357
3358                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3359                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3360                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3361
3362                         private_root = path_join(params->prefix[t], "private");
3363                         if (!private_root) {
3364                                 r = -ENOMEM;
3365                                 goto finish;
3366                         }
3367
3368                         r = strv_consume(&empty_directories, private_root);
3369                         if (r < 0)
3370                                 goto finish;
3371                 }
3372
3373                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3374                         char *s, *d;
3375
3376                         /* When one of the parent directories is in the list, we cannot create the symlink
3377                          * for the child directory. See also the comments in setup_exec_directory(). */
3378                         if (context->directories[t].items[i].only_create)
3379                                 continue;
3380
3381                         if (exec_directory_is_private(context, t))
3382                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3383                         else
3384                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3385                         if (!s) {
3386                                 r = -ENOMEM;
3387                                 goto finish;
3388                         }
3389
3390                         if (exec_directory_is_private(context, t) &&
3391                             exec_context_with_rootfs(context))
3392                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3393                                  * directory is not created on the root directory. So, let's bind-mount the directory
3394                                  * on the 'non-private' place. */
3395                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3396                         else
3397                                 d = strdup(s);
3398                         if (!d) {
3399                                 free(s);
3400                                 r = -ENOMEM;
3401                                 goto finish;
3402                         }
3403
3404                         bind_mounts[h++] = (BindMount) {
3405                                 .source = s,
3406                                 .destination = d,
3407                                 .read_only = false,
3408                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3409                                 .recursive = true,
3410                                 .ignore_enoent = false,
3411                         };
3412                 }
3413         }
3414
3415         assert(h == n);
3416
3417         *ret_bind_mounts = bind_mounts;
3418         *ret_n_bind_mounts = n;
3419         *ret_empty_directories = TAKE_PTR(empty_directories);
3420
3421         return (int) n;
3422
3423 finish:
3424         bind_mount_free_many(bind_mounts, h);
3425         return r;
3426 }
3427
3428 /* ret_symlinks will contain a list of pairs src:dest that describes
3429  * the symlinks to create later on. For example, the symlinks needed
3430  * to safely give private directories to DynamicUser=1 users. */
3431 static int compile_symlinks(
3432                 const ExecContext *context,
3433                 const ExecParameters *params,
3434                 char ***ret_symlinks) {
3435
3436         _cleanup_strv_free_ char **symlinks = NULL;
3437         int r;
3438
3439         assert(context);
3440         assert(params);
3441         assert(ret_symlinks);
3442
3443         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3444                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3445                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3446
3447                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3448                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3449
3450                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3451                                 dst_abs = path_join(params->prefix[dt], *symlink);
3452                                 if (!src_abs || !dst_abs)
3453                                         return -ENOMEM;
3454
3455                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3456                                 if (r < 0)
3457                                         return r;
3458                         }
3459
3460                         if (!exec_directory_is_private(context, dt) ||
3461                             exec_context_with_rootfs(context) ||
3462                             context->directories[dt].items[i].only_create)
3463                                 continue;
3464
3465                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3466                         if (!private_path)
3467                                 return -ENOMEM;
3468
3469                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3470                         if (!path)
3471                                 return -ENOMEM;
3472
3473                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3474                         if (r < 0)
3475                                 return r;
3476                 }
3477         }
3478
3479         *ret_symlinks = TAKE_PTR(symlinks);
3480
3481         return 0;
3482 }
3483
3484 static bool insist_on_sandboxing(
3485                 const ExecContext *context,
3486                 const char *root_dir,
3487                 const char *root_image,
3488                 const BindMount *bind_mounts,
3489                 size_t n_bind_mounts) {
3490
3491         assert(context);
3492         assert(n_bind_mounts == 0 || bind_mounts);
3493
3494         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3495          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3496          * rearrange stuff in a way we cannot ignore gracefully. */
3497
3498         if (context->n_temporary_filesystems > 0)
3499                 return true;
3500
3501         if (root_dir || root_image)
3502                 return true;
3503
3504         if (context->n_mount_images > 0)
3505                 return true;
3506
3507         if (context->dynamic_user)
3508                 return true;
3509
3510         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3511                 return true;
3512
3513         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3514          * essential. */
3515         for (size_t i = 0; i < n_bind_mounts; i++)
3516                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3517                         return true;
3518
3519         if (context->log_namespace)
3520                 return true;
3521
3522         return false;
3523 }
3524
3525 static int apply_mount_namespace(
3526                 const Unit *u,
3527                 ExecCommandFlags command_flags,
3528                 const ExecContext *context,
3529                 const ExecParameters *params,
3530                 const ExecRuntime *runtime,
3531                 char **error_path) {
3532
3533         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3534         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3535         const char *root_dir = NULL, *root_image = NULL;
3536         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3537                         *extension_dir = NULL;
3538         NamespaceInfo ns_info;
3539         bool needs_sandboxing;
3540         BindMount *bind_mounts = NULL;
3541         size_t n_bind_mounts = 0;
3542         int r;
3543
3544         assert(context);
3545
3546         if (params->flags & EXEC_APPLY_CHROOT) {
3547                 root_image = context->root_image;
3548
3549                 if (!root_image)
3550                         root_dir = context->root_directory;
3551         }
3552
3553         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3554         if (r < 0)
3555                 return r;
3556
3557         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3558         r = compile_symlinks(context, params, &symlinks);
3559         if (r < 0)
3560                 goto finalize;
3561
3562         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3563         if (needs_sandboxing) {
3564                 /* The runtime struct only contains the parent of the private /tmp,
3565                  * which is non-accessible to world users. Inside of it there's a /tmp
3566                  * that is sticky, and that's the one we want to use here.
3567                  * This does not apply when we are using /run/systemd/empty as fallback. */
3568
3569                 if (context->private_tmp && runtime) {
3570                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3571                                 tmp_dir = runtime->tmp_dir;
3572                         else if (runtime->tmp_dir)
3573                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3574
3575                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3576                                 var_tmp_dir = runtime->var_tmp_dir;
3577                         else if (runtime->var_tmp_dir)
3578                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3579                 }
3580
3581                 ns_info = (NamespaceInfo) {
3582                         .ignore_protect_paths = false,
3583                         .private_dev = context->private_devices,
3584                         .protect_control_groups = context->protect_control_groups,
3585                         .protect_kernel_tunables = context->protect_kernel_tunables,
3586                         .protect_kernel_modules = context->protect_kernel_modules,
3587                         .protect_kernel_logs = context->protect_kernel_logs,
3588                         .protect_hostname = context->protect_hostname,
3589                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3590                         .private_mounts = context->private_mounts,
3591                         .protect_home = context->protect_home,
3592                         .protect_system = context->protect_system,
3593                         .protect_proc = context->protect_proc,
3594                         .proc_subset = context->proc_subset,
3595                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3596                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3597                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3598                 };
3599         } else if (!context->dynamic_user && root_dir)
3600                 /*
3601                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3602                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3603                  * fail if we are enable to apply the sandbox inside the mount namespace.
3604                  */
3605                 ns_info = (NamespaceInfo) {
3606                         .ignore_protect_paths = true,
3607                 };
3608         else
3609                 ns_info = (NamespaceInfo) {};
3610
3611         if (context->mount_flags == MS_SHARED)
3612                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3613
3614         if (exec_context_has_credentials(context) &&
3615             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3616             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3617                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3618                 if (!creds_path) {
3619                         r = -ENOMEM;
3620                         goto finalize;
3621                 }
3622         }
3623
3624         if (MANAGER_IS_SYSTEM(u->manager)) {
3625                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3626                 if (!propagate_dir) {
3627                         r = -ENOMEM;
3628                         goto finalize;
3629                 }
3630
3631                 incoming_dir = strdup("/run/systemd/incoming");
3632                 if (!incoming_dir) {
3633                         r = -ENOMEM;
3634                         goto finalize;
3635                 }
3636
3637                 extension_dir = strdup("/run/systemd/unit-extensions");
3638                 if (!extension_dir) {
3639                         r = -ENOMEM;
3640                         goto finalize;
3641                 }
3642         } else
3643                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3644                         r = -ENOMEM;
3645                         goto finalize;
3646                 }
3647
3648         r = setup_namespace(root_dir, root_image, context->root_image_options,
3649                             &ns_info, context->read_write_paths,
3650                             needs_sandboxing ? context->read_only_paths : NULL,
3651                             needs_sandboxing ? context->inaccessible_paths : NULL,
3652                             needs_sandboxing ? context->exec_paths : NULL,
3653                             needs_sandboxing ? context->no_exec_paths : NULL,
3654                             empty_directories,
3655                             symlinks,
3656                             bind_mounts,
3657                             n_bind_mounts,
3658                             context->temporary_filesystems,
3659                             context->n_temporary_filesystems,
3660                             context->mount_images,
3661                             context->n_mount_images,
3662                             tmp_dir,
3663                             var_tmp_dir,
3664                             creds_path,
3665                             context->log_namespace,
3666                             context->mount_flags,
3667                             context->root_hash, context->root_hash_size, context->root_hash_path,
3668                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3669                             context->root_verity,
3670                             context->extension_images,
3671                             context->n_extension_images,
3672                             context->extension_directories,
3673                             propagate_dir,
3674                             incoming_dir,
3675                             extension_dir,
3676                             root_dir || root_image ? params->notify_socket : NULL,
3677                             error_path);
3678
3679         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3680          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3681          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3682          * completely different execution environment. */
3683         if (r == -ENOANO) {
3684                 if (insist_on_sandboxing(
3685                                     context,
3686                                     root_dir, root_image,
3687                                     bind_mounts,
3688                                     n_bind_mounts)) {
3689                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3690                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3691                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3692
3693                         r = -EOPNOTSUPP;
3694                 } else {
3695                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3696                         r = 0;
3697                 }
3698         }
3699
3700 finalize:
3701         bind_mount_free_many(bind_mounts, n_bind_mounts);
3702         return r;
3703 }
3704
3705 static int apply_working_directory(
3706                 const ExecContext *context,
3707                 const ExecParameters *params,
3708                 const char *home,
3709                 int *exit_status) {
3710
3711         const char *d, *wd;
3712
3713         assert(context);
3714         assert(exit_status);
3715
3716         if (context->working_directory_home) {
3717
3718                 if (!home) {
3719                         *exit_status = EXIT_CHDIR;
3720                         return -ENXIO;
3721                 }
3722
3723                 wd = home;
3724
3725         } else
3726                 wd = empty_to_root(context->working_directory);
3727
3728         if (params->flags & EXEC_APPLY_CHROOT)
3729                 d = wd;
3730         else
3731                 d = prefix_roota(context->root_directory, wd);
3732
3733         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3734                 *exit_status = EXIT_CHDIR;
3735                 return -errno;
3736         }
3737
3738         return 0;
3739 }
3740
3741 static int apply_root_directory(
3742                 const ExecContext *context,
3743                 const ExecParameters *params,
3744                 const bool needs_mount_ns,
3745                 int *exit_status) {
3746
3747         assert(context);
3748         assert(exit_status);
3749
3750         if (params->flags & EXEC_APPLY_CHROOT)
3751                 if (!needs_mount_ns && context->root_directory)
3752                         if (chroot(context->root_directory) < 0) {
3753                                 *exit_status = EXIT_CHROOT;
3754                                 return -errno;
3755                         }
3756
3757         return 0;
3758 }
3759
3760 static int setup_keyring(
3761                 const Unit *u,
3762                 const ExecContext *context,
3763                 const ExecParameters *p,
3764                 uid_t uid, gid_t gid) {
3765
3766         key_serial_t keyring;
3767         int r = 0;
3768         uid_t saved_uid;
3769         gid_t saved_gid;
3770
3771         assert(u);
3772         assert(context);
3773         assert(p);
3774
3775         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3776          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3777          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3778          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3779          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3780          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3781
3782         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3783                 return 0;
3784
3785         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3786          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3787          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3788          * & group is just as nasty as acquiring a reference to the user keyring. */
3789
3790         saved_uid = getuid();
3791         saved_gid = getgid();
3792
3793         if (gid_is_valid(gid) && gid != saved_gid) {
3794                 if (setregid(gid, -1) < 0)
3795                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3796         }
3797
3798         if (uid_is_valid(uid) && uid != saved_uid) {
3799                 if (setreuid(uid, -1) < 0) {
3800                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3801                         goto out;
3802                 }
3803         }
3804
3805         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3806         if (keyring == -1) {
3807                 if (errno == ENOSYS)
3808                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3809                 else if (ERRNO_IS_PRIVILEGE(errno))
3810                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3811                 else if (errno == EDQUOT)
3812                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3813                 else
3814                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3815
3816                 goto out;
3817         }
3818
3819         /* When requested link the user keyring into the session keyring. */
3820         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3821
3822                 if (keyctl(KEYCTL_LINK,
3823                            KEY_SPEC_USER_KEYRING,
3824                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3825                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3826                         goto out;
3827                 }
3828         }
3829
3830         /* Restore uid/gid back */
3831         if (uid_is_valid(uid) && uid != saved_uid) {
3832                 if (setreuid(saved_uid, -1) < 0) {
3833                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3834                         goto out;
3835                 }
3836         }
3837
3838         if (gid_is_valid(gid) && gid != saved_gid) {
3839                 if (setregid(saved_gid, -1) < 0)
3840                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3841         }
3842
3843         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3844         if (!sd_id128_is_null(u->invocation_id)) {
3845                 key_serial_t key;
3846
3847                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3848                 if (key == -1)
3849                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3850                 else {
3851                         if (keyctl(KEYCTL_SETPERM, key,
3852                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3853                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3854                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3855                 }
3856         }
3857
3858 out:
3859         /* Revert back uid & gid for the last time, and exit */
3860         /* no extra logging, as only the first already reported error matters */
3861         if (getuid() != saved_uid)
3862                 (void) setreuid(saved_uid, -1);
3863
3864         if (getgid() != saved_gid)
3865                 (void) setregid(saved_gid, -1);
3866
3867         return r;
3868 }
3869
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        /* Appends every valid (i.e. non-negative) fd of the socket pair to array[], bumping *n for each
         * entry stored. The caller has to make sure array[] has room for up to two more entries. */

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3880
3881 static int close_remaining_fds(
3882                 const ExecParameters *params,
3883                 const ExecRuntime *runtime,
3884                 const DynamicCreds *dcreds,
3885                 int user_lookup_fd,
3886                 int socket_fd,
3887                 const int *fds, size_t n_fds) {
3888
3889         size_t n_dont_close = 0;
3890         int dont_close[n_fds + 12];
3891
3892         assert(params);
3893
3894         if (params->stdin_fd >= 0)
3895                 dont_close[n_dont_close++] = params->stdin_fd;
3896         if (params->stdout_fd >= 0)
3897                 dont_close[n_dont_close++] = params->stdout_fd;
3898         if (params->stderr_fd >= 0)
3899                 dont_close[n_dont_close++] = params->stderr_fd;
3900
3901         if (socket_fd >= 0)
3902                 dont_close[n_dont_close++] = socket_fd;
3903         if (n_fds > 0) {
3904                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3905                 n_dont_close += n_fds;
3906         }
3907
3908         if (runtime) {
3909                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3910                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3911         }
3912
3913         if (dcreds) {
3914                 if (dcreds->user)
3915                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3916                 if (dcreds->group)
3917                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3918         }
3919
3920         if (user_lookup_fd >= 0)
3921                 dont_close[n_dont_close++] = user_lookup_fd;
3922
3923         return close_all_fds(dont_close, n_dont_close);
3924 }
3925
3926 static int send_user_lookup(
3927                 Unit *unit,
3928                 int user_lookup_fd,
3929                 uid_t uid,
3930                 gid_t gid) {
3931
3932         assert(unit);
3933
3934         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3935          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3936          * specified. */
3937
3938         if (user_lookup_fd < 0)
3939                 return 0;
3940
3941         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3942                 return 0;
3943
3944         if (writev(user_lookup_fd,
3945                (struct iovec[]) {
3946                            IOVEC_INIT(&uid, sizeof(uid)),
3947                            IOVEC_INIT(&gid, sizeof(gid)),
3948                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3949                 return -errno;
3950
3951         return 0;
3952 }
3953
3954 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3955         int r;
3956
3957         assert(c);
3958         assert(home);
3959         assert(buf);
3960
3961         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3962
3963         if (*home)
3964                 return 0;
3965
3966         if (!c->working_directory_home)
3967                 return 0;
3968
3969         r = get_home_dir(buf);
3970         if (r < 0)
3971                 return r;
3972
3973         *home = *buf;
3974         return 1;
3975 }
3976
3977 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3978         _cleanup_strv_free_ char ** list = NULL;
3979         int r;
3980
3981         assert(c);
3982         assert(p);
3983         assert(ret);
3984
3985         assert(c->dynamic_user);
3986
3987         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3988          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3989          * directories. */
3990
3991         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3992                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3993                         continue;
3994
3995                 if (!p->prefix[t])
3996                         continue;
3997
3998                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3999                         char *e;
4000
4001                         if (exec_directory_is_private(c, t))
4002                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4003                         else
4004                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4005                         if (!e)
4006                                 return -ENOMEM;
4007
4008                         r = strv_consume(&list, e);
4009                         if (r < 0)
4010                                 return r;
4011                 }
4012         }
4013
4014         *ret = TAKE_PTR(list);
4015
4016         return 0;
4017 }
4018
4019 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4020         bool using_subcgroup;
4021         char *p;
4022
4023         assert(params);
4024         assert(ret);
4025
4026         if (!params->cgroup_path)
4027                 return -EINVAL;
4028
4029         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4030          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4031          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4032          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4033          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4034          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4035          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4036          * flag, which is only passed for the former statements, not for the latter. */
4037
4038         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4039         if (using_subcgroup)
4040                 p = path_join(params->cgroup_path, ".control");
4041         else
4042                 p = strdup(params->cgroup_path);
4043         if (!p)
4044                 return -ENOMEM;
4045
4046         *ret = p;
4047         return using_subcgroup;
4048 }
4049
4050 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4051         _cleanup_(cpu_set_reset) CPUSet s = {};
4052         int r;
4053
4054         assert(c);
4055         assert(ret);
4056
4057         if (!c->numa_policy.nodes.set) {
4058                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4059                 return 0;
4060         }
4061
4062         r = numa_to_cpu_set(&c->numa_policy, &s);
4063         if (r < 0)
4064                 return r;
4065
4066         cpu_set_reset(ret);
4067
4068         return cpu_set_add_all(ret, &s);
4069 }
4070
4071 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4072         assert(c);
4073
4074         return c->cpu_affinity_from_numa;
4075 }
4076
4077 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4078         int r;
4079
4080         assert(fds);
4081         assert(n_fds);
4082         assert(*n_fds < fds_size);
4083         assert(ret_fd);
4084
4085         if (fd < 0) {
4086                 *ret_fd = -1;
4087                 return 0;
4088         }
4089
4090         if (fd < 3 + (int) *n_fds) {
4091                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4092                  * the fds we pass to the process (or which are closed only during execve). */
4093
4094                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4095                 if (r < 0)
4096                         return -errno;
4097
4098                 close_and_replace(fd, r);
4099         }
4100
4101         *ret_fd = fds[*n_fds] = fd;
4102         (*n_fds) ++;
4103         return 1;
4104 }
4105
4106 static int exec_child(
4107                 Unit *unit,
4108                 const ExecCommand *command,
4109                 const ExecContext *context,
4110                 const ExecParameters *params,
4111                 ExecRuntime *runtime,
4112                 DynamicCreds *dcreds,
4113                 int socket_fd,
4114                 const int named_iofds[static 3],
4115                 int *fds,
4116                 size_t n_socket_fds,
4117                 size_t n_storage_fds,
4118                 char **files_env,
4119                 int user_lookup_fd,
4120                 int *exit_status) {
4121
4122         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4123         int r, ngids = 0, exec_fd;
4124         _cleanup_free_ gid_t *supplementary_gids = NULL;
4125         const char *username = NULL, *groupname = NULL;
4126         _cleanup_free_ char *home_buffer = NULL;
4127         const char *home = NULL, *shell = NULL;
4128         char **final_argv = NULL;
4129         dev_t journal_stream_dev = 0;
4130         ino_t journal_stream_ino = 0;
4131         bool userns_set_up = false;
4132         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4133                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4134                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4135                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4136 #if HAVE_SELINUX
4137         _cleanup_free_ char *mac_selinux_context_net = NULL;
4138         bool use_selinux = false;
4139 #endif
4140 #if ENABLE_SMACK
4141         bool use_smack = false;
4142 #endif
4143 #if HAVE_APPARMOR
4144         bool use_apparmor = false;
4145 #endif
4146         uid_t saved_uid = getuid();
4147         gid_t saved_gid = getgid();
4148         uid_t uid = UID_INVALID;
4149         gid_t gid = GID_INVALID;
4150         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4151                n_keep_fds; /* total number of fds not to close */
4152         int secure_bits;
4153         _cleanup_free_ gid_t *gids_after_pam = NULL;
4154         int ngids_after_pam = 0;
4155
4156         assert(unit);
4157         assert(command);
4158         assert(context);
4159         assert(params);
4160         assert(exit_status);
4161
4162         /* Explicitly test for CVE-2021-4034 inspired invocations */
4163         assert(command->path);
4164         assert(!strv_isempty(command->argv));
4165
4166         rename_process_from_path(command->path);
4167
4168         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4169          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4170          * both of which will be demoted to SIG_DFL. */
4171         (void) default_signals(SIGNALS_CRASH_HANDLER,
4172                                SIGNALS_IGNORE);
4173
4174         if (context->ignore_sigpipe)
4175                 (void) ignore_signals(SIGPIPE);
4176
4177         r = reset_signal_mask();
4178         if (r < 0) {
4179                 *exit_status = EXIT_SIGNAL_MASK;
4180                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4181         }
4182
4183         if (params->idle_pipe)
4184                 do_idle_pipe_dance(params->idle_pipe);
4185
4186         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4187          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4188          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4189          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4190
4191         log_forget_fds();
4192         log_set_open_when_needed(true);
4193
4194         /* In case anything used libc syslog(), close this here, too */
4195         closelog();
4196
4197         int keep_fds[n_fds + 3];
4198         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4199         n_keep_fds = n_fds;
4200
4201         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4202         if (r < 0) {
4203                 *exit_status = EXIT_FDS;
4204                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4205         }
4206
4207 #if HAVE_LIBBPF
4208         if (unit->manager->restrict_fs) {
4209                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4210                 if (bpf_map_fd < 0) {
4211                         *exit_status = EXIT_FDS;
4212                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4213                 }
4214
4215                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4216                 if (r < 0) {
4217                         *exit_status = EXIT_FDS;
4218                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4219                 }
4220         }
4221 #endif
4222
4223         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4224         if (r < 0) {
4225                 *exit_status = EXIT_FDS;
4226                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4227         }
4228
4229         if (!context->same_pgrp &&
4230             setsid() < 0) {
4231                 *exit_status = EXIT_SETSID;
4232                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4233         }
4234
4235         exec_context_tty_reset(context, params);
4236
4237         if (unit_shall_confirm_spawn(unit)) {
4238                 _cleanup_free_ char *cmdline = NULL;
4239
4240                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4241                 if (!cmdline) {
4242                         *exit_status = EXIT_MEMORY;
4243                         return log_oom();
4244                 }
4245
4246                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4247                 if (r != CONFIRM_EXECUTE) {
4248                         if (r == CONFIRM_PRETEND_SUCCESS) {
4249                                 *exit_status = EXIT_SUCCESS;
4250                                 return 0;
4251                         }
4252                         *exit_status = EXIT_CONFIRM;
4253                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4254                                                     "Execution cancelled by the user");
4255                 }
4256         }
4257
4258         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4259          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4260          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4261          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4262          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4263         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4264             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4265                 *exit_status = EXIT_MEMORY;
4266                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4267         }
4268
4269         if (context->dynamic_user && dcreds) {
4270                 _cleanup_strv_free_ char **suggested_paths = NULL;
4271
4272                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4273                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4274                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4275                         *exit_status = EXIT_USER;
4276                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4277                 }
4278
4279                 r = compile_suggested_paths(context, params, &suggested_paths);
4280                 if (r < 0) {
4281                         *exit_status = EXIT_MEMORY;
4282                         return log_oom();
4283                 }
4284
4285                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4286                 if (r < 0) {
4287                         *exit_status = EXIT_USER;
4288                         if (r == -EILSEQ)
4289                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4290                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4291                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4292                 }
4293
4294                 if (!uid_is_valid(uid)) {
4295                         *exit_status = EXIT_USER;
4296                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4297                 }
4298
4299                 if (!gid_is_valid(gid)) {
4300                         *exit_status = EXIT_USER;
4301                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4302                 }
4303
4304                 if (dcreds->user)
4305                         username = dcreds->user->name;
4306
4307         } else {
4308                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4309                 if (r < 0) {
4310                         *exit_status = EXIT_USER;
4311                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4312                 }
4313
4314                 r = get_fixed_group(context, &groupname, &gid);
4315                 if (r < 0) {
4316                         *exit_status = EXIT_GROUP;
4317                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4318                 }
4319         }
4320
4321         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4322         r = get_supplementary_groups(context, username, groupname, gid,
4323                                      &supplementary_gids, &ngids);
4324         if (r < 0) {
4325                 *exit_status = EXIT_GROUP;
4326                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4327         }
4328
4329         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4330         if (r < 0) {
4331                 *exit_status = EXIT_USER;
4332                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4333         }
4334
4335         user_lookup_fd = safe_close(user_lookup_fd);
4336
4337         r = acquire_home(context, uid, &home, &home_buffer);
4338         if (r < 0) {
4339                 *exit_status = EXIT_CHDIR;
4340                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4341         }
4342
4343         /* If a socket is connected to STDIN/STDOUT/STDERR, we
4344          * must sure to drop O_NONBLOCK */
4345         if (socket_fd >= 0)
4346                 (void) fd_nonblock(socket_fd, false);
4347
4348         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4349          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4350         if (params->cgroup_path) {
4351                 _cleanup_free_ char *p = NULL;
4352
4353                 r = exec_parameters_get_cgroup_path(params, &p);
4354                 if (r < 0) {
4355                         *exit_status = EXIT_CGROUP;
4356                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4357                 }
4358
4359                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4360                 if (r == -EUCLEAN) {
4361                         *exit_status = EXIT_CGROUP;
4362                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4363                                                     "because the cgroup or one of its parents or "
4364                                                     "siblings is in the threaded mode: %m", p);
4365                 }
4366                 if (r < 0) {
4367                         *exit_status = EXIT_CGROUP;
4368                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4369                 }
4370         }
4371
4372         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4373                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4374                 if (r < 0) {
4375                         *exit_status = EXIT_NETWORK;
4376                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4377                 }
4378         }
4379
4380         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4381                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4382                 if (r < 0) {
4383                         *exit_status = EXIT_NAMESPACE;
4384                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4385                 }
4386         }
4387
4388         r = setup_input(context, params, socket_fd, named_iofds);
4389         if (r < 0) {
4390                 *exit_status = EXIT_STDIN;
4391                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4392         }
4393
4394         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4395         if (r < 0) {
4396                 *exit_status = EXIT_STDOUT;
4397                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4398         }
4399
4400         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4401         if (r < 0) {
4402                 *exit_status = EXIT_STDERR;
4403                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4404         }
4405
4406         if (context->oom_score_adjust_set) {
4407                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4408                  * prohibit write access to this file, and we shouldn't trip up over that. */
4409                 r = set_oom_score_adjust(context->oom_score_adjust);
4410                 if (ERRNO_IS_PRIVILEGE(r))
4411                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4412                 else if (r < 0) {
4413                         *exit_status = EXIT_OOM_ADJUST;
4414                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4415                 }
4416         }
4417
4418         if (context->coredump_filter_set) {
4419                 r = set_coredump_filter(context->coredump_filter);
4420                 if (ERRNO_IS_PRIVILEGE(r))
4421                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4422                 else if (r < 0)
4423                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4424         }
4425
4426         if (context->nice_set) {
4427                 r = setpriority_closest(context->nice);
4428                 if (r < 0)
4429                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4430         }
4431
4432         if (context->cpu_sched_set) {
4433                 struct sched_param param = {
4434                         .sched_priority = context->cpu_sched_priority,
4435                 };
4436
4437                 r = sched_setscheduler(0,
4438                                        context->cpu_sched_policy |
4439                                        (context->cpu_sched_reset_on_fork ?
4440                                         SCHED_RESET_ON_FORK : 0),
4441                                        &param);
4442                 if (r < 0) {
4443                         *exit_status = EXIT_SETSCHEDULER;
4444                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4445                 }
4446         }
4447
4448         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4449                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4450                 const CPUSet *cpu_set;
4451
4452                 if (context->cpu_affinity_from_numa) {
4453                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4454                         if (r < 0) {
4455                                 *exit_status = EXIT_CPUAFFINITY;
4456                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4457                         }
4458
4459                         cpu_set = &converted_cpu_set;
4460                 } else
4461                         cpu_set = &context->cpu_set;
4462
4463                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4464                         *exit_status = EXIT_CPUAFFINITY;
4465                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4466                 }
4467         }
4468
4469         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4470                 r = apply_numa_policy(&context->numa_policy);
4471                 if (r == -EOPNOTSUPP)
4472                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4473                 else if (r < 0) {
4474                         *exit_status = EXIT_NUMA_POLICY;
4475                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4476                 }
4477         }
4478
4479         if (context->ioprio_set)
4480                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4481                         *exit_status = EXIT_IOPRIO;
4482                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4483                 }
4484
4485         if (context->timer_slack_nsec != NSEC_INFINITY)
4486                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4487                         *exit_status = EXIT_TIMERSLACK;
4488                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4489                 }
4490
4491         if (context->personality != PERSONALITY_INVALID) {
4492                 r = safe_personality(context->personality);
4493                 if (r < 0) {
4494                         *exit_status = EXIT_PERSONALITY;
4495                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4496                 }
4497         }
4498
4499         if (context->utmp_id) {
4500                 const char *line = context->tty_path ?
4501                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4502                         NULL;
4503                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4504                                       line,
4505                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4506                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4507                                       USER_PROCESS,
4508                                       username);
4509         }
4510
4511         if (uid_is_valid(uid)) {
4512                 r = chown_terminal(STDIN_FILENO, uid);
4513                 if (r < 0) {
4514                         *exit_status = EXIT_STDIN;
4515                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4516                 }
4517         }
4518
4519         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4520          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4521          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4522          * touch a single hierarchy too. */
4523         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4524                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4525                 if (r < 0) {
4526                         *exit_status = EXIT_CGROUP;
4527                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4528                 }
4529         }
4530
4531         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4532
4533         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4534                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4535                 if (r < 0)
4536                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4537         }
4538
4539         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4540                 r = setup_credentials(context, params, unit->id, uid);
4541                 if (r < 0) {
4542                         *exit_status = EXIT_CREDENTIALS;
4543                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4544                 }
4545         }
4546
4547         r = build_environment(
4548                         unit,
4549                         context,
4550                         params,
4551                         n_fds,
4552                         home,
4553                         username,
4554                         shell,
4555                         journal_stream_dev,
4556                         journal_stream_ino,
4557                         &our_env);
4558         if (r < 0) {
4559                 *exit_status = EXIT_MEMORY;
4560                 return log_oom();
4561         }
4562
4563         r = build_pass_environment(context, &pass_env);
4564         if (r < 0) {
4565                 *exit_status = EXIT_MEMORY;
4566                 return log_oom();
4567         }
4568
4569         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4570          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4571          * not specify PATH but the unit has ExecSearchPath. */
4572         if (!strv_isempty(context->exec_search_path)) {
4573                 _cleanup_free_ char *joined = NULL;
4574
4575                 joined = strv_join(context->exec_search_path, ":");
4576                 if (!joined) {
4577                         *exit_status = EXIT_MEMORY;
4578                         return log_oom();
4579                 }
4580
4581                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4582                 if (r < 0) {
4583                         *exit_status = EXIT_MEMORY;
4584                         return log_oom();
4585                 }
4586         }
4587
4588         accum_env = strv_env_merge(params->environment,
4589                                    our_env,
4590                                    joined_exec_search_path,
4591                                    pass_env,
4592                                    context->environment,
4593                                    files_env);
4594         if (!accum_env) {
4595                 *exit_status = EXIT_MEMORY;
4596                 return log_oom();
4597         }
4598         accum_env = strv_env_clean(accum_env);
4599
4600         (void) umask(context->umask);
4601
4602         r = setup_keyring(unit, context, params, uid, gid);
4603         if (r < 0) {
4604                 *exit_status = EXIT_KEYRING;
4605                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4606         }
4607
4608         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4609          * from it. */
4610         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4611
4612         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4613          * for it, and the kernel doesn't actually support ambient caps. */
4614         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4615
4616         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4617          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4618          * desired. */
4619         if (needs_ambient_hack)
4620                 needs_setuid = false;
4621         else
4622                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4623
4624         if (needs_sandboxing) {
4625                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4626                  * /sys being present. The actual MAC context application will happen later, as late as
4627                  * possible, to avoid impacting our own code paths. */
4628
4629 #if HAVE_SELINUX
4630                 use_selinux = mac_selinux_use();
4631 #endif
4632 #if ENABLE_SMACK
4633                 use_smack = mac_smack_use();
4634 #endif
4635 #if HAVE_APPARMOR
4636                 use_apparmor = mac_apparmor_use();
4637 #endif
4638         }
4639
4640         if (needs_sandboxing) {
4641                 int which_failed;
4642
4643                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4644                  * is set here. (See below.) */
4645
4646                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4647                 if (r < 0) {
4648                         *exit_status = EXIT_LIMITS;
4649                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4650                 }
4651         }
4652
4653         if (needs_setuid && context->pam_name && username) {
4654                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4655                  * wins here. (See above.) */
4656
4657                 /* All fds passed in the fds array will be closed in the pam child process. */
4658                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4659                 if (r < 0) {
4660                         *exit_status = EXIT_PAM;
4661                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4662                 }
4663
4664                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4665                 if (ngids_after_pam < 0) {
4666                         *exit_status = EXIT_MEMORY;
4667                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4668                 }
4669         }
4670
4671         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4672                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4673                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4674                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4675
4676                 userns_set_up = true;
4677                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4678                 if (r < 0) {
4679                         *exit_status = EXIT_USER;
4680                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4681                 }
4682         }
4683
4684         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4685
4686                 if (ns_type_supported(NAMESPACE_NET)) {
4687                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4688                         if (r == -EPERM)
4689                                 log_unit_warning_errno(unit, r,
4690                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4691                         else if (r < 0) {
4692                                 *exit_status = EXIT_NETWORK;
4693                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4694                         }
4695                 } else if (context->network_namespace_path) {
4696                         *exit_status = EXIT_NETWORK;
4697                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4698                                                     "NetworkNamespacePath= is not supported, refusing.");
4699                 } else
4700                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4701         }
4702
4703         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4704
4705                 if (ns_type_supported(NAMESPACE_IPC)) {
4706                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4707                         if (r == -EPERM)
4708                                 log_unit_warning_errno(unit, r,
4709                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4710                         else if (r < 0) {
4711                                 *exit_status = EXIT_NAMESPACE;
4712                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4713                         }
4714                 } else if (context->ipc_namespace_path) {
4715                         *exit_status = EXIT_NAMESPACE;
4716                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4717                                                     "IPCNamespacePath= is not supported, refusing.");
4718                 } else
4719                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4720         }
4721
4722         if (needs_mount_namespace) {
4723                 _cleanup_free_ char *error_path = NULL;
4724
4725                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4726                 if (r < 0) {
4727                         *exit_status = EXIT_NAMESPACE;
4728                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4729                                                     error_path ? ": " : "", strempty(error_path));
4730                 }
4731         }
4732
4733         if (needs_sandboxing) {
4734                 r = apply_protect_hostname(unit, context, exit_status);
4735                 if (r < 0)
4736                         return r;
4737         }
4738
4739         /* Drop groups as early as possible.
4740          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4741          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4742         if (needs_setuid) {
4743                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4744                 int ngids_to_enforce = 0;
4745
4746                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4747                                                    ngids,
4748                                                    gids_after_pam,
4749                                                    ngids_after_pam,
4750                                                    &gids_to_enforce);
4751                 if (ngids_to_enforce < 0) {
4752                         *exit_status = EXIT_MEMORY;
4753                         return log_unit_error_errno(unit,
4754                                                     ngids_to_enforce,
4755                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4756                 }
4757
4758                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4759                 if (r < 0) {
4760                         *exit_status = EXIT_GROUP;
4761                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4762                 }
4763         }
4764
4765         /* If the user namespace was not set up above, try to do it now.
4766          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4767          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4768          * case of mount namespaces being less privileged when the mount point list is copied from a
4769          * different user namespace). */
4770
4771         if (needs_sandboxing && context->private_users && !userns_set_up) {
4772                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4773                 if (r < 0) {
4774                         *exit_status = EXIT_USER;
4775                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4776                 }
4777         }
4778
4779         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4780          * shall execute. */
4781
4782         _cleanup_free_ char *executable = NULL;
4783         _cleanup_close_ int executable_fd = -1;
4784         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4785         if (r < 0) {
4786                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4787                         log_unit_struct_errno(unit, LOG_INFO, r,
4788                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4789                                               LOG_UNIT_INVOCATION_ID(unit),
4790                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4791                                                                command->path),
4792                                               "EXECUTABLE=%s", command->path);
4793                         return 0;
4794                 }
4795
4796                 *exit_status = EXIT_EXEC;
4797
4798                 return log_unit_struct_errno(unit, LOG_INFO, r,
4799                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4800                                              LOG_UNIT_INVOCATION_ID(unit),
4801                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4802                                                               command->path),
4803                                              "EXECUTABLE=%s", command->path);
4804         }
4805
4806         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4807         if (r < 0) {
4808                 *exit_status = EXIT_FDS;
4809                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4810         }
4811
4812 #if HAVE_SELINUX
4813         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4814                 int fd = -1;
4815
4816                 if (socket_fd >= 0)
4817                         fd = socket_fd;
4818                 else if (params->n_socket_fds == 1)
4819                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4820                          * use context from that fd to compute the label. */
4821                         fd = params->fds[0];
4822
4823                 if (fd >= 0) {
4824                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4825                         if (r < 0) {
4826                                 if (!context->selinux_context_ignore) {
4827                                         *exit_status = EXIT_SELINUX_CONTEXT;
4828                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4829                                 }
4830                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4831                         }
4832                 }
4833         }
4834 #endif
4835
4836         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4837          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4838          * however if we have it as we want to keep it open until the final execve(). */
4839
4840         r = close_all_fds(keep_fds, n_keep_fds);
4841         if (r >= 0)
4842                 r = shift_fds(fds, n_fds);
4843         if (r >= 0)
4844                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4845         if (r < 0) {
4846                 *exit_status = EXIT_FDS;
4847                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4848         }
4849
4850         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4851          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4852          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4853          * came this far. */
4854
4855         secure_bits = context->secure_bits;
4856
4857         if (needs_sandboxing) {
4858                 uint64_t bset;
4859
4860                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4861                  * requested. (Note this is placed after the general resource limit initialization, see
4862                  * above, in order to take precedence.) */
4863                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4864                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4865                                 *exit_status = EXIT_LIMITS;
4866                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4867                         }
4868                 }
4869
4870 #if ENABLE_SMACK
4871                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4872                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4873                 if (use_smack) {
4874                         r = setup_smack(unit->manager, context, executable_fd);
4875                         if (r < 0 && !context->smack_process_label_ignore) {
4876                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4877                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4878                         }
4879                 }
4880 #endif
4881
4882                 bset = context->capability_bounding_set;
4883                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4884                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4885                  * instead of us doing that */
4886                 if (needs_ambient_hack)
4887                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4888                                 (UINT64_C(1) << CAP_SETUID) |
4889                                 (UINT64_C(1) << CAP_SETGID);
4890
4891                 if (!cap_test_all(bset)) {
4892                         r = capability_bounding_set_drop(bset, false);
4893                         if (r < 0) {
4894                                 *exit_status = EXIT_CAPABILITIES;
4895                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4896                         }
4897                 }
4898
4899                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4900                  * keep-caps set.
4901                  * To be able to raise the ambient capabilities after setresuid() they have to be
4902                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4903                  * After setresuid() the ambient capabilities can be raised as they are present in
4904                  * the permitted and inheritable set. However it is possible that someone wants to
4905                  * set ambient capabilities without changing the user, so we also set the ambient
4906                  * capabilities here.
4907                  * The requested ambient capabilities are raised in the inheritable set if the
4908                  * second argument is true. */
4909                 if (!needs_ambient_hack) {
4910                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4911                         if (r < 0) {
4912                                 *exit_status = EXIT_CAPABILITIES;
4913                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4914                         }
4915                 }
4916         }
4917
4918         /* chroot to root directory first, before we lose the ability to chroot */
4919         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4920         if (r < 0)
4921                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4922
4923         if (needs_setuid) {
4924                 if (uid_is_valid(uid)) {
4925                         r = enforce_user(context, uid);
4926                         if (r < 0) {
4927                                 *exit_status = EXIT_USER;
4928                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4929                         }
4930
4931                         if (!needs_ambient_hack &&
4932                             context->capability_ambient_set != 0) {
4933
4934                                 /* Raise the ambient capabilities after user change. */
4935                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4936                                 if (r < 0) {
4937                                         *exit_status = EXIT_CAPABILITIES;
4938                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4939                                 }
4940                         }
4941                 }
4942         }
4943
4944         /* Apply working directory here, because the working directory might be on NFS and only the user running
4945          * this service might have the correct privilege to change to the working directory */
4946         r = apply_working_directory(context, params, home, exit_status);
4947         if (r < 0)
4948                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4949
4950         if (needs_sandboxing) {
4951                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4952                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4953                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4954                  * are restricted. */
4955
4956 #if HAVE_SELINUX
4957                 if (use_selinux) {
4958                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4959
4960                         if (exec_context) {
4961                                 r = setexeccon(exec_context);
4962                                 if (r < 0) {
4963                                         if (!context->selinux_context_ignore) {
4964                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4965                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4966                                         }
4967                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4968                                 }
4969                         }
4970                 }
4971 #endif
4972
4973 #if HAVE_APPARMOR
4974                 if (use_apparmor && context->apparmor_profile) {
4975                         r = aa_change_onexec(context->apparmor_profile);
4976                         if (r < 0 && !context->apparmor_profile_ignore) {
4977                                 *exit_status = EXIT_APPARMOR_PROFILE;
4978                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4979                         }
4980                 }
4981 #endif
4982
4983                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4984                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4985                  * CAP_SETPCAP. */
4986                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4987                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4988                          * effective set here.
4989                          * The effective set is overwritten during execve with the following values:
4990                          * - ambient set (for non-root processes)
4991                          * - (inheritable | bounding) set (for root processes)
4992                          *
4993                          * Hence there is no security impact to raise it in the effective set before execve
4994                          */
4995                         r = capability_gain_cap_setpcap(NULL);
4996                         if (r < 0) {
4997                                 *exit_status = EXIT_CAPABILITIES;
4998                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4999                         }
5000                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5001                                 *exit_status = EXIT_SECUREBITS;
5002                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5003                         }
5004                 }
5005
5006                 if (context_has_no_new_privileges(context))
5007                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5008                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5009                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5010                         }
5011
5012 #if HAVE_SECCOMP
5013                 r = apply_address_families(unit, context);
5014                 if (r < 0) {
5015                         *exit_status = EXIT_ADDRESS_FAMILIES;
5016                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5017                 }
5018
5019                 r = apply_memory_deny_write_execute(unit, context);
5020                 if (r < 0) {
5021                         *exit_status = EXIT_SECCOMP;
5022                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5023                 }
5024
5025                 r = apply_restrict_realtime(unit, context);
5026                 if (r < 0) {
5027                         *exit_status = EXIT_SECCOMP;
5028                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5029                 }
5030
5031                 r = apply_restrict_suid_sgid(unit, context);
5032                 if (r < 0) {
5033                         *exit_status = EXIT_SECCOMP;
5034                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5035                 }
5036
5037                 r = apply_restrict_namespaces(unit, context);
5038                 if (r < 0) {
5039                         *exit_status = EXIT_SECCOMP;
5040                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5041                 }
5042
5043                 r = apply_protect_sysctl(unit, context);
5044                 if (r < 0) {
5045                         *exit_status = EXIT_SECCOMP;
5046                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5047                 }
5048
5049                 r = apply_protect_kernel_modules(unit, context);
5050                 if (r < 0) {
5051                         *exit_status = EXIT_SECCOMP;
5052                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5053                 }
5054
5055                 r = apply_protect_kernel_logs(unit, context);
5056                 if (r < 0) {
5057                         *exit_status = EXIT_SECCOMP;
5058                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5059                 }
5060
5061                 r = apply_protect_clock(unit, context);
5062                 if (r < 0) {
5063                         *exit_status = EXIT_SECCOMP;
5064                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5065                 }
5066
5067                 r = apply_private_devices(unit, context);
5068                 if (r < 0) {
5069                         *exit_status = EXIT_SECCOMP;
5070                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5071                 }
5072
5073                 r = apply_syscall_archs(unit, context);
5074                 if (r < 0) {
5075                         *exit_status = EXIT_SECCOMP;
5076                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5077                 }
5078
5079                 r = apply_lock_personality(unit, context);
5080                 if (r < 0) {
5081                         *exit_status = EXIT_SECCOMP;
5082                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5083                 }
5084
5085                 r = apply_syscall_log(unit, context);
5086                 if (r < 0) {
5087                         *exit_status = EXIT_SECCOMP;
5088                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5089                 }
5090
5091                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5092                  * by the filter as little as possible. */
5093                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5094                 if (r < 0) {
5095                         *exit_status = EXIT_SECCOMP;
5096                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5097                 }
5098 #endif
5099
5100 #if HAVE_LIBBPF
5101                 r = apply_restrict_filesystems(unit, context);
5102                 if (r < 0) {
5103                         *exit_status = EXIT_BPF;
5104                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5105                 }
5106 #endif
5107
5108         }
5109
5110         if (!strv_isempty(context->unset_environment)) {
5111                 char **ee = NULL;
5112
5113                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5114                 if (!ee) {
5115                         *exit_status = EXIT_MEMORY;
5116                         return log_oom();
5117                 }
5118
5119                 strv_free_and_replace(accum_env, ee);
5120         }
5121
5122         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5123                 replaced_argv = replace_env_argv(command->argv, accum_env);
5124                 if (!replaced_argv) {
5125                         *exit_status = EXIT_MEMORY;
5126                         return log_oom();
5127                 }
5128                 final_argv = replaced_argv;
5129         } else
5130                 final_argv = command->argv;
5131
5132         if (DEBUG_LOGGING) {
5133                 _cleanup_free_ char *line = NULL;
5134
5135                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5136                 if (!line) {
5137                         *exit_status = EXIT_MEMORY;
5138                         return log_oom();
5139                 }
5140
5141                 log_unit_struct(unit, LOG_DEBUG,
5142                                 "EXECUTABLE=%s", executable,
5143                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5144         }
5145
5146         if (exec_fd >= 0) {
5147                 uint8_t hot = 1;
5148
5149                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5150                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5151
5152                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5153                         *exit_status = EXIT_EXEC;
5154                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5155                 }
5156         }
5157
5158         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5159
5160         if (exec_fd >= 0) {
5161                 uint8_t hot = 0;
5162
5163                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5164                  * that POLLHUP on it no longer means execve() succeeded. */
5165
5166                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5167                         *exit_status = EXIT_EXEC;
5168                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5169                 }
5170         }
5171
5172         *exit_status = EXIT_EXEC;
5173         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5174 }
5175
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() right below
 * already calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5178
5179 int exec_spawn(Unit *unit,
5180                ExecCommand *command,
5181                const ExecContext *context,
5182                const ExecParameters *params,
5183                ExecRuntime *runtime,
5184                DynamicCreds *dcreds,
5185                pid_t *ret) {
5186
5187         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5188         _cleanup_free_ char *subcgroup_path = NULL;
5189         _cleanup_strv_free_ char **files_env = NULL;
5190         size_t n_storage_fds = 0, n_socket_fds = 0;
5191         _cleanup_free_ char *line = NULL;
5192         pid_t pid;
5193
5194         assert(unit);
5195         assert(command);
5196         assert(context);
5197         assert(ret);
5198         assert(params);
5199         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5200
5201         if (context->std_input == EXEC_INPUT_SOCKET ||
5202             context->std_output == EXEC_OUTPUT_SOCKET ||
5203             context->std_error == EXEC_OUTPUT_SOCKET) {
5204
5205                 if (params->n_socket_fds > 1)
5206                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5207
5208                 if (params->n_socket_fds == 0)
5209                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5210
5211                 socket_fd = params->fds[0];
5212         } else {
5213                 socket_fd = -1;
5214                 fds = params->fds;
5215                 n_socket_fds = params->n_socket_fds;
5216                 n_storage_fds = params->n_storage_fds;
5217         }
5218
5219         r = exec_context_named_iofds(context, params, named_iofds);
5220         if (r < 0)
5221                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5222
5223         r = exec_context_load_environment(unit, context, &files_env);
5224         if (r < 0)
5225                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5226
5227         line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5228         if (!line)
5229                 return log_oom();
5230
5231         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5232            and, until the next SELinux policy changes, we save further reloads in future children. */
5233         mac_selinux_maybe_reload();
5234
5235         log_unit_struct(unit, LOG_DEBUG,
5236                         LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5237                         "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5238                                                            the mount namespace in the child, but we want to log
5239                                                            from the parent, so we need to use the (possibly
5240                                                            inaccurate) path here. */
5241                         LOG_UNIT_INVOCATION_ID(unit));
5242
5243         if (params->cgroup_path) {
5244                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5245                 if (r < 0)
5246                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5247                 if (r > 0) { /* We are using a child cgroup */
5248                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5249                         if (r < 0)
5250                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5251
5252                         /* Normally we would not propagate the oomd xattrs to children but since we created this
5253                          * sub-cgroup internally we should do it. */
5254                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
5255                 }
5256         }
5257
5258         pid = fork();
5259         if (pid < 0)
5260                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5261
5262         if (pid == 0) {
5263                 int exit_status = EXIT_SUCCESS;
5264
5265                 r = exec_child(unit,
5266                                command,
5267                                context,
5268                                params,
5269                                runtime,
5270                                dcreds,
5271                                socket_fd,
5272                                named_iofds,
5273                                fds,
5274                                n_socket_fds,
5275                                n_storage_fds,
5276                                files_env,
5277                                unit->manager->user_lookup_fds[1],
5278                                &exit_status);
5279
5280                 if (r < 0) {
5281                         const char *status =
5282                                 exit_status_to_string(exit_status,
5283                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5284
5285                         log_unit_struct_errno(unit, LOG_ERR, r,
5286                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5287                                               LOG_UNIT_INVOCATION_ID(unit),
5288                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5289                                                                status, command->path),
5290                                               "EXECUTABLE=%s", command->path);
5291                 }
5292
5293                 _exit(exit_status);
5294         }
5295
5296         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5297
5298         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5299          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5300          * process will be killed too). */
5301         if (subcgroup_path)
5302                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5303
5304         exec_status_start(&command->exec_status, pid);
5305
5306         *ret = pid;
5307         return 0;
5308 }
5309
5310 void exec_context_init(ExecContext *c) {
5311         assert(c);
5312
5313         c->umask = 0022;
5314         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5315         c->cpu_sched_policy = SCHED_OTHER;
5316         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5317         c->syslog_level_prefix = true;
5318         c->ignore_sigpipe = true;
5319         c->timer_slack_nsec = NSEC_INFINITY;
5320         c->personality = PERSONALITY_INVALID;
5321         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5322                 c->directories[t].mode = 0755;
5323         c->timeout_clean_usec = USEC_INFINITY;
5324         c->capability_bounding_set = CAP_ALL;
5325         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5326         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5327         c->log_level_max = -1;
5328 #if HAVE_SECCOMP
5329         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5330 #endif
5331         c->tty_rows = UINT_MAX;
5332         c->tty_cols = UINT_MAX;
5333         numa_policy_reset(&c->numa_policy);
5334 }
5335
5336 void exec_context_done(ExecContext *c) {
5337         assert(c);
5338
5339         c->environment = strv_free(c->environment);
5340         c->environment_files = strv_free(c->environment_files);
5341         c->pass_environment = strv_free(c->pass_environment);
5342         c->unset_environment = strv_free(c->unset_environment);
5343
5344         rlimit_free_all(c->rlimit);
5345
5346         for (size_t l = 0; l < 3; l++) {
5347                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5348                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5349         }
5350
5351         c->working_directory = mfree(c->working_directory);
5352         c->root_directory = mfree(c->root_directory);
5353         c->root_image = mfree(c->root_image);
5354         c->root_image_options = mount_options_free_all(c->root_image_options);
5355         c->root_hash = mfree(c->root_hash);
5356         c->root_hash_size = 0;
5357         c->root_hash_path = mfree(c->root_hash_path);
5358         c->root_hash_sig = mfree(c->root_hash_sig);
5359         c->root_hash_sig_size = 0;
5360         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5361         c->root_verity = mfree(c->root_verity);
5362         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5363         c->extension_directories = strv_free(c->extension_directories);
5364         c->tty_path = mfree(c->tty_path);
5365         c->syslog_identifier = mfree(c->syslog_identifier);
5366         c->user = mfree(c->user);
5367         c->group = mfree(c->group);
5368
5369         c->supplementary_groups = strv_free(c->supplementary_groups);
5370
5371         c->pam_name = mfree(c->pam_name);
5372
5373         c->read_only_paths = strv_free(c->read_only_paths);
5374         c->read_write_paths = strv_free(c->read_write_paths);
5375         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5376         c->exec_paths = strv_free(c->exec_paths);
5377         c->no_exec_paths = strv_free(c->no_exec_paths);
5378         c->exec_search_path = strv_free(c->exec_search_path);
5379
5380         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5381         c->bind_mounts = NULL;
5382         c->n_bind_mounts = 0;
5383         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5384         c->temporary_filesystems = NULL;
5385         c->n_temporary_filesystems = 0;
5386         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5387
5388         cpu_set_reset(&c->cpu_set);
5389         numa_policy_reset(&c->numa_policy);
5390
5391         c->utmp_id = mfree(c->utmp_id);
5392         c->selinux_context = mfree(c->selinux_context);
5393         c->apparmor_profile = mfree(c->apparmor_profile);
5394         c->smack_process_label = mfree(c->smack_process_label);
5395
5396         c->restrict_filesystems = set_free(c->restrict_filesystems);
5397
5398         c->syscall_filter = hashmap_free(c->syscall_filter);
5399         c->syscall_archs = set_free(c->syscall_archs);
5400         c->address_families = set_free(c->address_families);
5401
5402         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5403                 exec_directory_done(&c->directories[t]);
5404
5405         c->log_level_max = -1;
5406
5407         exec_context_free_log_extra_fields(c);
5408
5409         c->log_ratelimit_interval_usec = 0;
5410         c->log_ratelimit_burst = 0;
5411
5412         c->stdin_data = mfree(c->stdin_data);
5413         c->stdin_data_size = 0;
5414
5415         c->network_namespace_path = mfree(c->network_namespace_path);
5416         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5417
5418         c->log_namespace = mfree(c->log_namespace);
5419
5420         c->load_credentials = hashmap_free(c->load_credentials);
5421         c->set_credentials = hashmap_free(c->set_credentials);
5422 }
5423
5424 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5425         assert(c);
5426
5427         if (!runtime_prefix)
5428                 return 0;
5429
5430         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5431                 _cleanup_free_ char *p = NULL;
5432
5433                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5434                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5435                 else
5436                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5437                 if (!p)
5438                         return -ENOMEM;
5439
5440                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5441                  * service next. */
5442                 (void) rm_rf(p, REMOVE_ROOT);
5443
5444                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5445                         _cleanup_free_ char *symlink_abs = NULL;
5446
5447                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5448                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5449                         else
5450                                 symlink_abs = path_join(runtime_prefix, *symlink);
5451                         if (!symlink_abs)
5452                                 return -ENOMEM;
5453
5454                         (void) unlink(symlink_abs);
5455                 }
5456
5457         }
5458
5459         return 0;
5460 }
5461
5462 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5463         _cleanup_free_ char *p = NULL;
5464
5465         assert(c);
5466
5467         if (!runtime_prefix || !unit)
5468                 return 0;
5469
5470         p = path_join(runtime_prefix, "credentials", unit);
5471         if (!p)
5472                 return -ENOMEM;
5473
5474         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5475          * unmount it, and afterwards remove the mount point */
5476         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5477         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5478
5479         return 0;
5480 }
5481
5482 static void exec_command_done(ExecCommand *c) {
5483         assert(c);
5484
5485         c->path = mfree(c->path);
5486         c->argv = strv_free(c->argv);
5487 }
5488
5489 void exec_command_done_array(ExecCommand *c, size_t n) {
5490         for (size_t i = 0; i < n; i++)
5491                 exec_command_done(c+i);
5492 }
5493
5494 ExecCommand* exec_command_free_list(ExecCommand *c) {
5495         ExecCommand *i;
5496
5497         while ((i = c)) {
5498                 LIST_REMOVE(command, c, i);
5499                 exec_command_done(i);
5500                 free(i);
5501         }
5502
5503         return NULL;
5504 }
5505
5506 void exec_command_free_array(ExecCommand **c, size_t n) {
5507         for (size_t i = 0; i < n; i++)
5508                 c[i] = exec_command_free_list(c[i]);
5509 }
5510
5511 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5512         for (size_t i = 0; i < n; i++)
5513                 exec_status_reset(&c[i].exec_status);
5514 }
5515
5516 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5517         for (size_t i = 0; i < n; i++)
5518                 LIST_FOREACH(command, z, c[i])
5519                         exec_status_reset(&z->exec_status);
5520 }
5521
/* Context handed to invalid_env() as its 'userdata' argument. */
typedef struct InvalidEnvInfo {
        const Unit *unit;       /* unit the message is logged for */
        const char *path;       /* printed after the invalid assignment; the environment file's path */
} InvalidEnvInfo;
5526
5527 static void invalid_env(const char *p, void *userdata) {
5528         InvalidEnvInfo *info = userdata;
5529
5530         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5531 }
5532
5533 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5534         assert(c);
5535
5536         switch (fd_index) {
5537
5538         case STDIN_FILENO:
5539                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5540                         return NULL;
5541
5542                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5543
5544         case STDOUT_FILENO:
5545                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5546                         return NULL;
5547
5548                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5549
5550         case STDERR_FILENO:
5551                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5552                         return NULL;
5553
5554                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5555
5556         default:
5557                 return NULL;
5558         }
5559 }
5560
5561 static int exec_context_named_iofds(
5562                 const ExecContext *c,
5563                 const ExecParameters *p,
5564                 int named_iofds[static 3]) {
5565
5566         size_t targets;
5567         const char* stdio_fdname[3];
5568         size_t n_fds;
5569
5570         assert(c);
5571         assert(p);
5572         assert(named_iofds);
5573
5574         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5575                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5576                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5577
5578         for (size_t i = 0; i < 3; i++)
5579                 stdio_fdname[i] = exec_context_fdname(c, i);
5580
5581         n_fds = p->n_storage_fds + p->n_socket_fds;
5582
5583         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5584                 if (named_iofds[STDIN_FILENO] < 0 &&
5585                     c->std_input == EXEC_INPUT_NAMED_FD &&
5586                     stdio_fdname[STDIN_FILENO] &&
5587                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5588
5589                         named_iofds[STDIN_FILENO] = p->fds[i];
5590                         targets--;
5591
5592                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5593                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5594                            stdio_fdname[STDOUT_FILENO] &&
5595                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5596
5597                         named_iofds[STDOUT_FILENO] = p->fds[i];
5598                         targets--;
5599
5600                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5601                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5602                            stdio_fdname[STDERR_FILENO] &&
5603                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5604
5605                         named_iofds[STDERR_FILENO] = p->fds[i];
5606                         targets--;
5607                 }
5608
5609         return targets == 0 ? 0 : -ENOENT;
5610 }
5611
5612 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5613         _cleanup_strv_free_ char **v = NULL;
5614         int r;
5615
5616         assert(c);
5617         assert(ret);
5618
5619         STRV_FOREACH(i, c->environment_files) {
5620                 _cleanup_globfree_ glob_t pglob = {};
5621                 bool ignore = false;
5622                 char *fn = *i;
5623
5624                 if (fn[0] == '-') {
5625                         ignore = true;
5626                         fn++;
5627                 }
5628
5629                 if (!path_is_absolute(fn)) {
5630                         if (ignore)
5631                                 continue;
5632                         return -EINVAL;
5633                 }
5634
5635                 /* Filename supports globbing, take all matching files */
5636                 r = safe_glob(fn, 0, &pglob);
5637                 if (r < 0) {
5638                         if (ignore)
5639                                 continue;
5640                         return r;
5641                 }
5642
5643                 /* When we don't match anything, -ENOENT should be returned */
5644                 assert(pglob.gl_pathc > 0);
5645
5646                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5647                         _cleanup_strv_free_ char **p = NULL;
5648
5649                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5650                         if (r < 0) {
5651                                 if (ignore)
5652                                         continue;
5653                                 return r;
5654                         }
5655
5656                         /* Log invalid environment variables with filename */
5657                         if (p) {
5658                                 InvalidEnvInfo info = {
5659                                         .unit = unit,
5660                                         .path = pglob.gl_pathv[n]
5661                                 };
5662
5663                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5664                         }
5665
5666                         if (!v)
5667                                 v = TAKE_PTR(p);
5668                         else {
5669                                 char **m = strv_env_merge(v, p);
5670                                 if (!m)
5671                                         return -ENOMEM;
5672
5673                                 strv_free_and_replace(v, m);
5674                         }
5675                 }
5676         }
5677
5678         *ret = TAKE_PTR(v);
5679
5680         return 0;
5681 }
5682
5683 static bool tty_may_match_dev_console(const char *tty) {
5684         _cleanup_free_ char *resolved = NULL;
5685
5686         if (!tty)
5687                 return true;
5688
5689         tty = skip_dev_prefix(tty);
5690
5691         /* trivial identity? */
5692         if (streq(tty, "console"))
5693                 return true;
5694
5695         if (resolve_dev_console(&resolved) < 0)
5696                 return true; /* if we could not resolve, assume it may */
5697
5698         /* "tty0" means the active VC, so it may be the same sometimes */
5699         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5700 }
5701
5702 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5703         assert(ec);
5704
5705         return ec->tty_reset ||
5706                 ec->tty_vhangup ||
5707                 ec->tty_vt_disallocate ||
5708                 is_terminal_input(ec->std_input) ||
5709                 is_terminal_output(ec->std_output) ||
5710                 is_terminal_output(ec->std_error);
5711 }
5712
5713 bool exec_context_may_touch_console(const ExecContext *ec) {
5714
5715         return exec_context_may_touch_tty(ec) &&
5716                tty_may_match_dev_console(exec_context_tty_path(ec));
5717 }
5718
5719 static void strv_fprintf(FILE *f, char **l) {
5720         assert(f);
5721
5722         STRV_FOREACH(g, l)
5723                 fprintf(f, " %s", *g);
5724 }
5725
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing at all is written if
 * 'strv' is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5737
5738 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5739         int r;
5740
5741         assert(c);
5742         assert(f);
5743
5744         prefix = strempty(prefix);
5745
5746         fprintf(f,
5747                 "%sUMask: %04o\n"
5748                 "%sWorkingDirectory: %s\n"
5749                 "%sRootDirectory: %s\n"
5750                 "%sNonBlocking: %s\n"
5751                 "%sPrivateTmp: %s\n"
5752                 "%sPrivateDevices: %s\n"
5753                 "%sProtectKernelTunables: %s\n"
5754                 "%sProtectKernelModules: %s\n"
5755                 "%sProtectKernelLogs: %s\n"
5756                 "%sProtectClock: %s\n"
5757                 "%sProtectControlGroups: %s\n"
5758                 "%sPrivateNetwork: %s\n"
5759                 "%sPrivateUsers: %s\n"
5760                 "%sProtectHome: %s\n"
5761                 "%sProtectSystem: %s\n"
5762                 "%sMountAPIVFS: %s\n"
5763                 "%sIgnoreSIGPIPE: %s\n"
5764                 "%sMemoryDenyWriteExecute: %s\n"
5765                 "%sRestrictRealtime: %s\n"
5766                 "%sRestrictSUIDSGID: %s\n"
5767                 "%sKeyringMode: %s\n"
5768                 "%sProtectHostname: %s\n"
5769                 "%sProtectProc: %s\n"
5770                 "%sProcSubset: %s\n",
5771                 prefix, c->umask,
5772                 prefix, empty_to_root(c->working_directory),
5773                 prefix, empty_to_root(c->root_directory),
5774                 prefix, yes_no(c->non_blocking),
5775                 prefix, yes_no(c->private_tmp),
5776                 prefix, yes_no(c->private_devices),
5777                 prefix, yes_no(c->protect_kernel_tunables),
5778                 prefix, yes_no(c->protect_kernel_modules),
5779                 prefix, yes_no(c->protect_kernel_logs),
5780                 prefix, yes_no(c->protect_clock),
5781                 prefix, yes_no(c->protect_control_groups),
5782                 prefix, yes_no(c->private_network),
5783                 prefix, yes_no(c->private_users),
5784                 prefix, protect_home_to_string(c->protect_home),
5785                 prefix, protect_system_to_string(c->protect_system),
5786                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5787                 prefix, yes_no(c->ignore_sigpipe),
5788                 prefix, yes_no(c->memory_deny_write_execute),
5789                 prefix, yes_no(c->restrict_realtime),
5790                 prefix, yes_no(c->restrict_suid_sgid),
5791                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5792                 prefix, yes_no(c->protect_hostname),
5793                 prefix, protect_proc_to_string(c->protect_proc),
5794                 prefix, proc_subset_to_string(c->proc_subset));
5795
5796         if (c->root_image)
5797                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5798
5799         if (c->root_image_options) {
5800                 fprintf(f, "%sRootImageOptions:", prefix);
5801                 LIST_FOREACH(mount_options, o, c->root_image_options)
5802                         if (!isempty(o->options))
5803                                 fprintf(f, " %s:%s",
5804                                         partition_designator_to_string(o->partition_designator),
5805                                         o->options);
5806                 fprintf(f, "\n");
5807         }
5808
5809         if (c->root_hash) {
5810                 _cleanup_free_ char *encoded = NULL;
5811                 encoded = hexmem(c->root_hash, c->root_hash_size);
5812                 if (encoded)
5813                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5814         }
5815
5816         if (c->root_hash_path)
5817                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5818
5819         if (c->root_hash_sig) {
5820                 _cleanup_free_ char *encoded = NULL;
5821                 ssize_t len;
5822                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5823                 if (len)
5824                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5825         }
5826
5827         if (c->root_hash_sig_path)
5828                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5829
5830         if (c->root_verity)
5831                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5832
5833         STRV_FOREACH(e, c->environment)
5834                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5835
5836         STRV_FOREACH(e, c->environment_files)
5837                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5838
5839         STRV_FOREACH(e, c->pass_environment)
5840                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5841
5842         STRV_FOREACH(e, c->unset_environment)
5843                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5844
5845         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5846
5847         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5848                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5849
5850                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5851                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5852
5853                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5854                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5855                 }
5856         }
5857
5858         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5859
5860         if (c->nice_set)
5861                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5862
5863         if (c->oom_score_adjust_set)
5864                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5865
5866         if (c->coredump_filter_set)
5867                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5868
5869         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5870                 if (c->rlimit[i]) {
5871                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5872                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5873                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5874                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5875                 }
5876
5877         if (c->ioprio_set) {
5878                 _cleanup_free_ char *class_str = NULL;
5879
5880                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5881                 if (r >= 0)
5882                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5883
5884                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5885         }
5886
5887         if (c->cpu_sched_set) {
5888                 _cleanup_free_ char *policy_str = NULL;
5889
5890                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5891                 if (r >= 0)
5892                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5893
5894                 fprintf(f,
5895                         "%sCPUSchedulingPriority: %i\n"
5896                         "%sCPUSchedulingResetOnFork: %s\n",
5897                         prefix, c->cpu_sched_priority,
5898                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5899         }
5900
5901         if (c->cpu_set.set) {
5902                 _cleanup_free_ char *affinity = NULL;
5903
5904                 affinity = cpu_set_to_range_string(&c->cpu_set);
5905                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5906         }
5907
5908         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5909                 _cleanup_free_ char *nodes = NULL;
5910
5911                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5912                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5913                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5914         }
5915
5916         if (c->timer_slack_nsec != NSEC_INFINITY)
5917                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5918
5919         fprintf(f,
5920                 "%sStandardInput: %s\n"
5921                 "%sStandardOutput: %s\n"
5922                 "%sStandardError: %s\n",
5923                 prefix, exec_input_to_string(c->std_input),
5924                 prefix, exec_output_to_string(c->std_output),
5925                 prefix, exec_output_to_string(c->std_error));
5926
5927         if (c->std_input == EXEC_INPUT_NAMED_FD)
5928                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5929         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5930                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5931         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5932                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5933
5934         if (c->std_input == EXEC_INPUT_FILE)
5935                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5936         if (c->std_output == EXEC_OUTPUT_FILE)
5937                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5938         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5939                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5940         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5941                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5942         if (c->std_error == EXEC_OUTPUT_FILE)
5943                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5944         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5945                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5946         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5947                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5948
5949         if (c->tty_path)
5950                 fprintf(f,
5951                         "%sTTYPath: %s\n"
5952                         "%sTTYReset: %s\n"
5953                         "%sTTYVHangup: %s\n"
5954                         "%sTTYVTDisallocate: %s\n"
5955                         "%sTTYRows: %u\n"
5956                         "%sTTYColumns: %u\n",
5957                         prefix, c->tty_path,
5958                         prefix, yes_no(c->tty_reset),
5959                         prefix, yes_no(c->tty_vhangup),
5960                         prefix, yes_no(c->tty_vt_disallocate),
5961                         prefix, c->tty_rows,
5962                         prefix, c->tty_cols);
5963
5964         if (IN_SET(c->std_output,
5965                    EXEC_OUTPUT_KMSG,
5966                    EXEC_OUTPUT_JOURNAL,
5967                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5968                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5969             IN_SET(c->std_error,
5970                    EXEC_OUTPUT_KMSG,
5971                    EXEC_OUTPUT_JOURNAL,
5972                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5973                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5974
5975                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5976
5977                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5978                 if (r >= 0)
5979                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5980
5981                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5982                 if (r >= 0)
5983                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5984         }
5985
5986         if (c->log_level_max >= 0) {
5987                 _cleanup_free_ char *t = NULL;
5988
5989                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5990
5991                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5992         }
5993
5994         if (c->log_ratelimit_interval_usec > 0)
5995                 fprintf(f,
5996                         "%sLogRateLimitIntervalSec: %s\n",
5997                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5998
5999         if (c->log_ratelimit_burst > 0)
6000                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6001
6002         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6003                 fprintf(f, "%sLogExtraFields: ", prefix);
6004                 fwrite(c->log_extra_fields[j].iov_base,
6005                        1, c->log_extra_fields[j].iov_len,
6006                        f);
6007                 fputc('\n', f);
6008         }
6009
6010         if (c->log_namespace)
6011                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6012
6013         if (c->secure_bits) {
6014                 _cleanup_free_ char *str = NULL;
6015
6016                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6017                 if (r >= 0)
6018                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6019         }
6020
6021         if (c->capability_bounding_set != CAP_ALL) {
6022                 _cleanup_free_ char *str = NULL;
6023
6024                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6025                 if (r >= 0)
6026                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6027         }
6028
6029         if (c->capability_ambient_set != 0) {
6030                 _cleanup_free_ char *str = NULL;
6031
6032                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6033                 if (r >= 0)
6034                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6035         }
6036
6037         if (c->user)
6038                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6039         if (c->group)
6040                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6041
6042         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6043
6044         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6045
6046         if (c->pam_name)
6047                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6048
6049         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6050         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6051         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6052         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6053         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6054         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6055
6056         for (size_t i = 0; i < c->n_bind_mounts; i++)
6057                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6058                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6059                         c->bind_mounts[i].ignore_enoent ? "-": "",
6060                         c->bind_mounts[i].source,
6061                         c->bind_mounts[i].destination,
6062                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6063
6064         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6065                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6066
6067                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6068                         t->path,
6069                         isempty(t->options) ? "" : ":",
6070                         strempty(t->options));
6071         }
6072
6073         if (c->utmp_id)
6074                 fprintf(f,
6075                         "%sUtmpIdentifier: %s\n",
6076                         prefix, c->utmp_id);
6077
6078         if (c->selinux_context)
6079                 fprintf(f,
6080                         "%sSELinuxContext: %s%s\n",
6081                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6082
6083         if (c->apparmor_profile)
6084                 fprintf(f,
6085                         "%sAppArmorProfile: %s%s\n",
6086                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6087
6088         if (c->smack_process_label)
6089                 fprintf(f,
6090                         "%sSmackProcessLabel: %s%s\n",
6091                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6092
6093         if (c->personality != PERSONALITY_INVALID)
6094                 fprintf(f,
6095                         "%sPersonality: %s\n",
6096                         prefix, strna(personality_to_string(c->personality)));
6097
6098         fprintf(f,
6099                 "%sLockPersonality: %s\n",
6100                 prefix, yes_no(c->lock_personality));
6101
6102         if (c->syscall_filter) {
6103                 fprintf(f,
6104                         "%sSystemCallFilter: ",
6105                         prefix);
6106
6107                 if (!c->syscall_allow_list)
6108                         fputc('~', f);
6109
6110 #if HAVE_SECCOMP
6111                 void *id, *val;
6112                 bool first = true;
6113                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6114                         _cleanup_free_ char *name = NULL;
6115                         const char *errno_name = NULL;
6116                         int num = PTR_TO_INT(val);
6117
6118                         if (first)
6119                                 first = false;
6120                         else
6121                                 fputc(' ', f);
6122
6123                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6124                         fputs(strna(name), f);
6125
6126                         if (num >= 0) {
6127                                 errno_name = seccomp_errno_or_action_to_string(num);
6128                                 if (errno_name)
6129                                         fprintf(f, ":%s", errno_name);
6130                                 else
6131                                         fprintf(f, ":%d", num);
6132                         }
6133                 }
6134 #endif
6135
6136                 fputc('\n', f);
6137         }
6138
6139         if (c->syscall_archs) {
6140                 fprintf(f,
6141                         "%sSystemCallArchitectures:",
6142                         prefix);
6143
6144 #if HAVE_SECCOMP
6145                 void *id;
6146                 SET_FOREACH(id, c->syscall_archs)
6147                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6148 #endif
6149                 fputc('\n', f);
6150         }
6151
6152         if (exec_context_restrict_namespaces_set(c)) {
6153                 _cleanup_free_ char *s = NULL;
6154
6155                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6156                 if (r >= 0)
6157                         fprintf(f, "%sRestrictNamespaces: %s\n",
6158                                 prefix, strna(s));
6159         }
6160
6161 #if HAVE_LIBBPF
6162         if (exec_context_restrict_filesystems_set(c)) {
6163                 char *fs;
6164                 SET_FOREACH(fs, c->restrict_filesystems)
6165                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6166         }
6167 #endif
6168
6169         if (c->network_namespace_path)
6170                 fprintf(f,
6171                         "%sNetworkNamespacePath: %s\n",
6172                         prefix, c->network_namespace_path);
6173
6174         if (c->syscall_errno > 0) {
6175                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6176
6177 #if HAVE_SECCOMP
6178                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6179                 if (errno_name)
6180                         fputs(errno_name, f);
6181                 else
6182                         fprintf(f, "%d", c->syscall_errno);
6183 #endif
6184                 fputc('\n', f);
6185         }
6186
6187         for (size_t i = 0; i < c->n_mount_images; i++) {
6188                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6189                         c->mount_images[i].ignore_enoent ? "-": "",
6190                         c->mount_images[i].source,
6191                         c->mount_images[i].destination);
6192                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6193                         fprintf(f, ":%s:%s",
6194                                 partition_designator_to_string(o->partition_designator),
6195                                 strempty(o->options));
6196                 fprintf(f, "\n");
6197         }
6198
6199         for (size_t i = 0; i < c->n_extension_images; i++) {
6200                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6201                         c->extension_images[i].ignore_enoent ? "-": "",
6202                         c->extension_images[i].source);
6203                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6204                         fprintf(f, ":%s:%s",
6205                                 partition_designator_to_string(o->partition_designator),
6206                                 strempty(o->options));
6207                 fprintf(f, "\n");
6208         }
6209
6210         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6211 }
6212
6213 bool exec_context_maintains_privileges(const ExecContext *c) {
6214         assert(c);
6215
6216         /* Returns true if the process forked off would run under
6217          * an unchanged UID or as root. */
6218
6219         if (!c->user)
6220                 return true;
6221
6222         if (streq(c->user, "root") || streq(c->user, "0"))
6223                 return true;
6224
6225         return false;
6226 }
6227
6228 int exec_context_get_effective_ioprio(const ExecContext *c) {
6229         int p;
6230
6231         assert(c);
6232
6233         if (c->ioprio_set)
6234                 return c->ioprio;
6235
6236         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6237         if (p < 0)
6238                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6239
6240         return ioprio_normalize(p);
6241 }
6242
6243 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6244         assert(c);
6245
6246         /* Explicit setting wins */
6247         if (c->mount_apivfs_set)
6248                 return c->mount_apivfs;
6249
6250         /* Default to "yes" if root directory or image are specified */
6251         if (exec_context_with_rootfs(c))
6252                 return true;
6253
6254         return false;
6255 }
6256
6257 void exec_context_free_log_extra_fields(ExecContext *c) {
6258         assert(c);
6259
6260         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6261                 free(c->log_extra_fields[l].iov_base);
6262         c->log_extra_fields = mfree(c->log_extra_fields);
6263         c->n_log_extra_fields = 0;
6264 }
6265
6266 void exec_context_revert_tty(ExecContext *c) {
6267         _cleanup_close_ int fd = -1;
6268         const char *path;
6269         struct stat st;
6270         int r;
6271
6272         assert(c);
6273
6274         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6275         exec_context_tty_reset(c, NULL);
6276
6277         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6278          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6279          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6280         if (!exec_context_may_touch_tty(c))
6281                 return;
6282
6283         path = exec_context_tty_path(c);
6284         if (!path)
6285                 return;
6286
6287         fd = open(path, O_PATH|O_CLOEXEC);
6288         if (fd < 0)
6289                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6290                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6291                                              path);
6292
6293         if (fstat(fd, &st) < 0)
6294                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6295
6296         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6297          * if things are a character device, since a proper check either means we'd have to open the TTY and
6298          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6299          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6300          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6301         if (!S_ISCHR(st.st_mode))
6302                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6303
6304         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6305         if (r < 0)
6306                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6307 }
6308
6309 int exec_context_get_clean_directories(
6310                 ExecContext *c,
6311                 char **prefix,
6312                 ExecCleanMask mask,
6313                 char ***ret) {
6314
6315         _cleanup_strv_free_ char **l = NULL;
6316         int r;
6317
6318         assert(c);
6319         assert(prefix);
6320         assert(ret);
6321
6322         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6323                 if (!FLAGS_SET(mask, 1U << t))
6324                         continue;
6325
6326                 if (!prefix[t])
6327                         continue;
6328
6329                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6330                         char *j;
6331
6332                         j = path_join(prefix[t], c->directories[t].items[i].path);
6333                         if (!j)
6334                                 return -ENOMEM;
6335
6336                         r = strv_consume(&l, j);
6337                         if (r < 0)
6338                                 return r;
6339
6340                         /* Also remove private directories unconditionally. */
6341                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6342                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6343                                 if (!j)
6344                                         return -ENOMEM;
6345
6346                                 r = strv_consume(&l, j);
6347                                 if (r < 0)
6348                                         return r;
6349                         }
6350
6351                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6352                                 j = path_join(prefix[t], *symlink);
6353                                 if (!j)
6354                                         return -ENOMEM;
6355
6356                                 r = strv_consume(&l, j);
6357                                 if (r < 0)
6358                                         return r;
6359                         }
6360                 }
6361         }
6362
6363         *ret = TAKE_PTR(l);
6364         return 0;
6365 }
6366
6367 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6368         ExecCleanMask mask = 0;
6369
6370         assert(c);
6371         assert(ret);
6372
6373         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6374                 if (c->directories[t].n_items > 0)
6375                         mask |= 1U << t;
6376
6377         *ret = mask;
6378         return 0;
6379 }
6380
6381 void exec_status_start(ExecStatus *s, pid_t pid) {
6382         assert(s);
6383
6384         *s = (ExecStatus) {
6385                 .pid = pid,
6386         };
6387
6388         dual_timestamp_get(&s->start_timestamp);
6389 }
6390
6391 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6392         assert(s);
6393
6394         if (s->pid != pid)
6395                 *s = (ExecStatus) {
6396                         .pid = pid,
6397                 };
6398
6399         dual_timestamp_get(&s->exit_timestamp);
6400
6401         s->code = code;
6402         s->status = status;
6403
6404         if (context && context->utmp_id)
6405                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6406 }
6407
6408 void exec_status_reset(ExecStatus *s) {
6409         assert(s);
6410
6411         *s = (ExecStatus) {};
6412 }
6413
6414 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6415         assert(s);
6416         assert(f);
6417
6418         if (s->pid <= 0)
6419                 return;
6420
6421         prefix = strempty(prefix);
6422
6423         fprintf(f,
6424                 "%sPID: "PID_FMT"\n",
6425                 prefix, s->pid);
6426
6427         if (dual_timestamp_is_set(&s->start_timestamp))
6428                 fprintf(f,
6429                         "%sStart Timestamp: %s\n",
6430                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6431
6432         if (dual_timestamp_is_set(&s->exit_timestamp))
6433                 fprintf(f,
6434                         "%sExit Timestamp: %s\n"
6435                         "%sExit Code: %s\n"
6436                         "%sExit Status: %i\n",
6437                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6438                         prefix, sigchld_code_to_string(s->code),
6439                         prefix, s->status);
6440 }
6441
6442 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6443         _cleanup_free_ char *cmd = NULL;
6444         const char *prefix2;
6445
6446         assert(c);
6447         assert(f);
6448
6449         prefix = strempty(prefix);
6450         prefix2 = strjoina(prefix, "\t");
6451
6452         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6453
6454         fprintf(f,
6455                 "%sCommand Line: %s\n",
6456                 prefix, strnull(cmd));
6457
6458         exec_status_dump(&c->exec_status, f, prefix2);
6459 }
6460
6461 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6462         assert(f);
6463
6464         prefix = strempty(prefix);
6465
6466         LIST_FOREACH(command, i, c)
6467                 exec_command_dump(i, f, prefix);
6468 }
6469
6470 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6471         ExecCommand *end;
6472
6473         assert(l);
6474         assert(e);
6475
6476         if (*l) {
6477                 /* It's kind of important, that we keep the order here */
6478                 LIST_FIND_TAIL(command, *l, end);
6479                 LIST_INSERT_AFTER(command, *l, end, e);
6480         } else
6481               *l = e;
6482 }
6483
6484 int exec_command_set(ExecCommand *c, const char *path, ...) {
6485         va_list ap;
6486         char **l, *p;
6487
6488         assert(c);
6489         assert(path);
6490
6491         va_start(ap, path);
6492         l = strv_new_ap(path, ap);
6493         va_end(ap);
6494
6495         if (!l)
6496                 return -ENOMEM;
6497
6498         p = strdup(path);
6499         if (!p) {
6500                 strv_free(l);
6501                 return -ENOMEM;
6502         }
6503
6504         free_and_replace(c->path, p);
6505
6506         return strv_free_and_replace(c->argv, l);
6507 }
6508
6509 int exec_command_append(ExecCommand *c, const char *path, ...) {
6510         _cleanup_strv_free_ char **l = NULL;
6511         va_list ap;
6512         int r;
6513
6514         assert(c);
6515         assert(path);
6516
6517         va_start(ap, path);
6518         l = strv_new_ap(path, ap);
6519         va_end(ap);
6520
6521         if (!l)
6522                 return -ENOMEM;
6523
6524         r = strv_extend_strv(&c->argv, l, false);
6525         if (r < 0)
6526                 return r;
6527
6528         return 0;
6529 }
6530
6531 static void *remove_tmpdir_thread(void *p) {
6532         _cleanup_free_ char *path = p;
6533
6534         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6535         return NULL;
6536 }
6537
6538 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6539         int r;
6540
6541         if (!rt)
6542                 return NULL;
6543
6544         if (rt->manager)
6545                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6546
6547         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6548
6549         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6550                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6551
6552                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6553                 if (r < 0)
6554                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6555                 else
6556                         rt->tmp_dir = NULL;
6557         }
6558
6559         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6560                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6561
6562                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6563                 if (r < 0)
6564                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6565                 else
6566                         rt->var_tmp_dir = NULL;
6567         }
6568
6569         rt->id = mfree(rt->id);
6570         rt->tmp_dir = mfree(rt->tmp_dir);
6571         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6572         safe_close_pair(rt->netns_storage_socket);
6573         safe_close_pair(rt->ipcns_storage_socket);
6574         return mfree(rt);
6575 }
6576
6577 static void exec_runtime_freep(ExecRuntime **rt) {
6578         (void) exec_runtime_free(*rt, false);
6579 }
6580
6581 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6582         _cleanup_free_ char *id_copy = NULL;
6583         ExecRuntime *n;
6584
6585         assert(ret);
6586
6587         id_copy = strdup(id);
6588         if (!id_copy)
6589                 return -ENOMEM;
6590
6591         n = new(ExecRuntime, 1);
6592         if (!n)
6593                 return -ENOMEM;
6594
6595         *n = (ExecRuntime) {
6596                 .id = TAKE_PTR(id_copy),
6597                 .netns_storage_socket = { -1, -1 },
6598                 .ipcns_storage_socket = { -1, -1 },
6599         };
6600
6601         *ret = n;
6602         return 0;
6603 }
6604
6605 static int exec_runtime_add(
6606                 Manager *m,
6607                 const char *id,
6608                 char **tmp_dir,
6609                 char **var_tmp_dir,
6610                 int netns_storage_socket[2],
6611                 int ipcns_storage_socket[2],
6612                 ExecRuntime **ret) {
6613
6614         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6615         int r;
6616
6617         assert(m);
6618         assert(id);
6619
6620         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6621
6622         r = exec_runtime_allocate(&rt, id);
6623         if (r < 0)
6624                 return r;
6625
6626         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6627         if (r < 0)
6628                 return r;
6629
6630         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6631         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6632         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6633
6634         if (netns_storage_socket) {
6635                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6636                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6637         }
6638
6639         if (ipcns_storage_socket) {
6640                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6641                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6642         }
6643
6644         rt->manager = m;
6645
6646         if (ret)
6647                 *ret = rt;
6648         /* do not remove created ExecRuntime object when the operation succeeds. */
6649         TAKE_PTR(rt);
6650         return 0;
6651 }
6652
6653 static int exec_runtime_make(
6654                 Manager *m,
6655                 const ExecContext *c,
6656                 const char *id,
6657                 ExecRuntime **ret) {
6658
6659         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6660         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6661         int r;
6662
6663         assert(m);
6664         assert(c);
6665         assert(id);
6666
6667         /* It is not necessary to create ExecRuntime object. */
6668         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6669                 *ret = NULL;
6670                 return 0;
6671         }
6672
6673         if (c->private_tmp &&
6674             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6675               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6676                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6677                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6678                 if (r < 0)
6679                         return r;
6680         }
6681
6682         if (c->private_network || c->network_namespace_path) {
6683                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6684                         return -errno;
6685         }
6686
6687         if (c->private_ipc || c->ipc_namespace_path) {
6688                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6689                         return -errno;
6690         }
6691
6692         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6693         if (r < 0)
6694                 return r;
6695
6696         return 1;
6697 }
6698
6699 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6700         ExecRuntime *rt;
6701         int r;
6702
6703         assert(m);
6704         assert(id);
6705         assert(ret);
6706
6707         rt = hashmap_get(m->exec_runtime_by_id, id);
6708         if (rt)
6709                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6710                 goto ref;
6711
6712         if (!create) {
6713                 *ret = NULL;
6714                 return 0;
6715         }
6716
6717         /* If not found, then create a new object. */
6718         r = exec_runtime_make(m, c, id, &rt);
6719         if (r < 0)
6720                 return r;
6721         if (r == 0) {
6722                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6723                 *ret = NULL;
6724                 return 0;
6725         }
6726
6727 ref:
6728         /* increment reference counter. */
6729         rt->n_ref++;
6730         *ret = rt;
6731         return 1;
6732 }
6733
6734 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6735         if (!rt)
6736                 return NULL;
6737
6738         assert(rt->n_ref > 0);
6739
6740         rt->n_ref--;
6741         if (rt->n_ref > 0)
6742                 return NULL;
6743
6744         return exec_runtime_free(rt, destroy);
6745 }
6746
6747 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6748         ExecRuntime *rt;
6749
6750         assert(m);
6751         assert(f);
6752         assert(fds);
6753
6754         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6755                 fprintf(f, "exec-runtime=%s", rt->id);
6756
6757                 if (rt->tmp_dir)
6758                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6759
6760                 if (rt->var_tmp_dir)
6761                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6762
6763                 if (rt->netns_storage_socket[0] >= 0) {
6764                         int copy;
6765
6766                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6767                         if (copy < 0)
6768                                 return copy;
6769
6770                         fprintf(f, " netns-socket-0=%i", copy);
6771                 }
6772
6773                 if (rt->netns_storage_socket[1] >= 0) {
6774                         int copy;
6775
6776                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6777                         if (copy < 0)
6778                                 return copy;
6779
6780                         fprintf(f, " netns-socket-1=%i", copy);
6781                 }
6782
6783                 if (rt->ipcns_storage_socket[0] >= 0) {
6784                         int copy;
6785
6786                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6787                         if (copy < 0)
6788                                 return copy;
6789
6790                         fprintf(f, " ipcns-socket-0=%i", copy);
6791                 }
6792
6793                 if (rt->ipcns_storage_socket[1] >= 0) {
6794                         int copy;
6795
6796                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6797                         if (copy < 0)
6798                                 return copy;
6799
6800                         fprintf(f, " ipcns-socket-1=%i", copy);
6801                 }
6802
6803                 fputc('\n', f);
6804         }
6805
6806         return 0;
6807 }
6808
6809 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6810         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6811         ExecRuntime *rt;
6812         int r;
6813
6814         /* This is for the migration from old (v237 or earlier) deserialization text.
6815          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6816          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6817          * so or not from the serialized text, then we always creates a new object owned by this. */
6818
6819         assert(u);
6820         assert(key);
6821         assert(value);
6822
6823         /* Manager manages ExecRuntime objects by the unit id.
6824          * So, we omit the serialized text when the unit does not have id (yet?)... */
6825         if (isempty(u->id)) {
6826                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6827                 return 0;
6828         }
6829
6830         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6831                 return log_oom();
6832
6833         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6834         if (!rt) {
6835                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6836                         return log_oom();
6837
6838                 rt = rt_create;
6839         }
6840
6841         if (streq(key, "tmp-dir")) {
6842                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6843                         return -ENOMEM;
6844
6845         } else if (streq(key, "var-tmp-dir")) {
6846                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6847                         return -ENOMEM;
6848
6849         } else if (streq(key, "netns-socket-0")) {
6850                 int fd;
6851
6852                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6853                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6854                         return 0;
6855                 }
6856
6857                 safe_close(rt->netns_storage_socket[0]);
6858                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6859
6860         } else if (streq(key, "netns-socket-1")) {
6861                 int fd;
6862
6863                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6864                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6865                         return 0;
6866                 }
6867
6868                 safe_close(rt->netns_storage_socket[1]);
6869                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6870
6871         } else
6872                 return 0;
6873
6874         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6875         if (rt_create) {
6876                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6877                 if (r < 0) {
6878                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6879                         return 0;
6880                 }
6881
6882                 rt_create->manager = u->manager;
6883
6884                 /* Avoid cleanup */
6885                 TAKE_PTR(rt_create);
6886         }
6887
6888         return 1;
6889 }
6890
6891 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6892         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6893         char *id = NULL;
6894         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6895         const char *p, *v = ASSERT_PTR(value);
6896         size_t n;
6897
6898         assert(m);
6899         assert(fds);
6900
6901         n = strcspn(v, " ");
6902         id = strndupa_safe(v, n);
6903         if (v[n] != ' ')
6904                 goto finalize;
6905         p = v + n + 1;
6906
6907         v = startswith(p, "tmp-dir=");
6908         if (v) {
6909                 n = strcspn(v, " ");
6910                 tmp_dir = strndup(v, n);
6911                 if (!tmp_dir)
6912                         return log_oom();
6913                 if (v[n] != ' ')
6914                         goto finalize;
6915                 p = v + n + 1;
6916         }
6917
6918         v = startswith(p, "var-tmp-dir=");
6919         if (v) {
6920                 n = strcspn(v, " ");
6921                 var_tmp_dir = strndup(v, n);
6922                 if (!var_tmp_dir)
6923                         return log_oom();
6924                 if (v[n] != ' ')
6925                         goto finalize;
6926                 p = v + n + 1;
6927         }
6928
6929         v = startswith(p, "netns-socket-0=");
6930         if (v) {
6931                 char *buf;
6932
6933                 n = strcspn(v, " ");
6934                 buf = strndupa_safe(v, n);
6935
6936                 r = safe_atoi(buf, &netns_fdpair[0]);
6937                 if (r < 0)
6938                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6939                 if (!fdset_contains(fds, netns_fdpair[0]))
6940                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6941                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6942                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6943                 if (v[n] != ' ')
6944                         goto finalize;
6945                 p = v + n + 1;
6946         }
6947
6948         v = startswith(p, "netns-socket-1=");
6949         if (v) {
6950                 char *buf;
6951
6952                 n = strcspn(v, " ");
6953                 buf = strndupa_safe(v, n);
6954
6955                 r = safe_atoi(buf, &netns_fdpair[1]);
6956                 if (r < 0)
6957                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6958                 if (!fdset_contains(fds, netns_fdpair[1]))
6959                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6960                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6961                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6962                 if (v[n] != ' ')
6963                         goto finalize;
6964                 p = v + n + 1;
6965         }
6966
6967         v = startswith(p, "ipcns-socket-0=");
6968         if (v) {
6969                 char *buf;
6970
6971                 n = strcspn(v, " ");
6972                 buf = strndupa_safe(v, n);
6973
6974                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6975                 if (r < 0)
6976                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6977                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6978                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6979                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6980                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6981                 if (v[n] != ' ')
6982                         goto finalize;
6983                 p = v + n + 1;
6984         }
6985
6986         v = startswith(p, "ipcns-socket-1=");
6987         if (v) {
6988                 char *buf;
6989
6990                 n = strcspn(v, " ");
6991                 buf = strndupa_safe(v, n);
6992
6993                 r = safe_atoi(buf, &ipcns_fdpair[1]);
6994                 if (r < 0)
6995                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6996                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6997                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6998                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6999                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7000         }
7001
7002 finalize:
7003         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7004         if (r < 0)
7005                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7006         return 0;
7007 }
7008
7009 void exec_runtime_vacuum(Manager *m) {
7010         ExecRuntime *rt;
7011
7012         assert(m);
7013
7014         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7015
7016         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
7017                 if (rt->n_ref > 0)
7018                         continue;
7019
7020                 (void) exec_runtime_free(rt, false);
7021         }
7022 }
7023
7024 void exec_params_clear(ExecParameters *p) {
7025         if (!p)
7026                 return;
7027
7028         p->environment = strv_free(p->environment);
7029         p->fd_names = strv_free(p->fd_names);
7030         p->fds = mfree(p->fds);
7031         p->exec_fd = safe_close(p->exec_fd);
7032 }
7033
7034 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7035         if (!sc)
7036                 return NULL;
7037
7038         free(sc->id);
7039         free(sc->data);
7040         return mfree(sc);
7041 }
7042
7043 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7044         if (!lc)
7045                 return NULL;
7046
7047         free(lc->id);
7048         free(lc->path);
7049         return mfree(lc);
7050 }
7051
7052 void exec_directory_done(ExecDirectory *d) {
7053         if (!d)
7054                 return;
7055
7056         for (size_t i = 0; i < d->n_items; i++) {
7057                 free(d->items[i].path);
7058                 strv_free(d->items[i].symlinks);
7059         }
7060
7061         d->items = mfree(d->items);
7062         d->n_items = 0;
7063         d->mode = 0755;
7064 }
7065
7066 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7067         assert(d);
7068         assert(path);
7069
7070         for (size_t i = 0; i < d->n_items; i++)
7071                 if (path_equal(d->items[i].path, path))
7072                         return &d->items[i];
7073
7074         return NULL;
7075 }
7076
7077 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7078         _cleanup_strv_free_ char **s = NULL;
7079         _cleanup_free_ char *p = NULL;
7080         ExecDirectoryItem *existing;
7081         int r;
7082
7083         assert(d);
7084         assert(path);
7085
7086         existing = exec_directory_find(d, path);
7087         if (existing) {
7088                 r = strv_extend(&existing->symlinks, symlink);
7089                 if (r < 0)
7090                         return r;
7091
7092                 return 0; /* existing item is updated */
7093         }
7094
7095         p = strdup(path);
7096         if (!p)
7097                 return -ENOMEM;
7098
7099         if (symlink) {
7100                 s = strv_new(symlink);
7101                 if (!s)
7102                         return -ENOMEM;
7103         }
7104
7105         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7106                 return -ENOMEM;
7107
7108         d->items[d->n_items++] = (ExecDirectoryItem) {
7109                 .path = TAKE_PTR(p),
7110                 .symlinks = TAKE_PTR(s),
7111         };
7112
7113         return 1; /* new item is added */
7114 }
7115
7116 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7117         assert(a);
7118         assert(b);
7119
7120         return path_compare(a->path, b->path);
7121 }
7122
7123 void exec_directory_sort(ExecDirectory *d) {
7124         assert(d);
7125
7126         /* Sort the exec directories to make always parent directories processed at first in
7127          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7128          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7129          * list. See also comments in setup_exec_directory() and issue #24783. */
7130
7131         if (d->n_items <= 1)
7132                 return;
7133
7134         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7135
7136         for (size_t i = 1; i < d->n_items; i++)
7137                 for (size_t j = 0; j < i; j++)
7138                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7139                                 d->items[i].only_create = true;
7140                                 break;
7141                         }
7142 }
7143
7144 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7145 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7146
7147 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7148         [EXEC_INPUT_NULL] = "null",
7149         [EXEC_INPUT_TTY] = "tty",
7150         [EXEC_INPUT_TTY_FORCE] = "tty-force",
7151         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7152         [EXEC_INPUT_SOCKET] = "socket",
7153         [EXEC_INPUT_NAMED_FD] = "fd",
7154         [EXEC_INPUT_DATA] = "data",
7155         [EXEC_INPUT_FILE] = "file",
7156 };
7157
7158 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7159
7160 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7161         [EXEC_OUTPUT_INHERIT] = "inherit",
7162         [EXEC_OUTPUT_NULL] = "null",
7163         [EXEC_OUTPUT_TTY] = "tty",
7164         [EXEC_OUTPUT_KMSG] = "kmsg",
7165         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7166         [EXEC_OUTPUT_JOURNAL] = "journal",
7167         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7168         [EXEC_OUTPUT_SOCKET] = "socket",
7169         [EXEC_OUTPUT_NAMED_FD] = "fd",
7170         [EXEC_OUTPUT_FILE] = "file",
7171         [EXEC_OUTPUT_FILE_APPEND] = "append",
7172         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7173 };
7174
7175 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7176
7177 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7178         [EXEC_UTMP_INIT] = "init",
7179         [EXEC_UTMP_LOGIN] = "login",
7180         [EXEC_UTMP_USER] = "user",
7181 };
7182
7183 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7184
7185 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7186         [EXEC_PRESERVE_NO] = "no",
7187         [EXEC_PRESERVE_YES] = "yes",
7188         [EXEC_PRESERVE_RESTART] = "restart",
7189 };
7190
7191 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7192
7193 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7194 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7195         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7196         [EXEC_DIRECTORY_STATE] = "StateDirectory",
7197         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7198         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7199         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7200 };
7201
7202 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7203
7204 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7205 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7206         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
7207         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
7208         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
7209         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
7210         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7211 };
7212
7213 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7214
7215 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7216  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7217  * directories, specifically .timer units with their timestamp touch file. */
7218 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7219         [EXEC_DIRECTORY_RUNTIME] = "runtime",
7220         [EXEC_DIRECTORY_STATE] = "state",
7221         [EXEC_DIRECTORY_CACHE] = "cache",
7222         [EXEC_DIRECTORY_LOGS] = "logs",
7223         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7224 };
7225
7226 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7227
7228 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7229  * the service payload in. */
7230 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7231         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7232         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7233         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7234         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7235         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7236 };
7237
7238 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7239
7240 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7241         [EXEC_KEYRING_INHERIT] = "inherit",
7242         [EXEC_KEYRING_PRIVATE] = "private",
7243         [EXEC_KEYRING_SHARED] = "shared",
7244 };
7245
7246 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);