src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "bpf-lsm.h"
  45 #include "cap-list.h"
  46 #include "capability-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase-symlinks.h"
  49 #include "chown-recursive.h"
  50 #include "cpu-set-util.h"
  51 #include "creds-util.h"
  52 #include "data-fd-util.h"
  53 #include "def.h"
  54 #include "env-file.h"
  55 #include "env-util.h"
  56 #include "errno-list.h"
  57 #include "escape.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "fileio.h"
  62 #include "format-util.h"
  63 #include "glob-util.h"
  64 #include "hexdecoct.h"
  65 #include "io-util.h"
  66 #include "ioprio-util.h"
  67 #include "label.h"
  68 #include "log.h"
  69 #include "macro.h"
  70 #include "manager.h"
  71 #include "manager-dump.h"
  72 #include "memory-util.h"
  73 #include "missing_fs.h"
  74 #include "missing_ioprio.h"
  75 #include "mkdir-label.h"
  76 #include "mount-util.h"
  77 #include "mountpoint-util.h"
  78 #include "namespace.h"
  79 #include "parse-util.h"
  80 #include "path-util.h"
  81 #include "process-util.h"
  82 #include "random-util.h"
  83 #include "recurse-dir.h"
  84 #include "rlimit-util.h"
  85 #include "rm-rf.h"
  86 #if HAVE_SECCOMP
  87 #include "seccomp-util.h"
  88 #endif
  89 #include "securebits-util.h"
  90 #include "selinux-util.h"
  91 #include "signal-util.h"
  92 #include "smack-util.h"
  93 #include "socket-util.h"
  94 #include "special.h"
  95 #include "stat-util.h"
  96 #include "string-table.h"
  97 #include "string-util.h"
  98 #include "strv.h"
  99 #include "syslog-util.h"
 100 #include "terminal-util.h"
 101 #include "tmpfile-util.h"
 102 #include "umask-util.h"
 103 #include "unit-serialize.h"
 104 #include "user-util.h"
 105 #include "utmp-wtmp.h"
 106
 107 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 108 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 109
 110 #define SNDBUF_SIZE (8*1024*1024)
 111
 112 static int shift_fds(int fds[], size_t n_fds) {
 113         if (n_fds <= 0)
 114                 return 0;
 115
 116         /* Modifies the fds array! (sorts it) */
 117
 118         assert(fds);
 119
 120         for (int start = 0;;) {
 121                 int restart_from = -1;
 122
 123                 for (int i = start; i < (int) n_fds; i++) {
 124                         int nfd;
 125
 126                         /* Already at right index? */
 127                         if (fds[i] == i+3)
 128                                 continue;
 129
 130                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 131                         if (nfd < 0)
 132                                 return -errno;
 133
 134                         safe_close(fds[i]);
 135                         fds[i] = nfd;
 136
 137                         /* Hmm, the fd we wanted isn't free? Then
 138                          * let's remember that and try again from here */
 139                         if (nfd != i+3 && restart_from < 0)
 140                                 restart_from = i;
 141                 }
 142
 143                 if (restart_from < 0)
 144                         break;
 145
 146                 start = restart_from;
 147         }
 148
 149         return 0;
 150 }
 151
 152 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 153         size_t n_fds;
 154         int r;
 155
 156         n_fds = n_socket_fds + n_storage_fds;
 157         if (n_fds <= 0)
 158                 return 0;
 159
 160         assert(fds);
 161
 162         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 163          * O_NONBLOCK only applies to socket activation though. */
 164
 165         for (size_t i = 0; i < n_fds; i++) {
 166
 167                 if (i < n_socket_fds) {
 168                         r = fd_nonblock(fds[i], nonblock);
 169                         if (r < 0)
 170                                 return r;
 171                 }
 172
 173                 /* We unconditionally drop FD_CLOEXEC from the fds,
 174                  * since after all we want to pass these fds to our
 175                  * children */
 176
 177                 r = fd_cloexec(fds[i], false);
 178                 if (r < 0)
 179                         return r;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static const char *exec_context_tty_path(const ExecContext *context) {
 186         assert(context);
 187
 188         if (context->stdio_as_fds)
 189                 return NULL;
 190
 191         if (context->tty_path)
 192                 return context->tty_path;
 193
 194         return "/dev/console";
 195 }
 196
 197 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 198         const char *path;
 199
 200         assert(context);
 201
 202         path = exec_context_tty_path(context);
 203
 204         if (context->tty_vhangup) {
 205                 if (p && p->stdin_fd >= 0)
 206                         (void) terminal_vhangup_fd(p->stdin_fd);
 207                 else if (path)
 208                         (void) terminal_vhangup(path);
 209         }
 210
 211         if (context->tty_reset) {
 212                 if (p && p->stdin_fd >= 0)
 213                         (void) reset_terminal_fd(p->stdin_fd, true);
 214                 else if (path)
 215                         (void) reset_terminal(path);
 216         }
 217
 218         if (p && p->stdin_fd >= 0)
 219                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
 220
 221         if (context->tty_vt_disallocate && path)
 222                 (void) vt_disallocate(path);
 223 }
 224
 225 static bool is_terminal_input(ExecInput i) {
 226         return IN_SET(i,
 227                       EXEC_INPUT_TTY,
 228                       EXEC_INPUT_TTY_FORCE,
 229                       EXEC_INPUT_TTY_FAIL);
 230 }
 231
 232 static bool is_terminal_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_TTY,
 235                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 236                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 237 }
 238
 239 static bool is_kmsg_output(ExecOutput o) {
 240         return IN_SET(o,
 241                       EXEC_OUTPUT_KMSG,
 242                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 243 }
 244
 245 static bool exec_context_needs_term(const ExecContext *c) {
 246         assert(c);
 247
 248         /* Return true if the execution context suggests we should set $TERM to something useful. */
 249
 250         if (is_terminal_input(c->std_input))
 251                 return true;
 252
 253         if (is_terminal_output(c->std_output))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_error))
 257                 return true;
 258
 259         return !!c->tty_path;
 260 }
 261
 262 static int open_null_as(int flags, int nfd) {
 263         int fd;
 264
 265         assert(nfd >= 0);
 266
 267         fd = open("/dev/null", flags|O_NOCTTY);
 268         if (fd < 0)
 269                 return -errno;
 270
 271         return move_fd(fd, nfd, false);
 272 }
 273
 274 static int connect_journal_socket(
 275                 int fd,
 276                 const char *log_namespace,
 277                 uid_t uid,
 278                 gid_t gid) {
 279
 280         union sockaddr_union sa;
 281         socklen_t sa_len;
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         const char *j;
 285         int r;
 286
 287         j = log_namespace ?
 288                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 289                 "/run/systemd/journal/stdout";
 290         r = sockaddr_un_set_path(&sa.un, j);
 291         if (r < 0)
 292                 return r;
 293         sa_len = r;
 294
 295         if (gid_is_valid(gid)) {
 296                 oldgid = getgid();
 297
 298                 if (setegid(gid) < 0)
 299                         return -errno;
 300         }
 301
 302         if (uid_is_valid(uid)) {
 303                 olduid = getuid();
 304
 305                 if (seteuid(uid) < 0) {
 306                         r = -errno;
 307                         goto restore_gid;
 308                 }
 309         }
 310
 311         r = RET_NERRNO(connect(fd, &sa.sa, sa_len));
 312
 313         /* If we fail to restore the uid or gid, things will likely
 314            fail later on. This should only happen if an LSM interferes. */
 315
 316         if (uid_is_valid(uid))
 317                 (void) seteuid(olduid);
 318
 319  restore_gid:
 320         if (gid_is_valid(gid))
 321                 (void) setegid(oldgid);
 322
 323         return r;
 324 }
 325
 326 static int connect_logger_as(
 327                 const Unit *unit,
 328                 const ExecContext *context,
 329                 const ExecParameters *params,
 330                 ExecOutput output,
 331                 const char *ident,
 332                 int nfd,
 333                 uid_t uid,
 334                 gid_t gid) {
 335
 336         _cleanup_close_ int fd = -1;
 337         int r;
 338
 339         assert(context);
 340         assert(params);
 341         assert(output < _EXEC_OUTPUT_MAX);
 342         assert(ident);
 343         assert(nfd >= 0);
 344
 345         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 346         if (fd < 0)
 347                 return -errno;
 348
 349         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 350         if (r < 0)
 351                 return r;
 352
 353         if (shutdown(fd, SHUT_RD) < 0)
 354                 return -errno;
 355
 356         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 357
 358         if (dprintf(fd,
 359                 "%s\n"
 360                 "%s\n"
 361                 "%i\n"
 362                 "%i\n"
 363                 "%i\n"
 364                 "%i\n"
 365                 "%i\n",
 366                 context->syslog_identifier ?: ident,
 367                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 368                 context->syslog_priority,
 369                 !!context->syslog_level_prefix,
 370                 false,
 371                 is_kmsg_output(output),
 372                 is_terminal_output(output)) < 0)
 373                 return -errno;
 374
 375         return move_fd(TAKE_FD(fd), nfd, false);
 376 }
 377
 378 static int open_terminal_as(const char *path, int flags, int nfd) {
 379         int fd;
 380
 381         assert(path);
 382         assert(nfd >= 0);
 383
 384         fd = open_terminal(path, flags | O_NOCTTY);
 385         if (fd < 0)
 386                 return fd;
 387
 388         return move_fd(fd, nfd, false);
 389 }
 390
 391 static int acquire_path(const char *path, int flags, mode_t mode) {
 392         union sockaddr_union sa;
 393         socklen_t sa_len;
 394         _cleanup_close_ int fd = -1;
 395         int r;
 396
 397         assert(path);
 398
 399         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 400                 flags |= O_CREAT;
 401
 402         fd = open(path, flags|O_NOCTTY, mode);
 403         if (fd >= 0)
 404                 return TAKE_FD(fd);
 405
 406         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 407                 return -errno;
 408
 409         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 410
 411         r = sockaddr_un_set_path(&sa.un, path);
 412         if (r < 0)
 413                 return r == -EINVAL ? -ENXIO : r;
 414         sa_len = r;
 415
 416         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 417         if (fd < 0)
 418                 return -errno;
 419
 420         if (connect(fd, &sa.sa, sa_len) < 0)
 421                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 422                                                            * indication that this wasn't an AF_UNIX socket after all */
 423
 424         if ((flags & O_ACCMODE) == O_RDONLY)
 425                 r = shutdown(fd, SHUT_WR);
 426         else if ((flags & O_ACCMODE) == O_WRONLY)
 427                 r = shutdown(fd, SHUT_RD);
 428         else
 429                 r = 0;
 430         if (r < 0)
 431                 return -errno;
 432
 433         return TAKE_FD(fd);
 434 }
 435
 436 static int fixup_input(
 437                 const ExecContext *context,
 438                 int socket_fd,
 439                 bool apply_tty_stdin) {
 440
 441         ExecInput std_input;
 442
 443         assert(context);
 444
 445         std_input = context->std_input;
 446
 447         if (is_terminal_input(std_input) && !apply_tty_stdin)
 448                 return EXEC_INPUT_NULL;
 449
 450         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 451                 return EXEC_INPUT_NULL;
 452
 453         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 454                 return EXEC_INPUT_NULL;
 455
 456         return std_input;
 457 }
 458
 459 static int fixup_output(ExecOutput output, int socket_fd) {
 460
 461         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 462                 return EXEC_OUTPUT_INHERIT;
 463
 464         return output;
 465 }
 466
 467 static int setup_input(
 468                 const ExecContext *context,
 469                 const ExecParameters *params,
 470                 int socket_fd,
 471                 const int named_iofds[static 3]) {
 472
 473         ExecInput i;
 474         int r;
 475
 476         assert(context);
 477         assert(params);
 478         assert(named_iofds);
 479
 480         if (params->stdin_fd >= 0) {
 481                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 482                         return -errno;
 483
 484                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 485                 if (isatty(STDIN_FILENO)) {
 486                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 487                         (void) reset_terminal_fd(STDIN_FILENO, true);
 488                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
 489                 }
 490
 491                 return STDIN_FILENO;
 492         }
 493
 494         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 495
 496         switch (i) {
 497
 498         case EXEC_INPUT_NULL:
 499                 return open_null_as(O_RDONLY, STDIN_FILENO);
 500
 501         case EXEC_INPUT_TTY:
 502         case EXEC_INPUT_TTY_FORCE:
 503         case EXEC_INPUT_TTY_FAIL: {
 504                 int fd;
 505
 506                 fd = acquire_terminal(exec_context_tty_path(context),
 507                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 508                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 509                                                                   ACQUIRE_TERMINAL_WAIT,
 510                                       USEC_INFINITY);
 511                 if (fd < 0)
 512                         return fd;
 513
 514                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
 515                 if (r < 0)
 516                         return r;
 517
 518                 return move_fd(fd, STDIN_FILENO, false);
 519         }
 520
 521         case EXEC_INPUT_SOCKET:
 522                 assert(socket_fd >= 0);
 523
 524                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 525
 526         case EXEC_INPUT_NAMED_FD:
 527                 assert(named_iofds[STDIN_FILENO] >= 0);
 528
 529                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 530                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 531
 532         case EXEC_INPUT_DATA: {
 533                 int fd;
 534
 535                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         case EXEC_INPUT_FILE: {
 543                 bool rw;
 544                 int fd;
 545
 546                 assert(context->stdio_file[STDIN_FILENO]);
 547
 548                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 549                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 550
 551                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 552                 if (fd < 0)
 553                         return fd;
 554
 555                 return move_fd(fd, STDIN_FILENO, false);
 556         }
 557
 558         default:
 559                 assert_not_reached();
 560         }
 561 }
 562
 563 static bool can_inherit_stderr_from_stdout(
 564                 const ExecContext *context,
 565                 ExecOutput o,
 566                 ExecOutput e) {
 567
 568         assert(context);
 569
 570         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 571          * stderr fd */
 572
 573         if (e == EXEC_OUTPUT_INHERIT)
 574                 return true;
 575         if (e != o)
 576                 return false;
 577
 578         if (e == EXEC_OUTPUT_NAMED_FD)
 579                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 580
 581         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 582                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 583
 584         return true;
 585 }
 586
 587 static int setup_output(
 588                 const Unit *unit,
 589                 const ExecContext *context,
 590                 const ExecParameters *params,
 591                 int fileno,
 592                 int socket_fd,
 593                 const int named_iofds[static 3],
 594                 const char *ident,
 595                 uid_t uid,
 596                 gid_t gid,
 597                 dev_t *journal_stream_dev,
 598                 ino_t *journal_stream_ino) {
 599
 600         ExecOutput o;
 601         ExecInput i;
 602         int r;
 603
 604         assert(unit);
 605         assert(context);
 606         assert(params);
 607         assert(ident);
 608         assert(journal_stream_dev);
 609         assert(journal_stream_ino);
 610
 611         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 612
 613                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 614                         return -errno;
 615
 616                 return STDOUT_FILENO;
 617         }
 618
 619         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 620                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 621                         return -errno;
 622
 623                 return STDERR_FILENO;
 624         }
 625
 626         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 627         o = fixup_output(context->std_output, socket_fd);
 628
 629         if (fileno == STDERR_FILENO) {
 630                 ExecOutput e;
 631                 e = fixup_output(context->std_error, socket_fd);
 632
 633                 /* This expects the input and output are already set up */
 634
 635                 /* Don't change the stderr file descriptor if we inherit all
 636                  * the way and are not on a tty */
 637                 if (e == EXEC_OUTPUT_INHERIT &&
 638                     o == EXEC_OUTPUT_INHERIT &&
 639                     i == EXEC_INPUT_NULL &&
 640                     !is_terminal_input(context->std_input) &&
 641                     getppid() != 1)
 642                         return fileno;
 643
 644                 /* Duplicate from stdout if possible */
 645                 if (can_inherit_stderr_from_stdout(context, o, e))
 646                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 647
 648                 o = e;
 649
 650         } else if (o == EXEC_OUTPUT_INHERIT) {
 651                 /* If input got downgraded, inherit the original value */
 652                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 653                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 654
 655                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 656                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 657                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 658
 659                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 660                 if (getppid() != 1)
 661                         return fileno;
 662
 663                 /* We need to open /dev/null here anew, to get the right access mode. */
 664                 return open_null_as(O_WRONLY, fileno);
 665         }
 666
 667         switch (o) {
 668
 669         case EXEC_OUTPUT_NULL:
 670                 return open_null_as(O_WRONLY, fileno);
 671
 672         case EXEC_OUTPUT_TTY:
 673                 if (is_terminal_input(i))
 674                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 675
 676                 /* We don't reset the terminal if this is just about output */
 677                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 678
 679         case EXEC_OUTPUT_KMSG:
 680         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 681         case EXEC_OUTPUT_JOURNAL:
 682         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 683                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 684                 if (r < 0) {
 685                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 686                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 687                         r = open_null_as(O_WRONLY, fileno);
 688                 } else {
 689                         struct stat st;
 690
 691                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 692                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 693                          * services to detect whether they are connected to the journal or not.
 694                          *
 695                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 696                          * about STDERR as that's usually the best way to do logging. */
 697
 698                         if (fstat(fileno, &st) >= 0 &&
 699                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 700                                 *journal_stream_dev = st.st_dev;
 701                                 *journal_stream_ino = st.st_ino;
 702                         }
 703                 }
 704                 return r;
 705
 706         case EXEC_OUTPUT_SOCKET:
 707                 assert(socket_fd >= 0);
 708
 709                 return RET_NERRNO(dup2(socket_fd, fileno));
 710
 711         case EXEC_OUTPUT_NAMED_FD:
 712                 assert(named_iofds[fileno] >= 0);
 713
 714                 (void) fd_nonblock(named_iofds[fileno], false);
 715                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 716
 717         case EXEC_OUTPUT_FILE:
 718         case EXEC_OUTPUT_FILE_APPEND:
 719         case EXEC_OUTPUT_FILE_TRUNCATE: {
 720                 bool rw;
 721                 int fd, flags;
 722
 723                 assert(context->stdio_file[fileno]);
 724
 725                 rw = context->std_input == EXEC_INPUT_FILE &&
 726                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 727
 728                 if (rw)
 729                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 730
 731                 flags = O_WRONLY;
 732                 if (o == EXEC_OUTPUT_FILE_APPEND)
 733                         flags |= O_APPEND;
 734                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 735                         flags |= O_TRUNC;
 736
 737                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 738                 if (fd < 0)
 739                         return fd;
 740
 741                 return move_fd(fd, fileno, 0);
 742         }
 743
 744         default:
 745                 assert_not_reached();
 746         }
 747 }
 748
 749 static int chown_terminal(int fd, uid_t uid) {
 750         int r;
 751
 752         assert(fd >= 0);
 753
 754         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 755         if (isatty(fd) < 1) {
 756                 if (IN_SET(errno, EINVAL, ENOTTY))
 757                         return 0; /* not a tty */
 758
 759                 return -errno;
 760         }
 761
 762         /* This might fail. What matters are the results. */
 763         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 764         if (r < 0)
 765                 return r;
 766
 767         return 1;
 768 }
 769
 770 static int setup_confirm_stdio(
 771                 const ExecContext *context,
 772                 const char *vc,
 773                 int *ret_saved_stdin,
 774                 int *ret_saved_stdout) {
 775
 776         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 777         int r;
 778
 779         assert(ret_saved_stdin);
 780         assert(ret_saved_stdout);
 781
 782         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 783         if (saved_stdin < 0)
 784                 return -errno;
 785
 786         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 787         if (saved_stdout < 0)
 788                 return -errno;
 789
 790         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 791         if (fd < 0)
 792                 return fd;
 793
 794         r = chown_terminal(fd, getuid());
 795         if (r < 0)
 796                 return r;
 797
 798         r = reset_terminal_fd(fd, true);
 799         if (r < 0)
 800                 return r;
 801
 802         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
 803         if (r < 0)
 804                 return r;
 805
 806         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 807         TAKE_FD(fd);
 808         if (r < 0)
 809                 return r;
 810
 811         *ret_saved_stdin = TAKE_FD(saved_stdin);
 812         *ret_saved_stdout = TAKE_FD(saved_stdout);
 813         return 0;
 814 }
 815
 816 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 817         assert(err < 0);
 818
 819         if (err == -ETIMEDOUT)
 820                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 821         else {
 822                 errno = -err;
 823                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 824         }
 825 }
 826
 827 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 828         _cleanup_close_ int fd = -1;
 829
 830         assert(vc);
 831
 832         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 833         if (fd < 0)
 834                 return;
 835
 836         write_confirm_error_fd(err, fd, u);
 837 }
 838
 839 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 840         int r = 0;
 841
 842         assert(saved_stdin);
 843         assert(saved_stdout);
 844
 845         release_terminal();
 846
 847         if (*saved_stdin >= 0)
 848                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 849                         r = -errno;
 850
 851         if (*saved_stdout >= 0)
 852                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 853                         r = -errno;
 854
 855         *saved_stdin = safe_close(*saved_stdin);
 856         *saved_stdout = safe_close(*saved_stdout);
 857
 858         return r;
 859 }
 860
 861 enum {
 862         CONFIRM_PRETEND_FAILURE = -1,
 863         CONFIRM_PRETEND_SUCCESS =  0,
 864         CONFIRM_EXECUTE = 1,
 865 };
 866
 867 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 868         int saved_stdout = -1, saved_stdin = -1, r;
 869         _cleanup_free_ char *e = NULL;
 870         char c;
 871
 872         /* For any internal errors, assume a positive response. */
 873         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 874         if (r < 0) {
 875                 write_confirm_error(r, vc, u);
 876                 return CONFIRM_EXECUTE;
 877         }
 878
 879         /* confirm_spawn might have been disabled while we were sleeping. */
 880         if (manager_is_confirm_spawn_disabled(u->manager)) {
 881                 r = 1;
 882                 goto restore_stdio;
 883         }
 884
 885         e = ellipsize(cmdline, 60, 100);
 886         if (!e) {
 887                 log_oom();
 888                 r = CONFIRM_EXECUTE;
 889                 goto restore_stdio;
 890         }
 891
 892         for (;;) {
 893                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 894                 if (r < 0) {
 895                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 896                         r = CONFIRM_EXECUTE;
 897                         goto restore_stdio;
 898                 }
 899
 900                 switch (c) {
 901                 case 'c':
 902                         printf("Resuming normal execution.\n");
 903                         manager_disable_confirm_spawn();
 904                         r = 1;
 905                         break;
 906                 case 'D':
 907                         unit_dump(u, stdout, "  ");
 908                         continue; /* ask again */
 909                 case 'f':
 910                         printf("Failing execution.\n");
 911                         r = CONFIRM_PRETEND_FAILURE;
 912                         break;
 913                 case 'h':
 914                         printf("  c - continue, proceed without asking anymore\n"
 915                                "  D - dump, show the state of the unit\n"
 916                                "  f - fail, don't execute the command and pretend it failed\n"
 917                                "  h - help\n"
 918                                "  i - info, show a short summary of the unit\n"
 919                                "  j - jobs, show jobs that are in progress\n"
 920                                "  s - skip, don't execute the command and pretend it succeeded\n"
 921                                "  y - yes, execute the command\n");
 922                         continue; /* ask again */
 923                 case 'i':
 924                         printf("  Description: %s\n"
 925                                "  Unit:        %s\n"
 926                                "  Command:     %s\n",
 927                                u->id, u->description, cmdline);
 928                         continue; /* ask again */
 929                 case 'j':
 930                         manager_dump_jobs(u->manager, stdout, "  ");
 931                         continue; /* ask again */
 932                 case 'n':
 933                         /* 'n' was removed in favor of 'f'. */
 934                         printf("Didn't understand 'n', did you mean 'f'?\n");
 935                         continue; /* ask again */
 936                 case 's':
 937                         printf("Skipping execution.\n");
 938                         r = CONFIRM_PRETEND_SUCCESS;
 939                         break;
 940                 case 'y':
 941                         r = CONFIRM_EXECUTE;
 942                         break;
 943                 default:
 944                         assert_not_reached();
 945                 }
 946                 break;
 947         }
 948
 949 restore_stdio:
 950         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 951         return r;
 952 }
 953
 954 static int get_fixed_user(const ExecContext *c, const char **user,
 955                           uid_t *uid, gid_t *gid,
 956                           const char **home, const char **shell) {
 957         int r;
 958         const char *name;
 959
 960         assert(c);
 961
 962         if (!c->user)
 963                 return 0;
 964
 965         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 966          * (i.e. are "/" or "/bin/nologin"). */
 967
 968         name = c->user;
 969         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 970         if (r < 0)
 971                 return r;
 972
 973         *user = name;
 974         return 0;
 975 }
 976
 977 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 978         int r;
 979         const char *name;
 980
 981         assert(c);
 982
 983         if (!c->group)
 984                 return 0;
 985
 986         name = c->group;
 987         r = get_group_creds(&name, gid, 0);
 988         if (r < 0)
 989                 return r;
 990
 991         *group = name;
 992         return 0;
 993 }
 994
 995 static int get_supplementary_groups(const ExecContext *c, const char *user,
 996                                     const char *group, gid_t gid,
 997                                     gid_t **supplementary_gids, int *ngids) {
 998         char **i;
 999         int r, k = 0;
1000         int ngroups_max;
1001         bool keep_groups = false;
1002         gid_t *groups = NULL;
1003         _cleanup_free_ gid_t *l_gids = NULL;
1004
1005         assert(c);
1006
1007         /*
1008          * If user is given, then lookup GID and supplementary groups list.
1009          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1010          * here and as early as possible so we keep the list of supplementary
1011          * groups of the caller.
1012          */
1013         if (user && gid_is_valid(gid) && gid != 0) {
1014                 /* First step, initialize groups from /etc/groups */
1015                 if (initgroups(user, gid) < 0)
1016                         return -errno;
1017
1018                 keep_groups = true;
1019         }
1020
1021         if (strv_isempty(c->supplementary_groups))
1022                 return 0;
1023
1024         /*
1025          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1026          * be positive, otherwise fail.
1027          */
1028         errno = 0;
1029         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1030         if (ngroups_max <= 0)
1031                 return errno_or_else(EOPNOTSUPP);
1032
1033         l_gids = new(gid_t, ngroups_max);
1034         if (!l_gids)
1035                 return -ENOMEM;
1036
1037         if (keep_groups) {
1038                 /*
1039                  * Lookup the list of groups that the user belongs to, we
1040                  * avoid NSS lookups here too for gid=0.
1041                  */
1042                 k = ngroups_max;
1043                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1044                         return -EINVAL;
1045         } else
1046                 k = 0;
1047
1048         STRV_FOREACH(i, c->supplementary_groups) {
1049                 const char *g;
1050
1051                 if (k >= ngroups_max)
1052                         return -E2BIG;
1053
1054                 g = *i;
1055                 r = get_group_creds(&g, l_gids+k, 0);
1056                 if (r < 0)
1057                         return r;
1058
1059                 k++;
1060         }
1061
1062         /*
1063          * Sets ngids to zero to drop all supplementary groups, happens
1064          * when we are under root and SupplementaryGroups= is empty.
1065          */
1066         if (k == 0) {
1067                 *ngids = 0;
1068                 return 0;
1069         }
1070
1071         /* Otherwise get the final list of supplementary groups */
1072         groups = memdup(l_gids, sizeof(gid_t) * k);
1073         if (!groups)
1074                 return -ENOMEM;
1075
1076         *supplementary_gids = groups;
1077         *ngids = k;
1078
1079         groups = NULL;
1080
1081         return 0;
1082 }
1083
/* Applies the group credentials to the calling process.
 *
 * If ngids is positive the supplementary group list is installed via maybe_setgroups(). Afterwards, if gid
 * is valid, the real, effective and saved GID are all set to it.
 *
 * Returns 0 on success, the negative error from maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1102
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in 'bits'
 * is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written in that case), 1 if the
 * securebits were changed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1116
1117 static int enforce_user(const ExecContext *context, uid_t uid) {
1118         assert(context);
1119         int r;
1120
1121         if (!uid_is_valid(uid))
1122                 return 0;
1123
1124         /* Sets (but doesn't look up) the uid and make sure we keep the
1125          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1126          * required, so we also need keep-caps in this case.
1127          */
1128
1129         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1130
1131                 /* First step: If we need to keep capabilities but
1132                  * drop privileges we need to make sure we keep our
1133                  * caps, while we drop privileges. */
1134                 if (uid != 0) {
1135                         /* Add KEEP_CAPS to the securebits */
1136                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1137                         if (r < 0)
1138                                 return r;
1139                 }
1140         }
1141
1142         /* Second step: actually set the uids */
1143         if (setresuid(uid, uid, uid) < 0)
1144                 return -errno;
1145
1146         /* At this point we should have all necessary capabilities but
1147            are otherwise a normal user. However, the caps might got
1148            corrupted due to the setresuid() so we need clean them up
1149            later. This is done outside of this call. */
1150
1151         return 0;
1152 }
1153
1154 #if HAVE_PAM
1155
1156 static int null_conv(
1157                 int num_msg,
1158                 const struct pam_message **msg,
1159                 struct pam_response **resp,
1160                 void *appdata_ptr) {
1161
1162         /* We don't support conversations */
1163
1164         return PAM_CONV_ERR;
1165 }
1166
1167 #endif
1168
/* Opens a PAM session for the process we are about to execute and forks off a helper process, named
 * "(sd-pam)", that cleans the session up again later.
 *
 *   name       passed to pam_start() as the service name
 *   user       passed to pam_start() as the user name
 *   uid, gid   credentials the helper process switches to before it starts waiting
 *   tty        if set, registered as PAM_TTY; if NULL we try to derive a TTY name from stdin
 *   env        in/out: each entry is handed to PAM via pam_putenv(); on success *env is replaced by the
 *              list returned by pam_getenvlist()
 *   fds/n_fds  file descriptors that are closed in the helper process
 *
 * On success the result of strv_free_and_replace() is returned. On failure a negative errno-style value
 * is returned; PAM-level failures are all reported as -EPERM. When built without PAM support this
 * function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not touch a handle that was never set up. */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* There is no child that would wait for SIGTERM, hence put the caller's signal mask
                 * back instead of leaving the signal blocked. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Unlike most calls, sigwait() reports failure by returning a
                                 * positive error number and does not set errno, hence look at the
                                 * return value itself. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment block PAM gave us to the caller, releasing the previous one. */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1392
/* Renames the calling process after the last component of 'path', wrapped in parentheses.
 *
 * The name passed to rename_process() is at most 10 characters long: if the base name exceeds 8 characters
 * only its last 8 are used. If the base name is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        const char *base;
        char buf[11];
        size_t len;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        /* Layout: '(' + up to 8 name bytes + ')' + NUL, i.e. 11 bytes at most. */
        buf[0] = '(';
        memcpy(buf + 1, base, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1423
1424 static bool context_has_address_families(const ExecContext *c) {
1425         assert(c);
1426
1427         return c->address_families_allow_list ||
1428                 !set_isempty(c->address_families);
1429 }
1430
1431 static bool context_has_syscall_filters(const ExecContext *c) {
1432         assert(c);
1433
1434         return c->syscall_allow_list ||
1435                 !hashmap_isempty(c->syscall_filter);
1436 }
1437
1438 static bool context_has_syscall_logs(const ExecContext *c) {
1439         assert(c);
1440
1441         return c->syscall_log_allow_list ||
1442                 !hashmap_isempty(c->syscall_log);
1443 }
1444
1445 static bool context_has_no_new_privileges(const ExecContext *c) {
1446         assert(c);
1447
1448         if (c->no_new_privileges)
1449                 return true;
1450
1451         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1452                 return false;
1453
1454         /* We need NNP if we have any form of seccomp and are unprivileged */
1455         return c->lock_personality ||
1456                 c->memory_deny_write_execute ||
1457                 c->private_devices ||
1458                 c->protect_clock ||
1459                 c->protect_hostname ||
1460                 c->protect_kernel_tunables ||
1461                 c->protect_kernel_modules ||
1462                 c->protect_kernel_logs ||
1463                 context_has_address_families(c) ||
1464                 exec_context_restrict_namespaces_set(c) ||
1465                 c->restrict_realtime ||
1466                 c->restrict_suid_sgid ||
1467                 !set_isempty(c->syscall_archs) ||
1468                 context_has_syscall_filters(c) ||
1469                 context_has_syscall_logs(c);
1470 }
1471
1472 static bool exec_context_has_credentials(const ExecContext *context) {
1473
1474         assert(context);
1475
1476         return !hashmap_isempty(context->set_credentials) ||
1477                 !hashmap_isempty(context->load_credentials);
1478 }
1479
1480 #if HAVE_SECCOMP
1481
1482 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1483
1484         if (is_seccomp_available())
1485                 return false;
1486
1487         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1488         return true;
1489 }
1490
1491 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1492         uint32_t negative_action, default_action, action;
1493         int r;
1494
1495         assert(u);
1496         assert(c);
1497
1498         if (!context_has_syscall_filters(c))
1499                 return 0;
1500
1501         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1502                 return 0;
1503
1504         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1505
1506         if (c->syscall_allow_list) {
1507                 default_action = negative_action;
1508                 action = SCMP_ACT_ALLOW;
1509         } else {
1510                 default_action = SCMP_ACT_ALLOW;
1511                 action = negative_action;
1512         }
1513
1514         if (needs_ambient_hack) {
1515                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1516                 if (r < 0)
1517                         return r;
1518         }
1519
1520         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1521 }
1522
1523 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1524 #ifdef SCMP_ACT_LOG
1525         uint32_t default_action, action;
1526 #endif
1527
1528         assert(u);
1529         assert(c);
1530
1531         if (!context_has_syscall_logs(c))
1532                 return 0;
1533
1534 #ifdef SCMP_ACT_LOG
1535         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1536                 return 0;
1537
1538         if (c->syscall_log_allow_list) {
1539                 /* Log nothing but the ones listed */
1540                 default_action = SCMP_ACT_ALLOW;
1541                 action = SCMP_ACT_LOG;
1542         } else {
1543                 /* Log everything but the ones listed */
1544                 default_action = SCMP_ACT_LOG;
1545                 action = SCMP_ACT_ALLOW;
1546         }
1547
1548         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1549 #else
1550         /* old libseccomp */
1551         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1552         return 0;
1553 #endif
1554 }
1555
1556 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1557         assert(u);
1558         assert(c);
1559
1560         if (set_isempty(c->syscall_archs))
1561                 return 0;
1562
1563         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1564                 return 0;
1565
1566         return seccomp_restrict_archs(c->syscall_archs);
1567 }
1568
1569 static int apply_address_families(const Unit* u, const ExecContext *c) {
1570         assert(u);
1571         assert(c);
1572
1573         if (!context_has_address_families(c))
1574                 return 0;
1575
1576         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1577                 return 0;
1578
1579         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1580 }
1581
1582 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1583         assert(u);
1584         assert(c);
1585
1586         if (!c->memory_deny_write_execute)
1587                 return 0;
1588
1589         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1590                 return 0;
1591
1592         return seccomp_memory_deny_write_execute();
1593 }
1594
1595 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1596         assert(u);
1597         assert(c);
1598
1599         if (!c->restrict_realtime)
1600                 return 0;
1601
1602         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1603                 return 0;
1604
1605         return seccomp_restrict_realtime();
1606 }
1607
1608 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1609         assert(u);
1610         assert(c);
1611
1612         if (!c->restrict_suid_sgid)
1613                 return 0;
1614
1615         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1616                 return 0;
1617
1618         return seccomp_restrict_suid_sgid();
1619 }
1620
1621 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1622         assert(u);
1623         assert(c);
1624
1625         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1626          * let's protect even those systems where this is left on in the kernel. */
1627
1628         if (!c->protect_kernel_tunables)
1629                 return 0;
1630
1631         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1632                 return 0;
1633
1634         return seccomp_protect_sysctl();
1635 }
1636
1637 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1638         assert(u);
1639         assert(c);
1640
1641         /* Turn off module syscalls on ProtectKernelModules=yes */
1642
1643         if (!c->protect_kernel_modules)
1644                 return 0;
1645
1646         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1647                 return 0;
1648
1649         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1650 }
1651
1652 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1653         assert(u);
1654         assert(c);
1655
1656         if (!c->protect_kernel_logs)
1657                 return 0;
1658
1659         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1660                 return 0;
1661
1662         return seccomp_protect_syslog();
1663 }
1664
1665 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1666         assert(u);
1667         assert(c);
1668
1669         if (!c->protect_clock)
1670                 return 0;
1671
1672         if (skip_seccomp_unavailable(u, "ProtectClock="))
1673                 return 0;
1674
1675         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1676 }
1677
1678 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1679         assert(u);
1680         assert(c);
1681
1682         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1683
1684         if (!c->private_devices)
1685                 return 0;
1686
1687         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1688                 return 0;
1689
1690         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1691 }
1692
1693 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1694         assert(u);
1695         assert(c);
1696
1697         if (!exec_context_restrict_namespaces_set(c))
1698                 return 0;
1699
1700         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1701                 return 0;
1702
1703         return seccomp_restrict_namespaces(c->restrict_namespaces);
1704 }
1705
1706 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1707         unsigned long personality;
1708         int r;
1709
1710         assert(u);
1711         assert(c);
1712
1713         if (!c->lock_personality)
1714                 return 0;
1715
1716         if (skip_seccomp_unavailable(u, "LockPersonality="))
1717                 return 0;
1718
1719         personality = c->personality;
1720
1721         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1722         if (personality == PERSONALITY_INVALID) {
1723
1724                 r = opinionated_personality(&personality);
1725                 if (r < 0)
1726                         return r;
1727         }
1728
1729         return seccomp_lock_personality(personality);
1730 }
1731
1732 #endif
1733
1734 #if HAVE_LIBBPF
1735 static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
1736         assert(u);
1737         assert(u->manager);
1738
1739         if (lsm_bpf_supported())
1740                 return false;
1741
1742         /* lsm_bpf_setup succeeded */
1743         if (u->manager->restrict_fs)
1744                 return false;
1745
1746         log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1747         return true;
1748 }
1749
1750 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1751         assert(u);
1752         assert(c);
1753
1754         if (!exec_context_restrict_filesystems_set(c))
1755                 return 0;
1756
1757         if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1758                 return 0;
1759
1760         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1761 }
1762 #endif
1763
1764 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1765         assert(u);
1766         assert(c);
1767
1768         if (!c->protect_hostname)
1769                 return 0;
1770
1771         if (ns_type_supported(NAMESPACE_UTS)) {
1772                 if (unshare(CLONE_NEWUTS) < 0) {
1773                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1774                                 *ret_exit_status = EXIT_NAMESPACE;
1775                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1776                         }
1777
1778                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1779                 }
1780         } else
1781                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1782
1783 #if HAVE_SECCOMP
1784         int r;
1785
1786         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1787                 return 0;
1788
1789         r = seccomp_protect_hostname();
1790         if (r < 0) {
1791                 *ret_exit_status = EXIT_SECCOMP;
1792                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1793         }
1794 #endif
1795
1796         return 0;
1797 }
1798
1799 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1800         assert(idle_pipe);
1801
1802         idle_pipe[1] = safe_close(idle_pipe[1]);
1803         idle_pipe[2] = safe_close(idle_pipe[2]);
1804
1805         if (idle_pipe[0] >= 0) {
1806                 int r;
1807
1808                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1809
1810                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1811                         ssize_t n;
1812
1813                         /* Signal systemd that we are bored and want to continue. */
1814                         n = write(idle_pipe[3], "x", 1);
1815                         if (n > 0)
1816                                 /* Wait for systemd to react to the signal above. */
1817                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1818                 }
1819
1820                 idle_pipe[0] = safe_close(idle_pipe[0]);
1821
1822         }
1823
1824         idle_pipe[3] = safe_close(idle_pipe[3]);
1825 }
1826
1827 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1828
1829 static int build_environment(
1830                 const Unit *u,
1831                 const ExecContext *c,
1832                 const ExecParameters *p,
1833                 size_t n_fds,
1834                 const char *home,
1835                 const char *username,
1836                 const char *shell,
1837                 dev_t journal_stream_dev,
1838                 ino_t journal_stream_ino,
1839                 char ***ret) {
1840
1841         _cleanup_strv_free_ char **our_env = NULL;
1842         size_t n_env = 0;
1843         char *x;
1844
1845         assert(u);
1846         assert(c);
1847         assert(p);
1848         assert(ret);
1849
1850 #define N_ENV_VARS 17
1851         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1852         if (!our_env)
1853                 return -ENOMEM;
1854
1855         if (n_fds > 0) {
1856                 _cleanup_free_ char *joined = NULL;
1857
1858                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1859                         return -ENOMEM;
1860                 our_env[n_env++] = x;
1861
1862                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1863                         return -ENOMEM;
1864                 our_env[n_env++] = x;
1865
1866                 joined = strv_join(p->fd_names, ":");
1867                 if (!joined)
1868                         return -ENOMEM;
1869
1870                 x = strjoin("LISTEN_FDNAMES=", joined);
1871                 if (!x)
1872                         return -ENOMEM;
1873                 our_env[n_env++] = x;
1874         }
1875
1876         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1877                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1878                         return -ENOMEM;
1879                 our_env[n_env++] = x;
1880
1881                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1882                         return -ENOMEM;
1883                 our_env[n_env++] = x;
1884         }
1885
1886         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1887          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1888          * check the database directly. */
1889         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1890                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1891                 if (!x)
1892                         return -ENOMEM;
1893                 our_env[n_env++] = x;
1894         }
1895
1896         if (home) {
1897                 x = strjoin("HOME=", home);
1898                 if (!x)
1899                         return -ENOMEM;
1900
1901                 path_simplify(x + 5);
1902                 our_env[n_env++] = x;
1903         }
1904
1905         if (username) {
1906                 x = strjoin("LOGNAME=", username);
1907                 if (!x)
1908                         return -ENOMEM;
1909                 our_env[n_env++] = x;
1910
1911                 x = strjoin("USER=", username);
1912                 if (!x)
1913                         return -ENOMEM;
1914                 our_env[n_env++] = x;
1915         }
1916
1917         if (shell) {
1918                 x = strjoin("SHELL=", shell);
1919                 if (!x)
1920                         return -ENOMEM;
1921
1922                 path_simplify(x + 6);
1923                 our_env[n_env++] = x;
1924         }
1925
1926         if (!sd_id128_is_null(u->invocation_id)) {
1927                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1928                         return -ENOMEM;
1929
1930                 our_env[n_env++] = x;
1931         }
1932
1933         if (exec_context_needs_term(c)) {
1934                 const char *tty_path, *term = NULL;
1935
1936                 tty_path = exec_context_tty_path(c);
1937
1938                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1939                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1940                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1941
1942                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1943                         term = getenv("TERM");
1944
1945                 if (!term)
1946                         term = default_term_for_tty(tty_path);
1947
1948                 x = strjoin("TERM=", term);
1949                 if (!x)
1950                         return -ENOMEM;
1951                 our_env[n_env++] = x;
1952         }
1953
1954         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1955                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1956                         return -ENOMEM;
1957
1958                 our_env[n_env++] = x;
1959         }
1960
1961         if (c->log_namespace) {
1962                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1963                 if (!x)
1964                         return -ENOMEM;
1965
1966                 our_env[n_env++] = x;
1967         }
1968
1969         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1970                 _cleanup_free_ char *joined = NULL;
1971                 const char *n;
1972
1973                 if (!p->prefix[t])
1974                         continue;
1975
1976                 if (c->directories[t].n_items == 0)
1977                         continue;
1978
1979                 n = exec_directory_env_name_to_string(t);
1980                 if (!n)
1981                         continue;
1982
1983                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1984                         _cleanup_free_ char *prefixed = NULL;
1985
1986                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1987                         if (!prefixed)
1988                                 return -ENOMEM;
1989
1990                         if (!strextend_with_separator(&joined, ":", prefixed))
1991                                 return -ENOMEM;
1992                 }
1993
1994                 x = strjoin(n, "=", joined);
1995                 if (!x)
1996                         return -ENOMEM;
1997
1998                 our_env[n_env++] = x;
1999         }
2000
2001         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2002                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2003                 if (!x)
2004                         return -ENOMEM;
2005
2006                 our_env[n_env++] = x;
2007         }
2008
2009         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2010                 return -ENOMEM;
2011
2012         our_env[n_env++] = x;
2013
2014         our_env[n_env++] = NULL;
2015         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2016 #undef N_ENV_VARS
2017
2018         *ret = TAKE_PTR(our_env);
2019
2020         return 0;
2021 }
2022
2023 static int build_pass_environment(const ExecContext *c, char ***ret) {
2024         _cleanup_strv_free_ char **pass_env = NULL;
2025         size_t n_env = 0;
2026         char **i;
2027
2028         STRV_FOREACH(i, c->pass_environment) {
2029                 _cleanup_free_ char *x = NULL;
2030                 char *v;
2031
2032                 v = getenv(*i);
2033                 if (!v)
2034                         continue;
2035                 x = strjoin(*i, "=", v);
2036                 if (!x)
2037                         return -ENOMEM;
2038
2039                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2040                         return -ENOMEM;
2041
2042                 pass_env[n_env++] = TAKE_PTR(x);
2043                 pass_env[n_env] = NULL;
2044         }
2045
2046         *ret = TAKE_PTR(pass_env);
2047
2048         return 0;
2049 }
2050
2051 bool exec_needs_mount_namespace(
2052                 const ExecContext *context,
2053                 const ExecParameters *params,
2054                 const ExecRuntime *runtime) {
2055
2056         assert(context);
2057
2058         if (context->root_image)
2059                 return true;
2060
2061         if (!strv_isempty(context->read_write_paths) ||
2062             !strv_isempty(context->read_only_paths) ||
2063             !strv_isempty(context->inaccessible_paths) ||
2064             !strv_isempty(context->exec_paths) ||
2065             !strv_isempty(context->no_exec_paths))
2066                 return true;
2067
2068         if (context->n_bind_mounts > 0)
2069                 return true;
2070
2071         if (context->n_temporary_filesystems > 0)
2072                 return true;
2073
2074         if (context->n_mount_images > 0)
2075                 return true;
2076
2077         if (context->n_extension_images > 0)
2078                 return true;
2079
2080         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2081                 return true;
2082
2083         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2084                 return true;
2085
2086         if (context->private_devices ||
2087             context->private_mounts ||
2088             context->protect_system != PROTECT_SYSTEM_NO ||
2089             context->protect_home != PROTECT_HOME_NO ||
2090             context->protect_kernel_tunables ||
2091             context->protect_kernel_modules ||
2092             context->protect_kernel_logs ||
2093             context->protect_control_groups ||
2094             context->protect_proc != PROTECT_PROC_DEFAULT ||
2095             context->proc_subset != PROC_SUBSET_ALL ||
2096             context->private_ipc ||
2097             context->ipc_namespace_path)
2098                 return true;
2099
2100         if (context->root_directory) {
2101                 if (exec_context_get_effective_mount_apivfs(context))
2102                         return true;
2103
2104                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2105                         if (params && !params->prefix[t])
2106                                 continue;
2107
2108                         if (context->directories[t].n_items > 0)
2109                                 return true;
2110                 }
2111         }
2112
2113         if (context->dynamic_user &&
2114             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2115              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2116              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2117                 return true;
2118
2119         if (context->log_namespace)
2120                 return true;
2121
2122         return false;
2123 }
2124
2125 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2126         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2127         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2128         _cleanup_close_ int unshare_ready_fd = -1;
2129         _cleanup_(sigkill_waitp) pid_t pid = 0;
2130         uint64_t c = 1;
2131         ssize_t n;
2132         int r;
2133
2134         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2135          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2136          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2137          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2138          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2139          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2140          * continues execution normally.
2141          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2142          * does not need CAP_SETUID to write the single line mapping to itself. */
2143
2144         /* Can only set up multiple mappings with CAP_SETUID. */
2145         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2146                 r = asprintf(&uid_map,
2147                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2148                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2149                              ouid, ouid, uid, uid);
2150         else
2151                 r = asprintf(&uid_map,
2152                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2153                              ouid, ouid);
2154
2155         if (r < 0)
2156                 return -ENOMEM;
2157
2158         /* Can only set up multiple mappings with CAP_SETGID. */
2159         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2160                 r = asprintf(&gid_map,
2161                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2162                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2163                              ogid, ogid, gid, gid);
2164         else
2165                 r = asprintf(&gid_map,
2166                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2167                              ogid, ogid);
2168
2169         if (r < 0)
2170                 return -ENOMEM;
2171
2172         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2173          * namespace. */
2174         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2175         if (unshare_ready_fd < 0)
2176                 return -errno;
2177
2178         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2179          * failed. */
2180         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2181                 return -errno;
2182
2183         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2184         if (r < 0)
2185                 return r;
2186         if (r == 0) {
2187                 _cleanup_close_ int fd = -1;
2188                 const char *a;
2189                 pid_t ppid;
2190
2191                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2192                  * here, after the parent opened its own user namespace. */
2193
2194                 ppid = getppid();
2195                 errno_pipe[0] = safe_close(errno_pipe[0]);
2196
2197                 /* Wait until the parent unshared the user namespace */
2198                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2199                         r = -errno;
2200                         goto child_fail;
2201                 }
2202
2203                 /* Disable the setgroups() system call in the child user namespace, for good. */
2204                 a = procfs_file_alloca(ppid, "setgroups");
2205                 fd = open(a, O_WRONLY|O_CLOEXEC);
2206                 if (fd < 0) {
2207                         if (errno != ENOENT) {
2208                                 r = -errno;
2209                                 goto child_fail;
2210                         }
2211
2212                         /* If the file is missing the kernel is too old, let's continue anyway. */
2213                 } else {
2214                         if (write(fd, "deny\n", 5) < 0) {
2215                                 r = -errno;
2216                                 goto child_fail;
2217                         }
2218
2219                         fd = safe_close(fd);
2220                 }
2221
2222                 /* First write the GID map */
2223                 a = procfs_file_alloca(ppid, "gid_map");
2224                 fd = open(a, O_WRONLY|O_CLOEXEC);
2225                 if (fd < 0) {
2226                         r = -errno;
2227                         goto child_fail;
2228                 }
2229                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2230                         r = -errno;
2231                         goto child_fail;
2232                 }
2233                 fd = safe_close(fd);
2234
2235                 /* The write the UID map */
2236                 a = procfs_file_alloca(ppid, "uid_map");
2237                 fd = open(a, O_WRONLY|O_CLOEXEC);
2238                 if (fd < 0) {
2239                         r = -errno;
2240                         goto child_fail;
2241                 }
2242                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2243                         r = -errno;
2244                         goto child_fail;
2245                 }
2246
2247                 _exit(EXIT_SUCCESS);
2248
2249         child_fail:
2250                 (void) write(errno_pipe[1], &r, sizeof(r));
2251                 _exit(EXIT_FAILURE);
2252         }
2253
2254         errno_pipe[1] = safe_close(errno_pipe[1]);
2255
2256         if (unshare(CLONE_NEWUSER) < 0)
2257                 return -errno;
2258
2259         /* Let the child know that the namespace is ready now */
2260         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2261                 return -errno;
2262
2263         /* Try to read an error code from the child */
2264         n = read(errno_pipe[0], &r, sizeof(r));
2265         if (n < 0)
2266                 return -errno;
2267         if (n == sizeof(r)) { /* an error code was sent to us */
2268                 if (r < 0)
2269                         return r;
2270                 return -EIO;
2271         }
2272         if (n != 0) /* on success we should have read 0 bytes */
2273                 return -EIO;
2274
2275         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2276         if (r < 0)
2277                 return r;
2278         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2279                 return -EIO;
2280
2281         return 0;
2282 }
2283
2284 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2285         if (!context->dynamic_user)
2286                 return false;
2287
2288         if (type == EXEC_DIRECTORY_CONFIGURATION)
2289                 return false;
2290
2291         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2292                 return false;
2293
2294         return true;
2295 }
2296
2297 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2298         _cleanup_free_ char *src_abs = NULL;
2299         char **dst;
2300         int r;
2301
2302         assert(source);
2303
2304         src_abs = path_join(root, source);
2305         if (!src_abs)
2306                 return -ENOMEM;
2307
2308         STRV_FOREACH(dst, symlinks) {
2309                 _cleanup_free_ char *dst_abs = NULL;
2310
2311                 dst_abs = path_join(root, *dst);
2312                 if (!dst_abs)
2313                         return -ENOMEM;
2314
2315                 r = mkdir_parents_label(dst_abs, 0755);
2316                 if (r < 0)
2317                         return r;
2318
2319                 r = symlink_idempotent(src_abs, dst_abs, true);
2320                 if (r < 0)
2321                         return r;
2322         }
2323
2324         return 0;
2325 }
2326
2327 static int setup_exec_directory(
2328                 const ExecContext *context,
2329                 const ExecParameters *params,
2330                 uid_t uid,
2331                 gid_t gid,
2332                 ExecDirectoryType type,
2333                 bool needs_mount_namespace,
2334                 int *exit_status) {
2335
2336         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2337                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2338                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2339                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2340                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2341                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2342         };
2343         int r;
2344
2345         assert(context);
2346         assert(params);
2347         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2348         assert(exit_status);
2349
2350         if (!params->prefix[type])
2351                 return 0;
2352
2353         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2354                 if (!uid_is_valid(uid))
2355                         uid = 0;
2356                 if (!gid_is_valid(gid))
2357                         gid = 0;
2358         }
2359
2360         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2361                 _cleanup_free_ char *p = NULL, *pp = NULL;
2362
2363                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2364                 if (!p) {
2365                         r = -ENOMEM;
2366                         goto fail;
2367                 }
2368
2369                 r = mkdir_parents_label(p, 0755);
2370                 if (r < 0)
2371                         goto fail;
2372
2373                 if (exec_directory_is_private(context, type)) {
2374                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2375                          * case we want to avoid leaving a directory around fully accessible that is owned by
2376                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2377                          * trick used by container managers to prohibit host users to get access to files of
2378                          * the same UID in containers: we place everything inside a directory that has an
2379                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2380                          * for unprivileged host code. We then use fs namespacing to make this directory
2381                          * permeable for the service itself.
2382                          *
2383                          * Specifically: for a service which wants a special directory "foo/" we first create
2384                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2385                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2386                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2387                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2388                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2389                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2390                          * for the service and making sure it only gets access to the dirs it needs but no
2391                          * others. Tricky? Yes, absolutely, but it works!
2392                          *
2393                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2394                          * to be owned by the service itself.
2395                          *
2396                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2397                          * for sharing files or sockets with other services. */
2398
2399                         pp = path_join(params->prefix[type], "private");
2400                         if (!pp) {
2401                                 r = -ENOMEM;
2402                                 goto fail;
2403                         }
2404
2405                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2406                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2407                         if (r < 0)
2408                                 goto fail;
2409
2410                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2411                                 r = -ENOMEM;
2412                                 goto fail;
2413                         }
2414
2415                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2416                         r = mkdir_parents_label(pp, 0755);
2417                         if (r < 0)
2418                                 goto fail;
2419
2420                         if (is_dir(p, false) > 0 &&
2421                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2422
2423                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2424                                  * it over. Most likely the service has been upgraded from one that didn't use
2425                                  * DynamicUser=1, to one that does. */
2426
2427                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2428                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2429                                          exec_directory_type_to_string(type), p, pp);
2430
2431                                 if (rename(p, pp) < 0) {
2432                                         r = -errno;
2433                                         goto fail;
2434                                 }
2435                         } else {
2436                                 /* Otherwise, create the actual directory for the service */
2437
2438                                 r = mkdir_label(pp, context->directories[type].mode);
2439                                 if (r < 0 && r != -EEXIST)
2440                                         goto fail;
2441                         }
2442
2443                         /* And link it up from the original place. Note that if a mount namespace is going to be
2444                          * used, then this symlink remains on the host, and a new one for the child namespace will
2445                          * be created later. */
2446                         r = symlink_idempotent(pp, p, true);
2447                         if (r < 0)
2448                                 goto fail;
2449
2450                 } else {
2451                         _cleanup_free_ char *target = NULL;
2452
2453                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2454                             readlink_and_make_absolute(p, &target) >= 0) {
2455                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2456
2457                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2458                                  * by DynamicUser=1 (see above)?
2459                                  *
2460                                  * We do this for all directory types except for ConfigurationDirectory=,
2461                                  * since they all support the private/ symlink logic at least in some
2462                                  * configurations, see above. */
2463
2464                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2465                                 if (r < 0)
2466                                         goto fail;
2467
2468                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2469                                 if (!q) {
2470                                         r = -ENOMEM;
2471                                         goto fail;
2472                                 }
2473
2474                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2475                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2476                                 if (r < 0)
2477                                         goto fail;
2478
2479                                 if (path_equal(q_resolved, target_resolved)) {
2480
2481                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2482                                          * but is no longer. Let's move the directory back up. */
2483
2484                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2485                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2486                                                  exec_directory_type_to_string(type), q, p);
2487
2488                                         if (unlink(p) < 0) {
2489                                                 r = -errno;
2490                                                 goto fail;
2491                                         }
2492
2493                                         if (rename(q, p) < 0) {
2494                                                 r = -errno;
2495                                                 goto fail;
2496                                         }
2497                                 }
2498                         }
2499
2500                         r = mkdir_label(p, context->directories[type].mode);
2501                         if (r < 0) {
2502                                 if (r != -EEXIST)
2503                                         goto fail;
2504
2505                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2506                                         struct stat st;
2507
2508                                         /* Don't change the owner/access mode of the configuration directory,
2509                                          * as in the common case it is not written to by a service, and shall
2510                                          * not be writable. */
2511
2512                                         if (stat(p, &st) < 0) {
2513                                                 r = -errno;
2514                                                 goto fail;
2515                                         }
2516
2517                                         /* Still complain if the access mode doesn't match */
2518                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2519                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2520                                                             "(File system: %o %sMode: %o)",
2521                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2522                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2523
2524                                         continue;
2525                                 }
2526                         }
2527                 }
2528
2529                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2530                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2531                  * current UID/GID ownership.) */
2532                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2533                 if (r < 0)
2534                         goto fail;
2535
2536                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2537                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2538                  * assignments to exist. */
2539                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2540                 if (r < 0)
2541                         goto fail;
2542         }
2543
2544         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2545          * they are set up later, to allow configuring empty var/run/etc. */
2546         if (!needs_mount_namespace)
2547                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2548                         r = create_many_symlinks(params->prefix[type],
2549                                                  context->directories[type].items[i].path,
2550                                                  context->directories[type].items[i].symlinks);
2551                         if (r < 0)
2552                                 goto fail;
2553                 }
2554
2555         return 0;
2556
2557 fail:
2558         *exit_status = exit_status_table[type];
2559         return r;
2560 }
2561
2562 static int write_credential(
2563                 int dfd,
2564                 const char *id,
2565                 const void *data,
2566                 size_t size,
2567                 uid_t uid,
2568                 bool ownership_ok) {
2569
2570         _cleanup_(unlink_and_freep) char *tmp = NULL;
2571         _cleanup_close_ int fd = -1;
2572         int r;
2573
2574         r = tempfn_random_child("", "cred", &tmp);
2575         if (r < 0)
2576                 return r;
2577
2578         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2579         if (fd < 0) {
2580                 tmp = mfree(tmp);
2581                 return -errno;
2582         }
2583
2584         r = loop_write(fd, data, size, /* do_poll = */ false);
2585         if (r < 0)
2586                 return r;
2587
2588         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2589                 return -errno;
2590
2591         if (uid_is_valid(uid) && uid != getuid()) {
2592                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2593                 if (r < 0) {
2594                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2595                                 return r;
2596
2597                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2598                                             * to express: that the user gets read access and nothing
2599                                             * else. But if the backing fs can't support that (e.g. ramfs)
2600                                             * then we can use file ownership instead. But that's only safe if
2601                                             * we can then re-mount the whole thing read-only, so that the
2602                                             * user can no longer chmod() the file to gain write access. */
2603                                 return r;
2604
2605                         if (fchown(fd, uid, GID_INVALID) < 0)
2606                                 return -errno;
2607                 }
2608         }
2609
2610         if (renameat(dfd, tmp, dfd, id) < 0)
2611                 return -errno;
2612
2613         tmp = mfree(tmp);
2614         return 0;
2615 }
2616
2617 static int load_credential(
2618                 const ExecContext *context,
2619                 const ExecParameters *params,
2620                 ExecLoadCredential *lc,
2621                 const char *unit,
2622                 int read_dfd,
2623                 int write_dfd,
2624                 uid_t uid,
2625                 bool ownership_ok,
2626                 uint64_t *left) {
2627
2628         assert(context);
2629         assert(lc);
2630         assert(unit);
2631         assert(write_dfd >= 0);
2632         assert(left);
2633
2634         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2635         _cleanup_(erase_and_freep) char *data = NULL;
2636         _cleanup_free_ char *j = NULL, *bindname = NULL;
2637         bool missing_ok = true;
2638         const char *source;
2639         size_t size, add;
2640         int r;
2641
2642         if (path_is_absolute(lc->path) || read_dfd >= 0) {
2643                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2644                 source = lc->path;
2645                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2646
2647                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2648                  * via the source socket address in case we read off an AF_UNIX socket. */
2649                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
2650                         return -ENOMEM;
2651
2652                 missing_ok = false;
2653
2654         } else if (params->received_credentials) {
2655                 /* If this is a relative path, take it relative to the credentials we received
2656                  * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2657                  * on a credential store, i.e. this is guaranteed to be regular files. */
2658                 j = path_join(params->received_credentials, lc->path);
2659                 if (!j)
2660                         return -ENOMEM;
2661
2662                 source = j;
2663         } else
2664                 source = NULL;
2665
2666         if (source)
2667                 r = read_full_file_full(
2668                                 read_dfd, source,
2669                                 UINT64_MAX,
2670                                 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2671                                 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2672                                 bindname,
2673                                 &data, &size);
2674         else
2675                 r = -ENOENT;
2676
2677         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
2678                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2679                  * will get clear errors if we don't pass such a missing credential on as they
2680                  * themselves will get ENOENT when trying to read them, which should not be much
2681                  * worse than when we handle the error here and make it fatal.
2682                  *
2683                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2684                  * we are fine, too. */
2685                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
2686                 return 0;
2687         }
2688         if (r < 0)
2689                 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2690
2691         if (lc->encrypted) {
2692                 _cleanup_free_ void *plaintext = NULL;
2693                 size_t plaintext_size = 0;
2694
2695                 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2696                 if (r < 0)
2697                         return r;
2698
2699                 free_and_replace(data, plaintext);
2700                 size = plaintext_size;
2701         }
2702
2703         add = strlen(lc->id) + size;
2704         if (add > *left)
2705                 return -E2BIG;
2706
2707         r = write_credential(write_dfd, lc->id, data, size, uid, ownership_ok);
2708         if (r < 0)
2709                 return r;
2710
2711         *left -= add;
2712         return 0;
2713 }
2714
2715 struct load_cred_args {
2716         Set *seen_creds;
2717
2718         const ExecContext *context;
2719         const ExecParameters *params;
2720         ExecLoadCredential *parent_local_credential;
2721         const char *unit;
2722         int dfd;
2723         uid_t uid;
2724         bool ownership_ok;
2725         uint64_t *left;
2726 };
2727
2728 static int load_cred_recurse_dir_cb(
2729                 RecurseDirEvent event,
2730                 const char *path,
2731                 int dir_fd,
2732                 int inode_fd,
2733                 const struct dirent *de,
2734                 const struct statx *sx,
2735                 void *userdata) {
2736
2737         _cleanup_free_ char *credname = NULL, *sub_id = NULL;
2738         struct load_cred_args *args = userdata;
2739         int r;
2740
2741         if (event != RECURSE_DIR_ENTRY)
2742                 return RECURSE_DIR_CONTINUE;
2743
2744         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2745                 return RECURSE_DIR_CONTINUE;
2746
2747         credname = strreplace(path, "/", "_");
2748         if (!credname)
2749                 return -ENOMEM;
2750
2751         sub_id = strjoin(args->parent_local_credential->id, "_", credname);
2752         if (!sub_id)
2753                 return -ENOMEM;
2754
2755         if (!credential_name_valid(sub_id))
2756                 return -EINVAL;
2757
2758         if (set_contains(args->seen_creds, sub_id)) {
2759                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2760                 return RECURSE_DIR_CONTINUE;
2761         }
2762
2763         r = set_put_strdup(&args->seen_creds, sub_id);
2764         if (r < 0)
2765                 return r;
2766
2767         r = load_credential(args->context, args->params,
2768                 &(ExecLoadCredential) {
2769                         .id = sub_id,
2770                         .path = (char *) de->d_name,
2771                         .encrypted = args->parent_local_credential->encrypted,
2772                 }, args->unit, dir_fd, args->dfd, args->uid, args->ownership_ok, args->left);
2773         if (r < 0)
2774                 return r;
2775
2776         return RECURSE_DIR_CONTINUE;
2777 }
2778
2779 static int acquire_credentials(
2780                 const ExecContext *context,
2781                 const ExecParameters *params,
2782                 const char *unit,
2783                 const char *p,
2784                 uid_t uid,
2785                 bool ownership_ok) {
2786
2787         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2788         _cleanup_close_ int dfd = -1;
2789         _cleanup_set_free_ Set *seen_creds = NULL;
2790         ExecLoadCredential *lc;
2791         ExecSetCredential *sc;
2792         int r;
2793
2794         assert(context);
2795         assert(p);
2796
2797         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2798         if (dfd < 0)
2799                 return -errno;
2800
2801         seen_creds = set_new(&string_hash_ops_free);
2802         if (!seen_creds)
2803                 return -ENOMEM;
2804
2805         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2806         HASHMAP_FOREACH(lc, context->load_credentials) {
2807                 _cleanup_close_ int sub_fd = -1;
2808
2809                 /* Skip over credentials with unspecified paths. These are received by the
2810                  * service manager via the $CREDENTIALS_DIRECTORY environment variable. */
2811                 if (!is_path(lc->path) && streq(lc->id, lc->path))
2812                         continue;
2813
2814                 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2815                 if (sub_fd < 0 && errno != ENOTDIR)
2816                         return -errno;
2817
2818                 if (sub_fd < 0) {
2819                         r = set_put_strdup(&seen_creds, lc->id);
2820                         if (r < 0)
2821                                 return r;
2822                         r = load_credential(context, params, lc, unit, -1, dfd, uid, ownership_ok, &left);
2823                         if (r < 0)
2824                                 return r;
2825
2826                 } else {
2827                         r = recurse_dir(
2828                                         sub_fd,
2829                                         /* path= */ "",
2830                                         /* statx_mask= */ 0,
2831                                         /* n_depth_max= */ UINT_MAX,
2832                                         RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2833                                         load_cred_recurse_dir_cb,
2834                                         &(struct load_cred_args) {
2835                                                 .seen_creds = seen_creds,
2836                                                 .context = context,
2837                                                 .params = params,
2838                                                 .parent_local_credential = lc,
2839                                                 .unit = unit,
2840                                                 .dfd = dfd,
2841                                                 .uid = uid,
2842                                                 .ownership_ok = ownership_ok,
2843                                                 .left = &left,
2844                                         });
2845                         if (r < 0)
2846                                 return r;
2847                 }
2848         }
2849
2850         /* First we use the literally specified credentials. Note that they might be overridden again below,
2851          * and thus act as a "default" if the same credential is specified multiple times */
2852         HASHMAP_FOREACH(sc, context->set_credentials) {
2853                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2854                 const char *data;
2855                 size_t size, add;
2856
2857                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2858                         continue;
2859                 if (errno != ENOENT)
2860                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2861
2862                 if (sc->encrypted) {
2863                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2864                         if (r < 0)
2865                                 return r;
2866
2867                         data = plaintext;
2868                 } else {
2869                         data = sc->data;
2870                         size = sc->size;
2871                 }
2872
2873                 add = strlen(sc->id) + size;
2874                 if (add > left)
2875                         return -E2BIG;
2876
2877                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2878                 if (r < 0)
2879                         return r;
2880
2881
2882                 left -= add;
2883         }
2884
2885         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2886                 return -errno;
2887
2888         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2889          * accessible */
2890
2891         if (uid_is_valid(uid) && uid != getuid()) {
2892                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2893                 if (r < 0) {
2894                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2895                                 return r;
2896
2897                         if (!ownership_ok)
2898                                 return r;
2899
2900                         if (fchown(dfd, uid, GID_INVALID) < 0)
2901                                 return -errno;
2902                 }
2903         }
2904
2905         return 0;
2906 }
2907
2908 static int setup_credentials_internal(
2909                 const ExecContext *context,
2910                 const ExecParameters *params,
2911                 const char *unit,
2912                 const char *final,        /* This is where the credential store shall eventually end up at */
2913                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2914                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2915                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2916                 uid_t uid) {
2917
2918         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2919                                    * if we mounted something; false if we definitely can't mount anything */
2920         bool final_mounted;
2921         const char *where;
2922
2923         assert(context);
2924         assert(final);
2925         assert(workspace);
2926
2927         if (reuse_workspace) {
2928                 r = path_is_mount_point(workspace, NULL, 0);
2929                 if (r < 0)
2930                         return r;
2931                 if (r > 0)
2932                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2933                 else
2934                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2935         } else
2936                 workspace_mounted = -1; /* ditto */
2937
2938         r = path_is_mount_point(final, NULL, 0);
2939         if (r < 0)
2940                 return r;
2941         if (r > 0) {
2942                 /* If the final place already has something mounted, we use that. If the workspace also has
2943                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2944                  * different). */
2945                 final_mounted = true;
2946
2947                 if (workspace_mounted < 0) {
2948                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2949                          * the final version to the workspace, and make it writable, so that we can make
2950                          * changes */
2951
2952                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2953                         if (r < 0)
2954                                 return r;
2955
2956                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2957                         if (r < 0)
2958                                 return r;
2959
2960                         workspace_mounted = true;
2961                 }
2962         } else
2963                 final_mounted = false;
2964
2965         if (workspace_mounted < 0) {
2966                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2967                 for (int try = 0;; try++) {
2968
2969                         if (try == 0) {
2970                                 /* Try "ramfs" first, since it's not swap backed */
2971                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2972                                 if (r >= 0) {
2973                                         workspace_mounted = true;
2974                                         break;
2975                                 }
2976
2977                         } else if (try == 1) {
2978                                 _cleanup_free_ char *opts = NULL;
2979
2980                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
2981                                         return -ENOMEM;
2982
2983                                 /* Fall back to "tmpfs" otherwise */
2984                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2985                                 if (r >= 0) {
2986                                         workspace_mounted = true;
2987                                         break;
2988                                 }
2989
2990                         } else {
2991                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2992                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2993                                 if (r < 0) {
2994                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2995                                                 return r;
2996
2997                                         if (must_mount) /* If we it's not OK to use the plain directory
2998                                                          * fallback, propagate all errors too */
2999                                                 return r;
3000
3001                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3002                                          * proceed for compat with container envs, and just use the final dir
3003                                          * as is. */
3004
3005                                         workspace_mounted = false;
3006                                         break;
3007                                 }
3008
3009                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3010                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3011                                 if (r < 0)
3012                                         return r;
3013
3014                                 workspace_mounted = true;
3015                                 break;
3016                         }
3017                 }
3018         }
3019
3020         assert(!must_mount || workspace_mounted > 0);
3021         where = workspace_mounted ? workspace : final;
3022
3023         (void) label_fix_container(where, final, 0);
3024
3025         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3026         if (r < 0)
3027                 return r;
3028
3029         if (workspace_mounted) {
3030                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3031                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3032                 if (r < 0)
3033                         return r;
3034
3035                 /* And mount it to the final place, read-only */
3036                 if (final_mounted)
3037                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3038                 else
3039                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3040                 if (r < 0)
3041                         return r;
3042         } else {
3043                 _cleanup_free_ char *parent = NULL;
3044
3045                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3046                  * open access to the top-level credential directory and the per-service directory now */
3047
3048                 parent = dirname_malloc(final);
3049                 if (!parent)
3050                         return -ENOMEM;
3051                 if (chmod(parent, 0755) < 0)
3052                         return -errno;
3053         }
3054
3055         return 0;
3056 }
3057
3058 static int setup_credentials(
3059                 const ExecContext *context,
3060                 const ExecParameters *params,
3061                 const char *unit,
3062                 uid_t uid) {
3063
3064         _cleanup_free_ char *p = NULL, *q = NULL;
3065         const char *i;
3066         int r;
3067
3068         assert(context);
3069         assert(params);
3070
3071         if (!exec_context_has_credentials(context))
3072                 return 0;
3073
3074         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3075                 return -EINVAL;
3076
3077         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3078          * and the subdir we mount over with a read-only file system readable by the service's user */
3079         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3080         if (!q)
3081                 return -ENOMEM;
3082
3083         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3084         if (r < 0 && r != -EEXIST)
3085                 return r;
3086
3087         p = path_join(q, unit);
3088         if (!p)
3089                 return -ENOMEM;
3090
3091         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3092         if (r < 0 && r != -EEXIST)
3093                 return r;
3094
3095         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3096         if (r < 0) {
3097                 _cleanup_free_ char *t = NULL, *u = NULL;
3098
3099                 /* If this is not a privilege or support issue then propagate the error */
3100                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3101                         return r;
3102
3103                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3104                  * it into place, so that users can't access half-initialized credential stores. */
3105                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3106                 if (!t)
3107                         return -ENOMEM;
3108
3109                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3110                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3111                  * after it is fully set up */
3112                 u = path_join(t, unit);
3113                 if (!u)
3114                         return -ENOMEM;
3115
3116                 FOREACH_STRING(i, t, u) {
3117                         r = mkdir_label(i, 0700);
3118                         if (r < 0 && r != -EEXIST)
3119                                 return r;
3120                 }
3121
3122                 r = setup_credentials_internal(
3123                                 context,
3124                                 params,
3125                                 unit,
3126                                 p,       /* final mount point */
3127                                 u,       /* temporary workspace to overmount */
3128                                 true,    /* reuse the workspace if it is already a mount */
3129                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3130                                 uid);
3131
3132                 (void) rmdir(u); /* remove the workspace again if we can. */
3133
3134                 if (r < 0)
3135                         return r;
3136
3137         } else if (r == 0) {
3138
3139                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3140                  * we can use the same directory for all cases, after turning off propagation. Question
3141                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3142                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3143                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3144                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3145                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3146                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3147                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3148                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3149                  * propagation on the former, and then overmount the latter.
3150                  *
3151                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3152                  * for this purpose, but there are few other candidates that work equally well for us, and
3153                  * given that the we do this in a privately namespaced short-lived single-threaded process
3154                  * that no one else sees this should be OK to do. */
3155
3156                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3157                 if (r < 0)
3158                         goto child_fail;
3159
3160                 r = setup_credentials_internal(
3161                                 context,
3162                                 params,
3163                                 unit,
3164                                 p,           /* final mount point */
3165                                 "/dev/shm",  /* temporary workspace to overmount */
3166                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3167                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3168                                 uid);
3169                 if (r < 0)
3170                         goto child_fail;
3171
3172                 _exit(EXIT_SUCCESS);
3173
3174         child_fail:
3175                 _exit(EXIT_FAILURE);
3176         }
3177
3178         return 0;
3179 }
3180
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit smack_process_label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label is read from the
 * executable fd and applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was obtained.
 *
 * Returns 0 on success, or the negative error returned by the failing helper. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                int k;

                k = mac_smack_apply_pid(0, context->smack_process_label);
                return k < 0 ? k : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;
                int k;

                /* A missing attribute or missing xattr support is not an error, we simply use the
                 * default label then. */
                k = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (k < 0 && k != -ENODATA && k != -EOPNOTSUPP)
                        return k;

                k = mac_smack_apply_pid(0, exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (k < 0)
                        return k;
        }
#endif

        return 0;
}
#endif
3212
3213 static int compile_bind_mounts(
3214                 const ExecContext *context,
3215                 const ExecParameters *params,
3216                 BindMount **ret_bind_mounts,
3217                 size_t *ret_n_bind_mounts,
3218                 char ***ret_empty_directories) {
3219
3220         _cleanup_strv_free_ char **empty_directories = NULL;
3221         BindMount *bind_mounts;
3222         size_t n, h = 0;
3223         int r;
3224
3225         assert(context);
3226         assert(params);
3227         assert(ret_bind_mounts);
3228         assert(ret_n_bind_mounts);
3229         assert(ret_empty_directories);
3230
3231         n = context->n_bind_mounts;
3232         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3233                 if (!params->prefix[t])
3234                         continue;
3235
3236                 n += context->directories[t].n_items;
3237         }
3238
3239         if (n <= 0) {
3240                 *ret_bind_mounts = NULL;
3241                 *ret_n_bind_mounts = 0;
3242                 *ret_empty_directories = NULL;
3243                 return 0;
3244         }
3245
3246         bind_mounts = new(BindMount, n);
3247         if (!bind_mounts)
3248                 return -ENOMEM;
3249
3250         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3251                 BindMount *item = context->bind_mounts + i;
3252                 char *s, *d;
3253
3254                 s = strdup(item->source);
3255                 if (!s) {
3256                         r = -ENOMEM;
3257                         goto finish;
3258                 }
3259
3260                 d = strdup(item->destination);
3261                 if (!d) {
3262                         free(s);
3263                         r = -ENOMEM;
3264                         goto finish;
3265                 }
3266
3267                 bind_mounts[h++] = (BindMount) {
3268                         .source = s,
3269                         .destination = d,
3270                         .read_only = item->read_only,
3271                         .recursive = item->recursive,
3272                         .ignore_enoent = item->ignore_enoent,
3273                 };
3274         }
3275
3276         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3277                 if (!params->prefix[t])
3278                         continue;
3279
3280                 if (context->directories[t].n_items == 0)
3281                         continue;
3282
3283                 if (exec_directory_is_private(context, t) &&
3284                     !exec_context_with_rootfs(context)) {
3285                         char *private_root;
3286
3287                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3288                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3289                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3290
3291                         private_root = path_join(params->prefix[t], "private");
3292                         if (!private_root) {
3293                                 r = -ENOMEM;
3294                                 goto finish;
3295                         }
3296
3297                         r = strv_consume(&empty_directories, private_root);
3298                         if (r < 0)
3299                                 goto finish;
3300                 }
3301
3302                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3303                         char *s, *d;
3304
3305                         if (exec_directory_is_private(context, t))
3306                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3307                         else
3308                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3309                         if (!s) {
3310                                 r = -ENOMEM;
3311                                 goto finish;
3312                         }
3313
3314                         if (exec_directory_is_private(context, t) &&
3315                             exec_context_with_rootfs(context))
3316                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3317                                  * directory is not created on the root directory. So, let's bind-mount the directory
3318                                  * on the 'non-private' place. */
3319                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3320                         else
3321                                 d = strdup(s);
3322                         if (!d) {
3323                                 free(s);
3324                                 r = -ENOMEM;
3325                                 goto finish;
3326                         }
3327
3328                         bind_mounts[h++] = (BindMount) {
3329                                 .source = s,
3330                                 .destination = d,
3331                                 .read_only = false,
3332                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3333                                 .recursive = true,
3334                                 .ignore_enoent = false,
3335                         };
3336                 }
3337         }
3338
3339         assert(h == n);
3340
3341         *ret_bind_mounts = bind_mounts;
3342         *ret_n_bind_mounts = n;
3343         *ret_empty_directories = TAKE_PTR(empty_directories);
3344
3345         return (int) n;
3346
3347 finish:
3348         bind_mount_free_many(bind_mounts, h);
3349         return r;
3350 }
3351
3352 /* ret_symlinks will contain a list of pairs src:dest that describes
3353  * the symlinks to create later on. For example, the symlinks needed
3354  * to safely give private directories to DynamicUser=1 users. */
3355 static int compile_symlinks(
3356                 const ExecContext *context,
3357                 const ExecParameters *params,
3358                 char ***ret_symlinks) {
3359
3360         _cleanup_strv_free_ char **symlinks = NULL;
3361         int r;
3362
3363         assert(context);
3364         assert(params);
3365         assert(ret_symlinks);
3366
3367         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3368                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3369                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3370                         char **symlink;
3371
3372                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3373                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3374
3375                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3376                                 dst_abs = path_join(params->prefix[dt], *symlink);
3377                                 if (!src_abs || !dst_abs)
3378                                         return -ENOMEM;
3379
3380                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3381                                 if (r < 0)
3382                                         return r;
3383                         }
3384
3385                         if (!exec_directory_is_private(context, dt))
3386                                 continue;
3387
3388                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3389                         if (!private_path)
3390                                 return -ENOMEM;
3391
3392                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3393                         if (!path)
3394                                 return -ENOMEM;
3395
3396                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3397                         if (r < 0)
3398                                 return r;
3399                 }
3400         }
3401
3402         *ret_symlinks = TAKE_PTR(symlinks);
3403
3404         return 0;
3405 }
3406
3407 static bool insist_on_sandboxing(
3408                 const ExecContext *context,
3409                 const char *root_dir,
3410                 const char *root_image,
3411                 const BindMount *bind_mounts,
3412                 size_t n_bind_mounts) {
3413
3414         assert(context);
3415         assert(n_bind_mounts == 0 || bind_mounts);
3416
3417         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3418          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3419          * rearrange stuff in a way we cannot ignore gracefully. */
3420
3421         if (context->n_temporary_filesystems > 0)
3422                 return true;
3423
3424         if (root_dir || root_image)
3425                 return true;
3426
3427         if (context->n_mount_images > 0)
3428                 return true;
3429
3430         if (context->dynamic_user)
3431                 return true;
3432
3433         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3434          * essential. */
3435         for (size_t i = 0; i < n_bind_mounts; i++)
3436                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3437                         return true;
3438
3439         if (context->log_namespace)
3440                 return true;
3441
3442         return false;
3443 }
3444
3445 static int apply_mount_namespace(
3446                 const Unit *u,
3447                 ExecCommandFlags command_flags,
3448                 const ExecContext *context,
3449                 const ExecParameters *params,
3450                 const ExecRuntime *runtime,
3451                 char **error_path) {
3452
3453         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3454         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3455         const char *root_dir = NULL, *root_image = NULL;
3456         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3457         NamespaceInfo ns_info;
3458         bool needs_sandboxing;
3459         BindMount *bind_mounts = NULL;
3460         size_t n_bind_mounts = 0;
3461         int r;
3462
3463         assert(context);
3464
3465         if (params->flags & EXEC_APPLY_CHROOT) {
3466                 root_image = context->root_image;
3467
3468                 if (!root_image)
3469                         root_dir = context->root_directory;
3470         }
3471
3472         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3473         if (r < 0)
3474                 return r;
3475
3476         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3477         r = compile_symlinks(context, params, &symlinks);
3478         if (r < 0)
3479                 return r;
3480
3481         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3482         if (needs_sandboxing) {
3483                 /* The runtime struct only contains the parent of the private /tmp,
3484                  * which is non-accessible to world users. Inside of it there's a /tmp
3485                  * that is sticky, and that's the one we want to use here.
3486                  * This does not apply when we are using /run/systemd/empty as fallback. */
3487
3488                 if (context->private_tmp && runtime) {
3489                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3490                                 tmp_dir = runtime->tmp_dir;
3491                         else if (runtime->tmp_dir)
3492                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3493
3494                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3495                                 var_tmp_dir = runtime->var_tmp_dir;
3496                         else if (runtime->var_tmp_dir)
3497                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3498                 }
3499
3500                 ns_info = (NamespaceInfo) {
3501                         .ignore_protect_paths = false,
3502                         .private_dev = context->private_devices,
3503                         .protect_control_groups = context->protect_control_groups,
3504                         .protect_kernel_tunables = context->protect_kernel_tunables,
3505                         .protect_kernel_modules = context->protect_kernel_modules,
3506                         .protect_kernel_logs = context->protect_kernel_logs,
3507                         .protect_hostname = context->protect_hostname,
3508                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3509                         .private_mounts = context->private_mounts,
3510                         .protect_home = context->protect_home,
3511                         .protect_system = context->protect_system,
3512                         .protect_proc = context->protect_proc,
3513                         .proc_subset = context->proc_subset,
3514                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3515                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3516                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3517                 };
3518         } else if (!context->dynamic_user && root_dir)
3519                 /*
3520                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3521                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3522                  * fail if we are enable to apply the sandbox inside the mount namespace.
3523                  */
3524                 ns_info = (NamespaceInfo) {
3525                         .ignore_protect_paths = true,
3526                 };
3527         else
3528                 ns_info = (NamespaceInfo) {};
3529
3530         if (context->mount_flags == MS_SHARED)
3531                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3532
3533         if (exec_context_has_credentials(context) &&
3534             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3535             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3536                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3537                 if (!creds_path) {
3538                         r = -ENOMEM;
3539                         goto finalize;
3540                 }
3541         }
3542
3543         if (MANAGER_IS_SYSTEM(u->manager)) {
3544                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3545                 if (!propagate_dir) {
3546                         r = -ENOMEM;
3547                         goto finalize;
3548                 }
3549
3550                 incoming_dir = strdup("/run/systemd/incoming");
3551                 if (!incoming_dir) {
3552                         r = -ENOMEM;
3553                         goto finalize;
3554                 }
3555         }
3556
3557         r = setup_namespace(root_dir, root_image, context->root_image_options,
3558                             &ns_info, context->read_write_paths,
3559                             needs_sandboxing ? context->read_only_paths : NULL,
3560                             needs_sandboxing ? context->inaccessible_paths : NULL,
3561                             needs_sandboxing ? context->exec_paths : NULL,
3562                             needs_sandboxing ? context->no_exec_paths : NULL,
3563                             empty_directories,
3564                             symlinks,
3565                             bind_mounts,
3566                             n_bind_mounts,
3567                             context->temporary_filesystems,
3568                             context->n_temporary_filesystems,
3569                             context->mount_images,
3570                             context->n_mount_images,
3571                             tmp_dir,
3572                             var_tmp_dir,
3573                             creds_path,
3574                             context->log_namespace,
3575                             context->mount_flags,
3576                             context->root_hash, context->root_hash_size, context->root_hash_path,
3577                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3578                             context->root_verity,
3579                             context->extension_images,
3580                             context->n_extension_images,
3581                             propagate_dir,
3582                             incoming_dir,
3583                             root_dir || root_image ? params->notify_socket : NULL,
3584                             error_path);
3585
3586         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3587          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3588          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3589          * completely different execution environment. */
3590         if (r == -ENOANO) {
3591                 if (insist_on_sandboxing(
3592                                     context,
3593                                     root_dir, root_image,
3594                                     bind_mounts,
3595                                     n_bind_mounts)) {
3596                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3597                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3598                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3599
3600                         r = -EOPNOTSUPP;
3601                 } else {
3602                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3603                         r = 0;
3604                 }
3605         }
3606
3607 finalize:
3608         bind_mount_free_many(bind_mounts, n_bind_mounts);
3609         return r;
3610 }
3611
3612 static int apply_working_directory(
3613                 const ExecContext *context,
3614                 const ExecParameters *params,
3615                 const char *home,
3616                 int *exit_status) {
3617
3618         const char *d, *wd;
3619
3620         assert(context);
3621         assert(exit_status);
3622
3623         if (context->working_directory_home) {
3624
3625                 if (!home) {
3626                         *exit_status = EXIT_CHDIR;
3627                         return -ENXIO;
3628                 }
3629
3630                 wd = home;
3631
3632         } else
3633                 wd = empty_to_root(context->working_directory);
3634
3635         if (params->flags & EXEC_APPLY_CHROOT)
3636                 d = wd;
3637         else
3638                 d = prefix_roota(context->root_directory, wd);
3639
3640         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3641                 *exit_status = EXIT_CHDIR;
3642                 return -errno;
3643         }
3644
3645         return 0;
3646 }
3647
3648 static int apply_root_directory(
3649                 const ExecContext *context,
3650                 const ExecParameters *params,
3651                 const bool needs_mount_ns,
3652                 int *exit_status) {
3653
3654         assert(context);
3655         assert(exit_status);
3656
3657         if (params->flags & EXEC_APPLY_CHROOT)
3658                 if (!needs_mount_ns && context->root_directory)
3659                         if (chroot(context->root_directory) < 0) {
3660                                 *exit_status = EXIT_CHROOT;
3661                                 return -errno;
3662                         }
3663
3664         return 0;
3665 }
3666
3667 static int setup_keyring(
3668                 const Unit *u,
3669                 const ExecContext *context,
3670                 const ExecParameters *p,
3671                 uid_t uid, gid_t gid) {
3672
3673         key_serial_t keyring;
3674         int r = 0;
3675         uid_t saved_uid;
3676         gid_t saved_gid;
3677
3678         assert(u);
3679         assert(context);
3680         assert(p);
3681
3682         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3683          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3684          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3685          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3686          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3687          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3688
3689         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3690                 return 0;
3691
3692         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3693          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3694          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3695          * & group is just as nasty as acquiring a reference to the user keyring. */
3696
3697         saved_uid = getuid();
3698         saved_gid = getgid();
3699
3700         if (gid_is_valid(gid) && gid != saved_gid) {
3701                 if (setregid(gid, -1) < 0)
3702                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3703         }
3704
3705         if (uid_is_valid(uid) && uid != saved_uid) {
3706                 if (setreuid(uid, -1) < 0) {
3707                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3708                         goto out;
3709                 }
3710         }
3711
3712         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3713         if (keyring == -1) {
3714                 if (errno == ENOSYS)
3715                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3716                 else if (ERRNO_IS_PRIVILEGE(errno))
3717                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3718                 else if (errno == EDQUOT)
3719                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3720                 else
3721                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3722
3723                 goto out;
3724         }
3725
3726         /* When requested link the user keyring into the session keyring. */
3727         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3728
3729                 if (keyctl(KEYCTL_LINK,
3730                            KEY_SPEC_USER_KEYRING,
3731                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3732                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3733                         goto out;
3734                 }
3735         }
3736
3737         /* Restore uid/gid back */
3738         if (uid_is_valid(uid) && uid != saved_uid) {
3739                 if (setreuid(saved_uid, -1) < 0) {
3740                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3741                         goto out;
3742                 }
3743         }
3744
3745         if (gid_is_valid(gid) && gid != saved_gid) {
3746                 if (setregid(saved_gid, -1) < 0)
3747                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3748         }
3749
3750         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3751         if (!sd_id128_is_null(u->invocation_id)) {
3752                 key_serial_t key;
3753
3754                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3755                 if (key == -1)
3756                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3757                 else {
3758                         if (keyctl(KEYCTL_SETPERM, key,
3759                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3760                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3761                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3762                 }
3763         }
3764
3765 out:
3766         /* Revert back uid & gid for the last time, and exit */
3767         /* no extra logging, as only the first already reported error matters */
3768         if (getuid() != saved_uid)
3769                 (void) setreuid(saved_uid, -1);
3770
3771         if (getgid() != saved_gid)
3772                 (void) setregid(saved_gid, -1);
3773
3774         return r;
3775 }
3776
/* Appends those of the two file descriptors in pair that are valid (i.e. non-negative) to array,
 * advancing the fill counter *n for each one added. The caller must make sure array has room for up to
 * two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3787
3788 static int close_remaining_fds(
3789                 const ExecParameters *params,
3790                 const ExecRuntime *runtime,
3791                 const DynamicCreds *dcreds,
3792                 int user_lookup_fd,
3793                 int socket_fd,
3794                 const int *fds, size_t n_fds) {
3795
3796         size_t n_dont_close = 0;
3797         int dont_close[n_fds + 12];
3798
3799         assert(params);
3800
3801         if (params->stdin_fd >= 0)
3802                 dont_close[n_dont_close++] = params->stdin_fd;
3803         if (params->stdout_fd >= 0)
3804                 dont_close[n_dont_close++] = params->stdout_fd;
3805         if (params->stderr_fd >= 0)
3806                 dont_close[n_dont_close++] = params->stderr_fd;
3807
3808         if (socket_fd >= 0)
3809                 dont_close[n_dont_close++] = socket_fd;
3810         if (n_fds > 0) {
3811                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3812                 n_dont_close += n_fds;
3813         }
3814
3815         if (runtime) {
3816                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3817                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3818         }
3819
3820         if (dcreds) {
3821                 if (dcreds->user)
3822                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3823                 if (dcreds->group)
3824                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3825         }
3826
3827         if (user_lookup_fd >= 0)
3828                 dont_close[n_dont_close++] = user_lookup_fd;
3829
3830         return close_all_fds(dont_close, n_dont_close);
3831 }
3832
3833 static int send_user_lookup(
3834                 Unit *unit,
3835                 int user_lookup_fd,
3836                 uid_t uid,
3837                 gid_t gid) {
3838
3839         assert(unit);
3840
3841         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3842          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3843          * specified. */
3844
3845         if (user_lookup_fd < 0)
3846                 return 0;
3847
3848         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3849                 return 0;
3850
3851         if (writev(user_lookup_fd,
3852                (struct iovec[]) {
3853                            IOVEC_INIT(&uid, sizeof(uid)),
3854                            IOVEC_INIT(&gid, sizeof(gid)),
3855                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3856                 return -errno;
3857
3858         return 0;
3859 }
3860
3861 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3862         int r;
3863
3864         assert(c);
3865         assert(home);
3866         assert(buf);
3867
3868         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3869
3870         if (*home)
3871                 return 0;
3872
3873         if (!c->working_directory_home)
3874                 return 0;
3875
3876         r = get_home_dir(buf);
3877         if (r < 0)
3878                 return r;
3879
3880         *home = *buf;
3881         return 1;
3882 }
3883
3884 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3885         _cleanup_strv_free_ char ** list = NULL;
3886         int r;
3887
3888         assert(c);
3889         assert(p);
3890         assert(ret);
3891
3892         assert(c->dynamic_user);
3893
3894         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3895          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3896          * directories. */
3897
3898         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3899                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3900                         continue;
3901
3902                 if (!p->prefix[t])
3903                         continue;
3904
3905                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3906                         char *e;
3907
3908                         if (exec_directory_is_private(c, t))
3909                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3910                         else
3911                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3912                         if (!e)
3913                                 return -ENOMEM;
3914
3915                         r = strv_consume(&list, e);
3916                         if (r < 0)
3917                                 return r;
3918                 }
3919         }
3920
3921         *ret = TAKE_PTR(list);
3922
3923         return 0;
3924 }
3925
3926 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3927         bool using_subcgroup;
3928         char *p;
3929
3930         assert(params);
3931         assert(ret);
3932
3933         if (!params->cgroup_path)
3934                 return -EINVAL;
3935
3936         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3937          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3938          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3939          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3940          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3941          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3942          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3943          * flag, which is only passed for the former statements, not for the latter. */
3944
3945         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3946         if (using_subcgroup)
3947                 p = path_join(params->cgroup_path, ".control");
3948         else
3949                 p = strdup(params->cgroup_path);
3950         if (!p)
3951                 return -ENOMEM;
3952
3953         *ret = p;
3954         return using_subcgroup;
3955 }
3956
3957 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3958         _cleanup_(cpu_set_reset) CPUSet s = {};
3959         int r;
3960
3961         assert(c);
3962         assert(ret);
3963
3964         if (!c->numa_policy.nodes.set) {
3965                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3966                 return 0;
3967         }
3968
3969         r = numa_to_cpu_set(&c->numa_policy, &s);
3970         if (r < 0)
3971                 return r;
3972
3973         cpu_set_reset(ret);
3974
3975         return cpu_set_add_all(ret, &s);
3976 }
3977
3978 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3979         assert(c);
3980
3981         return c->cpu_affinity_from_numa;
3982 }
3983
3984 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3985         int r;
3986
3987         assert(fds);
3988         assert(n_fds);
3989         assert(*n_fds < fds_size);
3990         assert(ret_fd);
3991
3992         if (fd < 0) {
3993                 *ret_fd = -1;
3994                 return 0;
3995         }
3996
3997         if (fd < 3 + (int) *n_fds) {
3998                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3999                  * the fds we pass to the process (or which are closed only during execve). */
4000
4001                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4002                 if (r < 0)
4003                         return -errno;
4004
4005                 CLOSE_AND_REPLACE(fd, r);
4006         }
4007
4008         *ret_fd = fds[*n_fds] = fd;
4009         (*n_fds) ++;
4010         return 1;
4011 }
4012
4013 static int exec_child(
4014                 Unit *unit,
4015                 const ExecCommand *command,
4016                 const ExecContext *context,
4017                 const ExecParameters *params,
4018                 ExecRuntime *runtime,
4019                 DynamicCreds *dcreds,
4020                 int socket_fd,
4021                 const int named_iofds[static 3],
4022                 int *fds,
4023                 size_t n_socket_fds,
4024                 size_t n_storage_fds,
4025                 char **files_env,
4026                 int user_lookup_fd,
4027                 int *exit_status) {
4028
4029         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4030         int r, ngids = 0, exec_fd;
4031         _cleanup_free_ gid_t *supplementary_gids = NULL;
4032         const char *username = NULL, *groupname = NULL;
4033         _cleanup_free_ char *home_buffer = NULL;
4034         const char *home = NULL, *shell = NULL;
4035         char **final_argv = NULL;
4036         dev_t journal_stream_dev = 0;
4037         ino_t journal_stream_ino = 0;
4038         bool userns_set_up = false;
4039         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4040                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4041                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4042                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4043 #if HAVE_SELINUX
4044         _cleanup_free_ char *mac_selinux_context_net = NULL;
4045         bool use_selinux = false;
4046 #endif
4047 #if ENABLE_SMACK
4048         bool use_smack = false;
4049 #endif
4050 #if HAVE_APPARMOR
4051         bool use_apparmor = false;
4052 #endif
4053         uid_t saved_uid = getuid();
4054         gid_t saved_gid = getgid();
4055         uid_t uid = UID_INVALID;
4056         gid_t gid = GID_INVALID;
4057         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4058                n_keep_fds; /* total number of fds not to close */
4059         int secure_bits;
4060         _cleanup_free_ gid_t *gids_after_pam = NULL;
4061         int ngids_after_pam = 0;
4062
4063         assert(unit);
4064         assert(command);
4065         assert(context);
4066         assert(params);
4067         assert(exit_status);
4068
4069         rename_process_from_path(command->path);
4070
4071         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4072          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4073          * both of which will be demoted to SIG_DFL. */
4074         (void) default_signals(SIGNALS_CRASH_HANDLER,
4075                                SIGNALS_IGNORE);
4076
4077         if (context->ignore_sigpipe)
4078                 (void) ignore_signals(SIGPIPE);
4079
4080         r = reset_signal_mask();
4081         if (r < 0) {
4082                 *exit_status = EXIT_SIGNAL_MASK;
4083                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4084         }
4085
4086         if (params->idle_pipe)
4087                 do_idle_pipe_dance(params->idle_pipe);
4088
4089         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4090          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4091          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4092          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4093
4094         log_forget_fds();
4095         log_set_open_when_needed(true);
4096
4097         /* In case anything used libc syslog(), close this here, too */
4098         closelog();
4099
4100         int keep_fds[n_fds + 3];
4101         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4102         n_keep_fds = n_fds;
4103
4104         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4105         if (r < 0) {
4106                 *exit_status = EXIT_FDS;
4107                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4108         }
4109
4110 #if HAVE_LIBBPF
4111         if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
4112                 int bpf_map_fd = -1;
4113
4114                 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4115                 if (bpf_map_fd < 0) {
4116                         *exit_status = EXIT_FDS;
4117                         return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
4118                 }
4119
4120                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4121                 if (r < 0) {
4122                         *exit_status = EXIT_FDS;
4123                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4124                 }
4125         }
4126 #endif
4127
4128         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4129         if (r < 0) {
4130                 *exit_status = EXIT_FDS;
4131                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4132         }
4133
4134         if (!context->same_pgrp &&
4135             setsid() < 0) {
4136                 *exit_status = EXIT_SETSID;
4137                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4138         }
4139
4140         exec_context_tty_reset(context, params);
4141
4142         if (unit_shall_confirm_spawn(unit)) {
4143                 _cleanup_free_ char *cmdline = NULL;
4144
4145                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4146                 if (!cmdline) {
4147                         *exit_status = EXIT_MEMORY;
4148                         return log_oom();
4149                 }
4150
4151                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4152                 if (r != CONFIRM_EXECUTE) {
4153                         if (r == CONFIRM_PRETEND_SUCCESS) {
4154                                 *exit_status = EXIT_SUCCESS;
4155                                 return 0;
4156                         }
4157                         *exit_status = EXIT_CONFIRM;
4158                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4159                                                     "Execution cancelled by the user");
4160                 }
4161         }
4162
4163         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4164          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4165          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4166          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4167          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4168         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4169             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4170                 *exit_status = EXIT_MEMORY;
4171                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4172         }
4173
4174         if (context->dynamic_user && dcreds) {
4175                 _cleanup_strv_free_ char **suggested_paths = NULL;
4176
4177                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4178                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4179                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4180                         *exit_status = EXIT_USER;
4181                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4182                 }
4183
4184                 r = compile_suggested_paths(context, params, &suggested_paths);
4185                 if (r < 0) {
4186                         *exit_status = EXIT_MEMORY;
4187                         return log_oom();
4188                 }
4189
4190                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4191                 if (r < 0) {
4192                         *exit_status = EXIT_USER;
4193                         if (r == -EILSEQ)
4194                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4195                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4196                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4197                 }
4198
4199                 if (!uid_is_valid(uid)) {
4200                         *exit_status = EXIT_USER;
4201                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4202                 }
4203
4204                 if (!gid_is_valid(gid)) {
4205                         *exit_status = EXIT_USER;
4206                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4207                 }
4208
4209                 if (dcreds->user)
4210                         username = dcreds->user->name;
4211
4212         } else {
4213                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4214                 if (r < 0) {
4215                         *exit_status = EXIT_USER;
4216                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4217                 }
4218
4219                 r = get_fixed_group(context, &groupname, &gid);
4220                 if (r < 0) {
4221                         *exit_status = EXIT_GROUP;
4222                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4223                 }
4224         }
4225
4226         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4227         r = get_supplementary_groups(context, username, groupname, gid,
4228                                      &supplementary_gids, &ngids);
4229         if (r < 0) {
4230                 *exit_status = EXIT_GROUP;
4231                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4232         }
4233
4234         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4235         if (r < 0) {
4236                 *exit_status = EXIT_USER;
4237                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4238         }
4239
4240         user_lookup_fd = safe_close(user_lookup_fd);
4241
4242         r = acquire_home(context, uid, &home, &home_buffer);
4243         if (r < 0) {
4244                 *exit_status = EXIT_CHDIR;
4245                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4246         }
4247
        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must be sure to drop O_NONBLOCK */
4250         if (socket_fd >= 0)
4251                 (void) fd_nonblock(socket_fd, false);
4252
4253         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4254          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4255         if (params->cgroup_path) {
4256                 _cleanup_free_ char *p = NULL;
4257
4258                 r = exec_parameters_get_cgroup_path(params, &p);
4259                 if (r < 0) {
4260                         *exit_status = EXIT_CGROUP;
4261                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4262                 }
4263
4264                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4265                 if (r < 0) {
4266                         *exit_status = EXIT_CGROUP;
4267                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4268                 }
4269         }
4270
4271         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4272                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4273                 if (r < 0) {
4274                         *exit_status = EXIT_NETWORK;
4275                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4276                 }
4277         }
4278
4279         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4280                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4281                 if (r < 0) {
4282                         *exit_status = EXIT_NAMESPACE;
4283                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4284                 }
4285         }
4286
4287         r = setup_input(context, params, socket_fd, named_iofds);
4288         if (r < 0) {
4289                 *exit_status = EXIT_STDIN;
4290                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4291         }
4292
4293         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4294         if (r < 0) {
4295                 *exit_status = EXIT_STDOUT;
4296                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4297         }
4298
4299         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4300         if (r < 0) {
4301                 *exit_status = EXIT_STDERR;
4302                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4303         }
4304
4305         if (context->oom_score_adjust_set) {
4306                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4307                  * prohibit write access to this file, and we shouldn't trip up over that. */
4308                 r = set_oom_score_adjust(context->oom_score_adjust);
4309                 if (ERRNO_IS_PRIVILEGE(r))
4310                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4311                 else if (r < 0) {
4312                         *exit_status = EXIT_OOM_ADJUST;
4313                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4314                 }
4315         }
4316
4317         if (context->coredump_filter_set) {
4318                 r = set_coredump_filter(context->coredump_filter);
4319                 if (ERRNO_IS_PRIVILEGE(r))
4320                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4321                 else if (r < 0)
4322                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4323         }
4324
4325         if (context->nice_set) {
4326                 r = setpriority_closest(context->nice);
4327                 if (r < 0)
4328                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4329         }
4330
4331         if (context->cpu_sched_set) {
4332                 struct sched_param param = {
4333                         .sched_priority = context->cpu_sched_priority,
4334                 };
4335
4336                 r = sched_setscheduler(0,
4337                                        context->cpu_sched_policy |
4338                                        (context->cpu_sched_reset_on_fork ?
4339                                         SCHED_RESET_ON_FORK : 0),
4340                                        &param);
4341                 if (r < 0) {
4342                         *exit_status = EXIT_SETSCHEDULER;
4343                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4344                 }
4345         }
4346
4347         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4348                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4349                 const CPUSet *cpu_set;
4350
4351                 if (context->cpu_affinity_from_numa) {
4352                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4353                         if (r < 0) {
4354                                 *exit_status = EXIT_CPUAFFINITY;
4355                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4356                         }
4357
4358                         cpu_set = &converted_cpu_set;
4359                 } else
4360                         cpu_set = &context->cpu_set;
4361
4362                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4363                         *exit_status = EXIT_CPUAFFINITY;
4364                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4365                 }
4366         }
4367
4368         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4369                 r = apply_numa_policy(&context->numa_policy);
4370                 if (r == -EOPNOTSUPP)
4371                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4372                 else if (r < 0) {
4373                         *exit_status = EXIT_NUMA_POLICY;
4374                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4375                 }
4376         }
4377
4378         if (context->ioprio_set)
4379                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4380                         *exit_status = EXIT_IOPRIO;
4381                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4382                 }
4383
4384         if (context->timer_slack_nsec != NSEC_INFINITY)
4385                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4386                         *exit_status = EXIT_TIMERSLACK;
4387                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4388                 }
4389
4390         if (context->personality != PERSONALITY_INVALID) {
4391                 r = safe_personality(context->personality);
4392                 if (r < 0) {
4393                         *exit_status = EXIT_PERSONALITY;
4394                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4395                 }
4396         }
4397
4398         if (context->utmp_id) {
4399                 const char *line = context->tty_path ?
4400                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4401                         NULL;
4402                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4403                                       line,
4404                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4405                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4406                                       USER_PROCESS,
4407                                       username);
4408         }
4409
4410         if (uid_is_valid(uid)) {
4411                 r = chown_terminal(STDIN_FILENO, uid);
4412                 if (r < 0) {
4413                         *exit_status = EXIT_STDIN;
4414                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4415                 }
4416         }
4417
4418         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4419          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4420          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4421          * touch a single hierarchy too. */
4422         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4423                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4424                 if (r < 0) {
4425                         *exit_status = EXIT_CGROUP;
4426                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4427                 }
4428         }
4429
4430         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4431
4432         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4433                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4434                 if (r < 0)
4435                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4436         }
4437
4438         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4439                 r = setup_credentials(context, params, unit->id, uid);
4440                 if (r < 0) {
4441                         *exit_status = EXIT_CREDENTIALS;
4442                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4443                 }
4444         }
4445
4446         r = build_environment(
4447                         unit,
4448                         context,
4449                         params,
4450                         n_fds,
4451                         home,
4452                         username,
4453                         shell,
4454                         journal_stream_dev,
4455                         journal_stream_ino,
4456                         &our_env);
4457         if (r < 0) {
4458                 *exit_status = EXIT_MEMORY;
4459                 return log_oom();
4460         }
4461
4462         r = build_pass_environment(context, &pass_env);
4463         if (r < 0) {
4464                 *exit_status = EXIT_MEMORY;
4465                 return log_oom();
4466         }
4467
4468         /* The PATH variable is set to the default path in params->environment.
4469          * However, this is overridden if user specified fields have PATH set.
4470          * The intention is to also override PATH if the user does
4471          * not specify PATH and the user has specified ExecSearchPath
4472          */
4473
4474         if (!strv_isempty(context->exec_search_path)) {
4475                 _cleanup_free_ char *joined = NULL;
4476
4477                 joined = strv_join(context->exec_search_path, ":");
4478                 if (!joined) {
4479                         *exit_status = EXIT_MEMORY;
4480                         return log_oom();
4481                 }
4482
4483                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4484                 if (r < 0) {
4485                         *exit_status = EXIT_MEMORY;
4486                         return log_oom();
4487                 }
4488         }
4489
4490         accum_env = strv_env_merge(params->environment,
4491                                    our_env,
4492                                    joined_exec_search_path,
4493                                    pass_env,
4494                                    context->environment,
4495                                    files_env);
4496         if (!accum_env) {
4497                 *exit_status = EXIT_MEMORY;
4498                 return log_oom();
4499         }
4500         accum_env = strv_env_clean(accum_env);
4501
4502         (void) umask(context->umask);
4503
4504         r = setup_keyring(unit, context, params, uid, gid);
4505         if (r < 0) {
4506                 *exit_status = EXIT_KEYRING;
4507                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4508         }
4509
4510         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4511         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4512
4513         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4514         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4515
4516         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4517         if (needs_ambient_hack)
4518                 needs_setuid = false;
4519         else
4520                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4521
4522         if (needs_sandboxing) {
4523                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4524                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4525                  * impacting our own code paths. */
4526
4527 #if HAVE_SELINUX
4528                 use_selinux = mac_selinux_use();
4529 #endif
4530 #if ENABLE_SMACK
4531                 use_smack = mac_smack_use();
4532 #endif
4533 #if HAVE_APPARMOR
4534                 use_apparmor = mac_apparmor_use();
4535 #endif
4536         }
4537
4538         if (needs_sandboxing) {
4539                 int which_failed;
4540
4541                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4542                  * is set here. (See below.) */
4543
4544                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4545                 if (r < 0) {
4546                         *exit_status = EXIT_LIMITS;
4547                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4548                 }
4549         }
4550
4551         if (needs_setuid && context->pam_name && username) {
                /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
                 * wins here. (See above.) */
4554
4555                 /* All fds passed in the fds array will be closed in the pam child process. */
4556                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4557                 if (r < 0) {
4558                         *exit_status = EXIT_PAM;
4559                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4560                 }
4561
4562                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4563                 if (ngids_after_pam < 0) {
4564                         *exit_status = EXIT_MEMORY;
4565                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4566                 }
4567         }
4568
4569         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4570                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4571                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4572                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4573
4574                 userns_set_up = true;
4575                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4576                 if (r < 0) {
4577                         *exit_status = EXIT_USER;
4578                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4579                 }
4580         }
4581
4582         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4583
4584                 if (ns_type_supported(NAMESPACE_NET)) {
4585                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4586                         if (r == -EPERM)
4587                                 log_unit_warning_errno(unit, r,
4588                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4589                         else if (r < 0) {
4590                                 *exit_status = EXIT_NETWORK;
4591                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4592                         }
4593                 } else if (context->network_namespace_path) {
4594                         *exit_status = EXIT_NETWORK;
4595                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4596                                                     "NetworkNamespacePath= is not supported, refusing.");
4597                 } else
4598                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4599         }
4600
4601         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4602
4603                 if (ns_type_supported(NAMESPACE_IPC)) {
4604                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4605                         if (r == -EPERM)
4606                                 log_unit_warning_errno(unit, r,
4607                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4608                         else if (r < 0) {
4609                                 *exit_status = EXIT_NAMESPACE;
4610                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4611                         }
4612                 } else if (context->ipc_namespace_path) {
4613                         *exit_status = EXIT_NAMESPACE;
4614                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4615                                                     "IPCNamespacePath= is not supported, refusing.");
4616                 } else
4617                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4618         }
4619
4620         if (needs_mount_namespace) {
4621                 _cleanup_free_ char *error_path = NULL;
4622
4623                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4624                 if (r < 0) {
4625                         *exit_status = EXIT_NAMESPACE;
4626                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4627                                                     error_path ? ": " : "", strempty(error_path));
4628                 }
4629         }
4630
4631         if (needs_sandboxing) {
4632                 r = apply_protect_hostname(unit, context, exit_status);
4633                 if (r < 0)
4634                         return r;
4635         }
4636
4637         /* Drop groups as early as possible.
4638          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4639          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4640         if (needs_setuid) {
4641                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4642                 int ngids_to_enforce = 0;
4643
4644                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4645                                                    ngids,
4646                                                    gids_after_pam,
4647                                                    ngids_after_pam,
4648                                                    &gids_to_enforce);
4649                 if (ngids_to_enforce < 0) {
4650                         *exit_status = EXIT_MEMORY;
4651                         return log_unit_error_errno(unit,
4652                                                     ngids_to_enforce,
4653                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4654                 }
4655
4656                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4657                 if (r < 0) {
4658                         *exit_status = EXIT_GROUP;
4659                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4660                 }
4661         }
4662
4663         /* If the user namespace was not set up above, try to do it now.
4664          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4665          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4666          * case of mount namespaces being less privileged when the mount point list is copied from a
4667          * different user namespace). */
4668
4669         if (needs_sandboxing && context->private_users && !userns_set_up) {
4670                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4671                 if (r < 0) {
4672                         *exit_status = EXIT_USER;
4673                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4674                 }
4675         }
4676
4677         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4678          * shall execute. */
4679
4680         _cleanup_free_ char *executable = NULL;
4681         _cleanup_close_ int executable_fd = -1;
4682         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4683         if (r < 0) {
4684                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4685                         log_unit_struct_errno(unit, LOG_INFO, r,
4686                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4687                                               LOG_UNIT_INVOCATION_ID(unit),
4688                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4689                                                                command->path),
4690                                               "EXECUTABLE=%s", command->path);
4691                         return 0;
4692                 }
4693
4694                 *exit_status = EXIT_EXEC;
4695
4696                 return log_unit_struct_errno(unit, LOG_INFO, r,
4697                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4698                                              LOG_UNIT_INVOCATION_ID(unit),
4699                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4700                                                               command->path),
4701                                              "EXECUTABLE=%s", command->path);
4702         }
4703
4704         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4705         if (r < 0) {
4706                 *exit_status = EXIT_FDS;
4707                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4708         }
4709
4710 #if HAVE_SELINUX
4711         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4712                 int fd = -1;
4713
4714                 if (socket_fd >= 0)
4715                         fd = socket_fd;
4716                 else if (params->n_socket_fds == 1)
4717                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4718                          * use context from that fd to compute the label. */
4719                         fd = params->fds[0];
4720
4721                 if (fd >= 0) {
4722                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4723                         if (r < 0) {
4724                                 if (!context->selinux_context_ignore) {
4725                                         *exit_status = EXIT_SELINUX_CONTEXT;
4726                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4727                                 }
4728                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4729                         }
4730                 }
4731         }
4732 #endif
4733
4734         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4735          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4736          * however if we have it as we want to keep it open until the final execve(). */
4737
4738         r = close_all_fds(keep_fds, n_keep_fds);
4739         if (r >= 0)
4740                 r = shift_fds(fds, n_fds);
4741         if (r >= 0)
4742                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4743         if (r < 0) {
4744                 *exit_status = EXIT_FDS;
4745                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4746         }
4747
4748         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4749          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4750          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4751          * came this far. */
4752
4753         secure_bits = context->secure_bits;
4754
4755         if (needs_sandboxing) {
4756                 uint64_t bset;
4757
4758                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4759                  * requested. (Note this is placed after the general resource limit initialization, see
4760                  * above, in order to take precedence.) */
4761                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4762                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4763                                 *exit_status = EXIT_LIMITS;
4764                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4765                         }
4766                 }
4767
4768 #if ENABLE_SMACK
4769                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4770                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4771                 if (use_smack) {
4772                         r = setup_smack(context, executable_fd);
4773                         if (r < 0 && !context->smack_process_label_ignore) {
4774                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4775                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4776                         }
4777                 }
4778 #endif
4779
4780                 bset = context->capability_bounding_set;
4781                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4782                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4783                  * instead of us doing that */
4784                 if (needs_ambient_hack)
4785                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4786                                 (UINT64_C(1) << CAP_SETUID) |
4787                                 (UINT64_C(1) << CAP_SETGID);
4788
4789                 if (!cap_test_all(bset)) {
4790                         r = capability_bounding_set_drop(bset, false);
4791                         if (r < 0) {
4792                                 *exit_status = EXIT_CAPABILITIES;
4793                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4794                         }
4795                 }
4796
4797                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4798                  * keep-caps set.
4799                  * To be able to raise the ambient capabilities after setresuid() they have to be
4800                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4801                  * After setresuid() the ambient capabilities can be raised as they are present in
4802                  * the permitted and inheritable set. However it is possible that someone wants to
4803                  * set ambient capabilities without changing the user, so we also set the ambient
4804                  * capabilities here.
4805                  * The requested ambient capabilities are raised in the inheritable set if the
4806                  * second argument is true. */
4807                 if (!needs_ambient_hack) {
4808                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4809                         if (r < 0) {
4810                                 *exit_status = EXIT_CAPABILITIES;
4811                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4812                         }
4813                 }
4814         }
4815
4816         /* chroot to root directory first, before we lose the ability to chroot */
4817         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4818         if (r < 0)
4819                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4820
4821         if (needs_setuid) {
4822                 if (uid_is_valid(uid)) {
4823                         r = enforce_user(context, uid);
4824                         if (r < 0) {
4825                                 *exit_status = EXIT_USER;
4826                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4827                         }
4828
4829                         if (!needs_ambient_hack &&
4830                             context->capability_ambient_set != 0) {
4831
4832                                 /* Raise the ambient capabilities after user change. */
4833                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4834                                 if (r < 0) {
4835                                         *exit_status = EXIT_CAPABILITIES;
4836                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4837                                 }
4838                         }
4839                 }
4840         }
4841
4842         /* Apply working directory here, because the working directory might be on NFS and only the user running
4843          * this service might have the correct privilege to change to the working directory */
4844         r = apply_working_directory(context, params, home, exit_status);
4845         if (r < 0)
4846                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4847
4848         if (needs_sandboxing) {
4849                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4850                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4851                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4852                  * are restricted. */
4853
4854 #if HAVE_SELINUX
4855                 if (use_selinux) {
4856                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4857
4858                         if (exec_context) {
4859                                 r = setexeccon(exec_context);
4860                                 if (r < 0) {
4861                                         if (!context->selinux_context_ignore) {
4862                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4863                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4864                                         }
4865                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4866                                 }
4867                         }
4868                 }
4869 #endif
4870
4871 #if HAVE_APPARMOR
4872                 if (use_apparmor && context->apparmor_profile) {
4873                         r = aa_change_onexec(context->apparmor_profile);
4874                         if (r < 0 && !context->apparmor_profile_ignore) {
4875                                 *exit_status = EXIT_APPARMOR_PROFILE;
4876                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4877                         }
4878                 }
4879 #endif
4880
4881                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4882                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4883                  * CAP_SETPCAP. */
4884                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4885                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4886                          * effective set here.
4887                          * The effective set is overwritten during execve with the following values:
4888                          * - ambient set (for non-root processes)
4889                          * - (inheritable | bounding) set (for root processes)
4890                          *
4891                          * Hence there is no security impact to raise it in the effective set before execve
4892                          */
4893                         r = capability_gain_cap_setpcap(NULL);
4894                         if (r < 0) {
4895                                 *exit_status = EXIT_CAPABILITIES;
4896                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4897                         }
4898                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4899                                 *exit_status = EXIT_SECUREBITS;
4900                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4901                         }
4902                 }
4903
4904                 if (context_has_no_new_privileges(context))
4905                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4906                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4907                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4908                         }
4909
4910 #if HAVE_SECCOMP
4911                 r = apply_address_families(unit, context);
4912                 if (r < 0) {
4913                         *exit_status = EXIT_ADDRESS_FAMILIES;
4914                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4915                 }
4916
4917                 r = apply_memory_deny_write_execute(unit, context);
4918                 if (r < 0) {
4919                         *exit_status = EXIT_SECCOMP;
4920                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4921                 }
4922
4923                 r = apply_restrict_realtime(unit, context);
4924                 if (r < 0) {
4925                         *exit_status = EXIT_SECCOMP;
4926                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4927                 }
4928
4929                 r = apply_restrict_suid_sgid(unit, context);
4930                 if (r < 0) {
4931                         *exit_status = EXIT_SECCOMP;
4932                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4933                 }
4934
4935                 r = apply_restrict_namespaces(unit, context);
4936                 if (r < 0) {
4937                         *exit_status = EXIT_SECCOMP;
4938                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4939                 }
4940
4941                 r = apply_protect_sysctl(unit, context);
4942                 if (r < 0) {
4943                         *exit_status = EXIT_SECCOMP;
4944                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4945                 }
4946
4947                 r = apply_protect_kernel_modules(unit, context);
4948                 if (r < 0) {
4949                         *exit_status = EXIT_SECCOMP;
4950                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4951                 }
4952
4953                 r = apply_protect_kernel_logs(unit, context);
4954                 if (r < 0) {
4955                         *exit_status = EXIT_SECCOMP;
4956                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4957                 }
4958
4959                 r = apply_protect_clock(unit, context);
4960                 if (r < 0) {
4961                         *exit_status = EXIT_SECCOMP;
4962                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4963                 }
4964
4965                 r = apply_private_devices(unit, context);
4966                 if (r < 0) {
4967                         *exit_status = EXIT_SECCOMP;
4968                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4969                 }
4970
4971                 r = apply_syscall_archs(unit, context);
4972                 if (r < 0) {
4973                         *exit_status = EXIT_SECCOMP;
4974                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4975                 }
4976
4977                 r = apply_lock_personality(unit, context);
4978                 if (r < 0) {
4979                         *exit_status = EXIT_SECCOMP;
4980                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4981                 }
4982
4983                 r = apply_syscall_log(unit, context);
4984                 if (r < 0) {
4985                         *exit_status = EXIT_SECCOMP;
4986                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4987                 }
4988
4989                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4990                  * by the filter as little as possible. */
4991                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4992                 if (r < 0) {
4993                         *exit_status = EXIT_SECCOMP;
4994                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4995                 }
4996 #endif
4997
4998 #if HAVE_LIBBPF
4999                 r = apply_restrict_filesystems(unit, context);
5000                 if (r < 0) {
5001                         *exit_status = EXIT_BPF;
5002                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5003                 }
5004 #endif
5005
5006         }
5007
5008         if (!strv_isempty(context->unset_environment)) {
5009                 char **ee = NULL;
5010
5011                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5012                 if (!ee) {
5013                         *exit_status = EXIT_MEMORY;
5014                         return log_oom();
5015                 }
5016
5017                 strv_free_and_replace(accum_env, ee);
5018         }
5019
5020         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5021                 replaced_argv = replace_env_argv(command->argv, accum_env);
5022                 if (!replaced_argv) {
5023                         *exit_status = EXIT_MEMORY;
5024                         return log_oom();
5025                 }
5026                 final_argv = replaced_argv;
5027         } else
5028                 final_argv = command->argv;
5029
5030         if (DEBUG_LOGGING) {
5031                 _cleanup_free_ char *line = NULL;
5032
5033                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5034                 if (!line) {
5035                         *exit_status = EXIT_MEMORY;
5036                         return log_oom();
5037                 }
5038
5039                 log_unit_struct(unit, LOG_DEBUG,
5040                                 "EXECUTABLE=%s", executable,
5041                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5042         }
5043
5044         if (exec_fd >= 0) {
5045                 uint8_t hot = 1;
5046
5047                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5048                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5049
5050                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5051                         *exit_status = EXIT_EXEC;
5052                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5053                 }
5054         }
5055
5056         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5057
5058         if (exec_fd >= 0) {
5059                 uint8_t hot = 0;
5060
5061                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5062                  * that POLLHUP on it no longer means execve() succeeded. */
5063
5064                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5065                         *exit_status = EXIT_EXEC;
5066                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5067                 }
5068         }
5069
5070         *exit_status = EXIT_EXEC;
5071         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5072 }
5073
5074 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); /* Forward declaration: called by exec_spawn() below to fill its files_env list; a negative return is reported there as a failure to load environment files. */
5075 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); /* Forward declaration: called by exec_spawn() below with a 3-element fd array preset to -1; a negative return is reported there as a failure to load a named file descriptor. */
5076
5077 int exec_spawn(Unit *unit,
5078                ExecCommand *command,
5079                const ExecContext *context,
5080                const ExecParameters *params,
5081                ExecRuntime *runtime,
5082                DynamicCreds *dcreds,
5083                pid_t *ret) {
5084
5085         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5086         _cleanup_free_ char *subcgroup_path = NULL;
5087         _cleanup_strv_free_ char **files_env = NULL;
5088         size_t n_storage_fds = 0, n_socket_fds = 0;
5089         _cleanup_free_ char *line = NULL;
5090         pid_t pid;
5091
5092         assert(unit);
5093         assert(command);
5094         assert(context);
5095         assert(ret);
5096         assert(params);
5097         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5098
5099         if (context->std_input == EXEC_INPUT_SOCKET ||
5100             context->std_output == EXEC_OUTPUT_SOCKET ||
5101             context->std_error == EXEC_OUTPUT_SOCKET) {
5102
5103                 if (params->n_socket_fds > 1)
5104                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5105
5106                 if (params->n_socket_fds == 0)
5107                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5108
5109                 socket_fd = params->fds[0];
5110         } else {
5111                 socket_fd = -1;
5112                 fds = params->fds;
5113                 n_socket_fds = params->n_socket_fds;
5114                 n_storage_fds = params->n_storage_fds;
5115         }
5116
5117         r = exec_context_named_iofds(context, params, named_iofds);
5118         if (r < 0)
5119                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5120
5121         r = exec_context_load_environment(unit, context, &files_env);
5122         if (r < 0)
5123                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5124
5125         line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5126         if (!line)
5127                 return log_oom();
5128
5129         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5130            and, until the next SELinux policy changes, we save further reloads in future children. */
5131         mac_selinux_maybe_reload();
5132
5133         log_unit_struct(unit, LOG_DEBUG,
5134                         LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5135                         "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5136                                                            the mount namespace in the child, but we want to log
5137                                                            from the parent, so we need to use the (possibly
5138                                                            inaccurate) path here. */
5139                         LOG_UNIT_INVOCATION_ID(unit));
5140
5141         if (params->cgroup_path) {
5142                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5143                 if (r < 0)
5144                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5145                 if (r > 0) { /* We are using a child cgroup */
5146                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5147                         if (r < 0)
5148                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5149
5150                         /* Normally we would not propagate the oomd xattrs to children but since we created this
5151                          * sub-cgroup internally we should do it. */
5152                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
5153                 }
5154         }
5155
5156         pid = fork();
5157         if (pid < 0)
5158                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5159
5160         if (pid == 0) {
5161                 int exit_status = EXIT_SUCCESS;
5162
5163                 r = exec_child(unit,
5164                                command,
5165                                context,
5166                                params,
5167                                runtime,
5168                                dcreds,
5169                                socket_fd,
5170                                named_iofds,
5171                                fds,
5172                                n_socket_fds,
5173                                n_storage_fds,
5174                                files_env,
5175                                unit->manager->user_lookup_fds[1],
5176                                &exit_status);
5177
5178                 if (r < 0) {
5179                         const char *status =
5180                                 exit_status_to_string(exit_status,
5181                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5182
5183                         log_unit_struct_errno(unit, LOG_ERR, r,
5184                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5185                                               LOG_UNIT_INVOCATION_ID(unit),
5186                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5187                                                                status, command->path),
5188                                               "EXECUTABLE=%s", command->path);
5189                 }
5190
5191                 _exit(exit_status);
5192         }
5193
5194         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5195
5196         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5197          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5198          * process will be killed too). */
5199         if (subcgroup_path)
5200                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5201
5202         exec_status_start(&command->exec_status, pid);
5203
5204         *ret = pid;
5205         return 0;
5206 }
5207
5208 void exec_context_init(ExecContext *c) {
5209         assert(c);
5210
5211         c->umask = 0022;
5212         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5213         c->cpu_sched_policy = SCHED_OTHER;
5214         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5215         c->syslog_level_prefix = true;
5216         c->ignore_sigpipe = true;
5217         c->timer_slack_nsec = NSEC_INFINITY;
5218         c->personality = PERSONALITY_INVALID;
5219         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5220                 c->directories[t].mode = 0755;
5221         c->timeout_clean_usec = USEC_INFINITY;
5222         c->capability_bounding_set = CAP_ALL;
5223         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5224         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5225         c->log_level_max = -1;
5226 #if HAVE_SECCOMP
5227         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5228 #endif
5229         c->tty_rows = UINT_MAX;
5230         c->tty_cols = UINT_MAX;
5231         numa_policy_reset(&c->numa_policy);
5232 }
5233
5234 void exec_context_done(ExecContext *c) {
5235         assert(c);
5236
5237         c->environment = strv_free(c->environment);
5238         c->environment_files = strv_free(c->environment_files);
5239         c->pass_environment = strv_free(c->pass_environment);
5240         c->unset_environment = strv_free(c->unset_environment);
5241
5242         rlimit_free_all(c->rlimit);
5243
5244         for (size_t l = 0; l < 3; l++) {
5245                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5246                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5247         }
5248
5249         c->working_directory = mfree(c->working_directory);
5250         c->root_directory = mfree(c->root_directory);
5251         c->root_image = mfree(c->root_image);
5252         c->root_image_options = mount_options_free_all(c->root_image_options);
5253         c->root_hash = mfree(c->root_hash);
5254         c->root_hash_size = 0;
5255         c->root_hash_path = mfree(c->root_hash_path);
5256         c->root_hash_sig = mfree(c->root_hash_sig);
5257         c->root_hash_sig_size = 0;
5258         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5259         c->root_verity = mfree(c->root_verity);
5260         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5261         c->tty_path = mfree(c->tty_path);
5262         c->syslog_identifier = mfree(c->syslog_identifier);
5263         c->user = mfree(c->user);
5264         c->group = mfree(c->group);
5265
5266         c->supplementary_groups = strv_free(c->supplementary_groups);
5267
5268         c->pam_name = mfree(c->pam_name);
5269
5270         c->read_only_paths = strv_free(c->read_only_paths);
5271         c->read_write_paths = strv_free(c->read_write_paths);
5272         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5273         c->exec_paths = strv_free(c->exec_paths);
5274         c->no_exec_paths = strv_free(c->no_exec_paths);
5275         c->exec_search_path = strv_free(c->exec_search_path);
5276
5277         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5278         c->bind_mounts = NULL;
5279         c->n_bind_mounts = 0;
5280         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5281         c->temporary_filesystems = NULL;
5282         c->n_temporary_filesystems = 0;
5283         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5284
5285         cpu_set_reset(&c->cpu_set);
5286         numa_policy_reset(&c->numa_policy);
5287
5288         c->utmp_id = mfree(c->utmp_id);
5289         c->selinux_context = mfree(c->selinux_context);
5290         c->apparmor_profile = mfree(c->apparmor_profile);
5291         c->smack_process_label = mfree(c->smack_process_label);
5292
5293         c->restrict_filesystems = set_free(c->restrict_filesystems);
5294
5295         c->syscall_filter = hashmap_free(c->syscall_filter);
5296         c->syscall_archs = set_free(c->syscall_archs);
5297         c->address_families = set_free(c->address_families);
5298
5299         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5300                 exec_directory_done(&c->directories[t]);
5301
5302         c->log_level_max = -1;
5303
5304         exec_context_free_log_extra_fields(c);
5305
5306         c->log_ratelimit_interval_usec = 0;
5307         c->log_ratelimit_burst = 0;
5308
5309         c->stdin_data = mfree(c->stdin_data);
5310         c->stdin_data_size = 0;
5311
5312         c->network_namespace_path = mfree(c->network_namespace_path);
5313         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5314
5315         c->log_namespace = mfree(c->log_namespace);
5316
5317         c->load_credentials = hashmap_free(c->load_credentials);
5318         c->set_credentials = hashmap_free(c->set_credentials);
5319 }
5320
5321 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5322         assert(c);
5323
5324         if (!runtime_prefix)
5325                 return 0;
5326
5327         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5328                 _cleanup_free_ char *p = NULL;
5329
5330                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5331                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5332                 else
5333                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5334                 if (!p)
5335                         return -ENOMEM;
5336
5337                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5338                  * service next. */
5339                 (void) rm_rf(p, REMOVE_ROOT);
5340
5341                 char **symlink;
5342                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5343                         _cleanup_free_ char *symlink_abs = NULL;
5344
5345                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5346                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5347                         else
5348                                 symlink_abs = path_join(runtime_prefix, *symlink);
5349                         if (!symlink_abs)
5350                                 return -ENOMEM;
5351
5352                         (void) unlink(symlink_abs);
5353                 }
5354
5355         }
5356
5357         return 0;
5358 }
5359
5360 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5361         _cleanup_free_ char *p = NULL;
5362
5363         assert(c);
5364
5365         if (!runtime_prefix || !unit)
5366                 return 0;
5367
5368         p = path_join(runtime_prefix, "credentials", unit);
5369         if (!p)
5370                 return -ENOMEM;
5371
5372         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5373          * unmount it, and afterwards remove the mount point */
5374         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5375         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5376
5377         return 0;
5378 }
5379
5380 static void exec_command_done(ExecCommand *c) {
5381         assert(c);
5382
5383         c->path = mfree(c->path);
5384         c->argv = strv_free(c->argv);
5385 }
5386
5387 void exec_command_done_array(ExecCommand *c, size_t n) {
5388         for (size_t i = 0; i < n; i++)
5389                 exec_command_done(c+i);
5390 }
5391
5392 ExecCommand* exec_command_free_list(ExecCommand *c) {
5393         ExecCommand *i;
5394
5395         while ((i = c)) {
5396                 LIST_REMOVE(command, c, i);
5397                 exec_command_done(i);
5398                 free(i);
5399         }
5400
5401         return NULL;
5402 }
5403
5404 void exec_command_free_array(ExecCommand **c, size_t n) {
5405         for (size_t i = 0; i < n; i++)
5406                 c[i] = exec_command_free_list(c[i]);
5407 }
5408
5409 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5410         for (size_t i = 0; i < n; i++)
5411                 exec_status_reset(&c[i].exec_status);
5412 }
5413
5414 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5415         for (size_t i = 0; i < n; i++) {
5416                 ExecCommand *z;
5417
5418                 LIST_FOREACH(command, z, c[i])
5419                         exec_status_reset(&z->exec_status);
5420         }
5421 }
5422
5423 typedef struct InvalidEnvInfo {
5424         const Unit *unit;
5425         const char *path;
5426 } InvalidEnvInfo;
5427
5428 static void invalid_env(const char *p, void *userdata) {
5429         InvalidEnvInfo *info = userdata;
5430
5431         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5432 }
5433
5434 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5435         assert(c);
5436
5437         switch (fd_index) {
5438
5439         case STDIN_FILENO:
5440                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5441                         return NULL;
5442
5443                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5444
5445         case STDOUT_FILENO:
5446                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5447                         return NULL;
5448
5449                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5450
5451         case STDERR_FILENO:
5452                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5453                         return NULL;
5454
5455                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5456
5457         default:
5458                 return NULL;
5459         }
5460 }
5461
5462 static int exec_context_named_iofds(
5463                 const ExecContext *c,
5464                 const ExecParameters *p,
5465                 int named_iofds[static 3]) {
5466
5467         size_t targets;
5468         const char* stdio_fdname[3];
5469         size_t n_fds;
5470
5471         assert(c);
5472         assert(p);
5473         assert(named_iofds);
5474
5475         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5476                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5477                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5478
5479         for (size_t i = 0; i < 3; i++)
5480                 stdio_fdname[i] = exec_context_fdname(c, i);
5481
5482         n_fds = p->n_storage_fds + p->n_socket_fds;
5483
5484         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5485                 if (named_iofds[STDIN_FILENO] < 0 &&
5486                     c->std_input == EXEC_INPUT_NAMED_FD &&
5487                     stdio_fdname[STDIN_FILENO] &&
5488                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5489
5490                         named_iofds[STDIN_FILENO] = p->fds[i];
5491                         targets--;
5492
5493                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5494                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5495                            stdio_fdname[STDOUT_FILENO] &&
5496                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5497
5498                         named_iofds[STDOUT_FILENO] = p->fds[i];
5499                         targets--;
5500
5501                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5502                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5503                            stdio_fdname[STDERR_FILENO] &&
5504                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5505
5506                         named_iofds[STDERR_FILENO] = p->fds[i];
5507                         targets--;
5508                 }
5509
5510         return targets == 0 ? 0 : -ENOENT;
5511 }
5512
5513 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5514         char **i, **r = NULL;
5515
5516         assert(c);
5517         assert(l);
5518
5519         STRV_FOREACH(i, c->environment_files) {
5520                 char *fn;
5521                 int k;
5522                 bool ignore = false;
5523                 char **p;
5524                 _cleanup_globfree_ glob_t pglob = {};
5525
5526                 fn = *i;
5527
5528                 if (fn[0] == '-') {
5529                         ignore = true;
5530                         fn++;
5531                 }
5532
5533                 if (!path_is_absolute(fn)) {
5534                         if (ignore)
5535                                 continue;
5536
5537                         strv_free(r);
5538                         return -EINVAL;
5539                 }
5540
5541                 /* Filename supports globbing, take all matching files */
5542                 k = safe_glob(fn, 0, &pglob);
5543                 if (k < 0) {
5544                         if (ignore)
5545                                 continue;
5546
5547                         strv_free(r);
5548                         return k;
5549                 }
5550
5551                 /* When we don't match anything, -ENOENT should be returned */
5552                 assert(pglob.gl_pathc > 0);
5553
5554                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5555                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5556                         if (k < 0) {
5557                                 if (ignore)
5558                                         continue;
5559
5560                                 strv_free(r);
5561                                 return k;
5562                         }
5563                         /* Log invalid environment variables with filename */
5564                         if (p) {
5565                                 InvalidEnvInfo info = {
5566                                         .unit = unit,
5567                                         .path = pglob.gl_pathv[n]
5568                                 };
5569
5570                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5571                         }
5572
5573                         if (!r)
5574                                 r = p;
5575                         else {
5576                                 char **m;
5577
5578                                 m = strv_env_merge(r, p);
5579                                 strv_free(r);
5580                                 strv_free(p);
5581                                 if (!m)
5582                                         return -ENOMEM;
5583
5584                                 r = m;
5585                         }
5586                 }
5587         }
5588
5589         *l = r;
5590
5591         return 0;
5592 }
5593
5594 static bool tty_may_match_dev_console(const char *tty) {
5595         _cleanup_free_ char *resolved = NULL;
5596
5597         if (!tty)
5598                 return true;
5599
5600         tty = skip_dev_prefix(tty);
5601
5602         /* trivial identity? */
5603         if (streq(tty, "console"))
5604                 return true;
5605
5606         if (resolve_dev_console(&resolved) < 0)
5607                 return true; /* if we could not resolve, assume it may */
5608
5609         /* "tty0" means the active VC, so it may be the same sometimes */
5610         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5611 }
5612
5613 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5614         assert(ec);
5615
5616         return ec->tty_reset ||
5617                 ec->tty_vhangup ||
5618                 ec->tty_vt_disallocate ||
5619                 is_terminal_input(ec->std_input) ||
5620                 is_terminal_output(ec->std_output) ||
5621                 is_terminal_output(ec->std_error);
5622 }
5623
5624 bool exec_context_may_touch_console(const ExecContext *ec) {
5625
5626         return exec_context_may_touch_tty(ec) &&
5627                tty_may_match_dev_console(exec_context_tty_path(ec));
5628 }
5629
5630 static void strv_fprintf(FILE *f, char **l) {
5631         char **g;
5632
5633         assert(f);
5634
5635         STRV_FOREACH(g, l)
5636                 fprintf(f, " %s", *g);
5637 }
5638
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f' for the given string
 * list. Nothing at all is written if the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5650
5651 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5652         char **e, **d;
5653         int r;
5654
5655         assert(c);
5656         assert(f);
5657
5658         prefix = strempty(prefix);
5659
5660         fprintf(f,
5661                 "%sUMask: %04o\n"
5662                 "%sWorkingDirectory: %s\n"
5663                 "%sRootDirectory: %s\n"
5664                 "%sNonBlocking: %s\n"
5665                 "%sPrivateTmp: %s\n"
5666                 "%sPrivateDevices: %s\n"
5667                 "%sProtectKernelTunables: %s\n"
5668                 "%sProtectKernelModules: %s\n"
5669                 "%sProtectKernelLogs: %s\n"
5670                 "%sProtectClock: %s\n"
5671                 "%sProtectControlGroups: %s\n"
5672                 "%sPrivateNetwork: %s\n"
5673                 "%sPrivateUsers: %s\n"
5674                 "%sProtectHome: %s\n"
5675                 "%sProtectSystem: %s\n"
5676                 "%sMountAPIVFS: %s\n"
5677                 "%sIgnoreSIGPIPE: %s\n"
5678                 "%sMemoryDenyWriteExecute: %s\n"
5679                 "%sRestrictRealtime: %s\n"
5680                 "%sRestrictSUIDSGID: %s\n"
5681                 "%sKeyringMode: %s\n"
5682                 "%sProtectHostname: %s\n"
5683                 "%sProtectProc: %s\n"
5684                 "%sProcSubset: %s\n",
5685                 prefix, c->umask,
5686                 prefix, empty_to_root(c->working_directory),
5687                 prefix, empty_to_root(c->root_directory),
5688                 prefix, yes_no(c->non_blocking),
5689                 prefix, yes_no(c->private_tmp),
5690                 prefix, yes_no(c->private_devices),
5691                 prefix, yes_no(c->protect_kernel_tunables),
5692                 prefix, yes_no(c->protect_kernel_modules),
5693                 prefix, yes_no(c->protect_kernel_logs),
5694                 prefix, yes_no(c->protect_clock),
5695                 prefix, yes_no(c->protect_control_groups),
5696                 prefix, yes_no(c->private_network),
5697                 prefix, yes_no(c->private_users),
5698                 prefix, protect_home_to_string(c->protect_home),
5699                 prefix, protect_system_to_string(c->protect_system),
5700                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5701                 prefix, yes_no(c->ignore_sigpipe),
5702                 prefix, yes_no(c->memory_deny_write_execute),
5703                 prefix, yes_no(c->restrict_realtime),
5704                 prefix, yes_no(c->restrict_suid_sgid),
5705                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5706                 prefix, yes_no(c->protect_hostname),
5707                 prefix, protect_proc_to_string(c->protect_proc),
5708                 prefix, proc_subset_to_string(c->proc_subset));
5709
5710         if (c->root_image)
5711                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5712
5713         if (c->root_image_options) {
5714                 MountOptions *o;
5715
5716                 fprintf(f, "%sRootImageOptions:", prefix);
5717                 LIST_FOREACH(mount_options, o, c->root_image_options)
5718                         if (!isempty(o->options))
5719                                 fprintf(f, " %s:%s",
5720                                         partition_designator_to_string(o->partition_designator),
5721                                         o->options);
5722                 fprintf(f, "\n");
5723         }
5724
5725         if (c->root_hash) {
5726                 _cleanup_free_ char *encoded = NULL;
5727                 encoded = hexmem(c->root_hash, c->root_hash_size);
5728                 if (encoded)
5729                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5730         }
5731
5732         if (c->root_hash_path)
5733                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5734
5735         if (c->root_hash_sig) {
5736                 _cleanup_free_ char *encoded = NULL;
5737                 ssize_t len;
5738                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5739                 if (len)
5740                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5741         }
5742
5743         if (c->root_hash_sig_path)
5744                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5745
5746         if (c->root_verity)
5747                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5748
5749         STRV_FOREACH(e, c->environment)
5750                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5751
5752         STRV_FOREACH(e, c->environment_files)
5753                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5754
5755         STRV_FOREACH(e, c->pass_environment)
5756                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5757
5758         STRV_FOREACH(e, c->unset_environment)
5759                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5760
5761         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5762
5763         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5764                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5765
5766                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5767                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5768
5769                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5770                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5771                 }
5772         }
5773
5774         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5775
5776         if (c->nice_set)
5777                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5778
5779         if (c->oom_score_adjust_set)
5780                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5781
5782         if (c->coredump_filter_set)
5783                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5784
5785         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5786                 if (c->rlimit[i]) {
5787                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5788                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5789                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5790                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5791                 }
5792
5793         if (c->ioprio_set) {
5794                 _cleanup_free_ char *class_str = NULL;
5795
5796                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5797                 if (r >= 0)
5798                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5799
5800                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5801         }
5802
5803         if (c->cpu_sched_set) {
5804                 _cleanup_free_ char *policy_str = NULL;
5805
5806                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5807                 if (r >= 0)
5808                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5809
5810                 fprintf(f,
5811                         "%sCPUSchedulingPriority: %i\n"
5812                         "%sCPUSchedulingResetOnFork: %s\n",
5813                         prefix, c->cpu_sched_priority,
5814                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5815         }
5816
5817         if (c->cpu_set.set) {
5818                 _cleanup_free_ char *affinity = NULL;
5819
5820                 affinity = cpu_set_to_range_string(&c->cpu_set);
5821                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5822         }
5823
5824         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5825                 _cleanup_free_ char *nodes = NULL;
5826
5827                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5828                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5829                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5830         }
5831
5832         if (c->timer_slack_nsec != NSEC_INFINITY)
5833                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5834
5835         fprintf(f,
5836                 "%sStandardInput: %s\n"
5837                 "%sStandardOutput: %s\n"
5838                 "%sStandardError: %s\n",
5839                 prefix, exec_input_to_string(c->std_input),
5840                 prefix, exec_output_to_string(c->std_output),
5841                 prefix, exec_output_to_string(c->std_error));
5842
5843         if (c->std_input == EXEC_INPUT_NAMED_FD)
5844                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5845         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5846                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5847         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5848                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5849
5850         if (c->std_input == EXEC_INPUT_FILE)
5851                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5852         if (c->std_output == EXEC_OUTPUT_FILE)
5853                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5854         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5855                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5856         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5857                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5858         if (c->std_error == EXEC_OUTPUT_FILE)
5859                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5860         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5861                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5862         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5863                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5864
5865         if (c->tty_path)
5866                 fprintf(f,
5867                         "%sTTYPath: %s\n"
5868                         "%sTTYReset: %s\n"
5869                         "%sTTYVHangup: %s\n"
5870                         "%sTTYVTDisallocate: %s\n"
5871                         "%sTTYRows: %u\n"
5872                         "%sTTYColumns: %u\n",
5873                         prefix, c->tty_path,
5874                         prefix, yes_no(c->tty_reset),
5875                         prefix, yes_no(c->tty_vhangup),
5876                         prefix, yes_no(c->tty_vt_disallocate),
5877                         prefix, c->tty_rows,
5878                         prefix, c->tty_cols);
5879
5880         if (IN_SET(c->std_output,
5881                    EXEC_OUTPUT_KMSG,
5882                    EXEC_OUTPUT_JOURNAL,
5883                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5884                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5885             IN_SET(c->std_error,
5886                    EXEC_OUTPUT_KMSG,
5887                    EXEC_OUTPUT_JOURNAL,
5888                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5889                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5890
5891                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5892
5893                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5894                 if (r >= 0)
5895                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5896
5897                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5898                 if (r >= 0)
5899                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5900         }
5901
5902         if (c->log_level_max >= 0) {
5903                 _cleanup_free_ char *t = NULL;
5904
5905                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5906
5907                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5908         }
5909
5910         if (c->log_ratelimit_interval_usec > 0)
5911                 fprintf(f,
5912                         "%sLogRateLimitIntervalSec: %s\n",
5913                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5914
5915         if (c->log_ratelimit_burst > 0)
5916                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5917
5918         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5919                 fprintf(f, "%sLogExtraFields: ", prefix);
5920                 fwrite(c->log_extra_fields[j].iov_base,
5921                        1, c->log_extra_fields[j].iov_len,
5922                        f);
5923                 fputc('\n', f);
5924         }
5925
5926         if (c->log_namespace)
5927                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5928
5929         if (c->secure_bits) {
5930                 _cleanup_free_ char *str = NULL;
5931
5932                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5933                 if (r >= 0)
5934                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5935         }
5936
5937         if (c->capability_bounding_set != CAP_ALL) {
5938                 _cleanup_free_ char *str = NULL;
5939
5940                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5941                 if (r >= 0)
5942                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5943         }
5944
5945         if (c->capability_ambient_set != 0) {
5946                 _cleanup_free_ char *str = NULL;
5947
5948                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5949                 if (r >= 0)
5950                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5951         }
5952
5953         if (c->user)
5954                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5955         if (c->group)
5956                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5957
5958         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5959
5960         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5961
5962         if (c->pam_name)
5963                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5964
5965         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5966         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5967         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5968         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5969         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5970         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
5971
5972         for (size_t i = 0; i < c->n_bind_mounts; i++)
5973                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5974                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5975                         c->bind_mounts[i].ignore_enoent ? "-": "",
5976                         c->bind_mounts[i].source,
5977                         c->bind_mounts[i].destination,
5978                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5979
5980         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5981                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5982
5983                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5984                         t->path,
5985                         isempty(t->options) ? "" : ":",
5986                         strempty(t->options));
5987         }
5988
5989         if (c->utmp_id)
5990                 fprintf(f,
5991                         "%sUtmpIdentifier: %s\n",
5992                         prefix, c->utmp_id);
5993
5994         if (c->selinux_context)
5995                 fprintf(f,
5996                         "%sSELinuxContext: %s%s\n",
5997                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5998
5999         if (c->apparmor_profile)
6000                 fprintf(f,
6001                         "%sAppArmorProfile: %s%s\n",
6002                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6003
6004         if (c->smack_process_label)
6005                 fprintf(f,
6006                         "%sSmackProcessLabel: %s%s\n",
6007                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6008
6009         if (c->personality != PERSONALITY_INVALID)
6010                 fprintf(f,
6011                         "%sPersonality: %s\n",
6012                         prefix, strna(personality_to_string(c->personality)));
6013
6014         fprintf(f,
6015                 "%sLockPersonality: %s\n",
6016                 prefix, yes_no(c->lock_personality));
6017
6018         if (c->syscall_filter) {
6019 #if HAVE_SECCOMP
6020                 void *id, *val;
6021                 bool first = true;
6022 #endif
6023
6024                 fprintf(f,
6025                         "%sSystemCallFilter: ",
6026                         prefix);
6027
6028                 if (!c->syscall_allow_list)
6029                         fputc('~', f);
6030
6031 #if HAVE_SECCOMP
6032                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6033                         _cleanup_free_ char *name = NULL;
6034                         const char *errno_name = NULL;
6035                         int num = PTR_TO_INT(val);
6036
6037                         if (first)
6038                                 first = false;
6039                         else
6040                                 fputc(' ', f);
6041
6042                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6043                         fputs(strna(name), f);
6044
6045                         if (num >= 0) {
6046                                 errno_name = seccomp_errno_or_action_to_string(num);
6047                                 if (errno_name)
6048                                         fprintf(f, ":%s", errno_name);
6049                                 else
6050                                         fprintf(f, ":%d", num);
6051                         }
6052                 }
6053 #endif
6054
6055                 fputc('\n', f);
6056         }
6057
6058         if (c->syscall_archs) {
6059 #if HAVE_SECCOMP
6060                 void *id;
6061 #endif
6062
6063                 fprintf(f,
6064                         "%sSystemCallArchitectures:",
6065                         prefix);
6066
6067 #if HAVE_SECCOMP
6068                 SET_FOREACH(id, c->syscall_archs)
6069                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6070 #endif
6071                 fputc('\n', f);
6072         }
6073
6074         if (exec_context_restrict_namespaces_set(c)) {
6075                 _cleanup_free_ char *s = NULL;
6076
6077                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6078                 if (r >= 0)
6079                         fprintf(f, "%sRestrictNamespaces: %s\n",
6080                                 prefix, strna(s));
6081         }
6082
6083 #if HAVE_LIBBPF
6084         if (exec_context_restrict_filesystems_set(c))
6085                 SET_FOREACH(e, c->restrict_filesystems)
6086                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
6087 #endif
6088
6089         if (c->network_namespace_path)
6090                 fprintf(f,
6091                         "%sNetworkNamespacePath: %s\n",
6092                         prefix, c->network_namespace_path);
6093
6094         if (c->syscall_errno > 0) {
6095 #if HAVE_SECCOMP
6096                 const char *errno_name;
6097 #endif
6098
6099                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6100
6101 #if HAVE_SECCOMP
6102                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6103                 if (errno_name)
6104                         fputs(errno_name, f);
6105                 else
6106                         fprintf(f, "%d", c->syscall_errno);
6107 #endif
6108                 fputc('\n', f);
6109         }
6110
6111         for (size_t i = 0; i < c->n_mount_images; i++) {
6112                 MountOptions *o;
6113
6114                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6115                         c->mount_images[i].ignore_enoent ? "-": "",
6116                         c->mount_images[i].source,
6117                         c->mount_images[i].destination);
6118                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6119                         fprintf(f, ":%s:%s",
6120                                 partition_designator_to_string(o->partition_designator),
6121                                 strempty(o->options));
6122                 fprintf(f, "\n");
6123         }
6124
6125         for (size_t i = 0; i < c->n_extension_images; i++) {
6126                 MountOptions *o;
6127
6128                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6129                         c->extension_images[i].ignore_enoent ? "-": "",
6130                         c->extension_images[i].source);
6131                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6132                         fprintf(f, ":%s:%s",
6133                                 partition_designator_to_string(o->partition_designator),
6134                                 strempty(o->options));
6135                 fprintf(f, "\n");
6136         }
6137 }
6138
6139 bool exec_context_maintains_privileges(const ExecContext *c) {
6140         assert(c);
6141
6142         /* Returns true if the process forked off would run under
6143          * an unchanged UID or as root. */
6144
6145         if (!c->user)
6146                 return true;
6147
6148         if (streq(c->user, "root") || streq(c->user, "0"))
6149                 return true;
6150
6151         return false;
6152 }
6153
6154 int exec_context_get_effective_ioprio(const ExecContext *c) {
6155         int p;
6156
6157         assert(c);
6158
6159         if (c->ioprio_set)
6160                 return c->ioprio;
6161
6162         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6163         if (p < 0)
6164                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6165
6166         return ioprio_normalize(p);
6167 }
6168
6169 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6170         assert(c);
6171
6172         /* Explicit setting wins */
6173         if (c->mount_apivfs_set)
6174                 return c->mount_apivfs;
6175
6176         /* Default to "yes" if root directory or image are specified */
6177         if (exec_context_with_rootfs(c))
6178                 return true;
6179
6180         return false;
6181 }
6182
6183 void exec_context_free_log_extra_fields(ExecContext *c) {
6184         assert(c);
6185
6186         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6187                 free(c->log_extra_fields[l].iov_base);
6188         c->log_extra_fields = mfree(c->log_extra_fields);
6189         c->n_log_extra_fields = 0;
6190 }
6191
6192 void exec_context_revert_tty(ExecContext *c) {
6193         _cleanup_close_ int fd = -1;
6194         const char *path;
6195         struct stat st;
6196         int r;
6197
6198         assert(c);
6199
6200         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6201         exec_context_tty_reset(c, NULL);
6202
6203         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6204          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6205          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6206         if (!exec_context_may_touch_tty(c))
6207                 return;
6208
6209         path = exec_context_tty_path(c);
6210         if (!path)
6211                 return;
6212
6213         fd = open(path, O_PATH|O_CLOEXEC);
6214         if (fd < 0)
6215                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6216                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6217                                              path);
6218
6219         if (fstat(fd, &st) < 0)
6220                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6221
6222         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6223          * if things are a character device, since a proper check either means we'd have to open the TTY and
6224          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6225          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6226          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6227         if (!S_ISCHR(st.st_mode))
6228                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6229
6230         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6231         if (r < 0)
6232                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6233 }
6234
6235 int exec_context_get_clean_directories(
6236                 ExecContext *c,
6237                 char **prefix,
6238                 ExecCleanMask mask,
6239                 char ***ret) {
6240
6241         _cleanup_strv_free_ char **l = NULL;
6242         int r;
6243
6244         assert(c);
6245         assert(prefix);
6246         assert(ret);
6247
6248         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6249                 if (!FLAGS_SET(mask, 1U << t))
6250                         continue;
6251
6252                 if (!prefix[t])
6253                         continue;
6254
6255                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6256                         char *j;
6257
6258                         j = path_join(prefix[t], c->directories[t].items[i].path);
6259                         if (!j)
6260                                 return -ENOMEM;
6261
6262                         r = strv_consume(&l, j);
6263                         if (r < 0)
6264                                 return r;
6265
6266                         /* Also remove private directories unconditionally. */
6267                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6268                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6269                                 if (!j)
6270                                         return -ENOMEM;
6271
6272                                 r = strv_consume(&l, j);
6273                                 if (r < 0)
6274                                         return r;
6275                         }
6276
6277                         char **symlink;
6278                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6279                                 j = path_join(prefix[t], *symlink);
6280                                 if (!j)
6281                                         return -ENOMEM;
6282
6283                                 r = strv_consume(&l, j);
6284                                 if (r < 0)
6285                                         return r;
6286                         }
6287                 }
6288         }
6289
6290         *ret = TAKE_PTR(l);
6291         return 0;
6292 }
6293
6294 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6295         ExecCleanMask mask = 0;
6296
6297         assert(c);
6298         assert(ret);
6299
6300         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6301                 if (c->directories[t].n_items > 0)
6302                         mask |= 1U << t;
6303
6304         *ret = mask;
6305         return 0;
6306 }
6307
6308 void exec_status_start(ExecStatus *s, pid_t pid) {
6309         assert(s);
6310
6311         *s = (ExecStatus) {
6312                 .pid = pid,
6313         };
6314
6315         dual_timestamp_get(&s->start_timestamp);
6316 }
6317
6318 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6319         assert(s);
6320
6321         if (s->pid != pid)
6322                 *s = (ExecStatus) {
6323                         .pid = pid,
6324                 };
6325
6326         dual_timestamp_get(&s->exit_timestamp);
6327
6328         s->code = code;
6329         s->status = status;
6330
6331         if (context && context->utmp_id)
6332                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6333 }
6334
6335 void exec_status_reset(ExecStatus *s) {
6336         assert(s);
6337
6338         *s = (ExecStatus) {};
6339 }
6340
6341 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6342         assert(s);
6343         assert(f);
6344
6345         if (s->pid <= 0)
6346                 return;
6347
6348         prefix = strempty(prefix);
6349
6350         fprintf(f,
6351                 "%sPID: "PID_FMT"\n",
6352                 prefix, s->pid);
6353
6354         if (dual_timestamp_is_set(&s->start_timestamp))
6355                 fprintf(f,
6356                         "%sStart Timestamp: %s\n",
6357                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6358
6359         if (dual_timestamp_is_set(&s->exit_timestamp))
6360                 fprintf(f,
6361                         "%sExit Timestamp: %s\n"
6362                         "%sExit Code: %s\n"
6363                         "%sExit Status: %i\n",
6364                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6365                         prefix, sigchld_code_to_string(s->code),
6366                         prefix, s->status);
6367 }
6368
6369 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6370         _cleanup_free_ char *cmd = NULL;
6371         const char *prefix2;
6372
6373         assert(c);
6374         assert(f);
6375
6376         prefix = strempty(prefix);
6377         prefix2 = strjoina(prefix, "\t");
6378
6379         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6380         fprintf(f,
6381                 "%sCommand Line: %s\n",
6382                 prefix, cmd ?: strerror_safe(ENOMEM));
6383
6384         exec_status_dump(&c->exec_status, f, prefix2);
6385 }
6386
6387 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6388         assert(f);
6389
6390         prefix = strempty(prefix);
6391
6392         LIST_FOREACH(command, c, c)
6393                 exec_command_dump(c, f, prefix);
6394 }
6395
6396 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6397         ExecCommand *end;
6398
6399         assert(l);
6400         assert(e);
6401
6402         if (*l) {
6403                 /* It's kind of important, that we keep the order here */
6404                 LIST_FIND_TAIL(command, *l, end);
6405                 LIST_INSERT_AFTER(command, *l, end, e);
6406         } else
6407               *l = e;
6408 }
6409
6410 int exec_command_set(ExecCommand *c, const char *path, ...) {
6411         va_list ap;
6412         char **l, *p;
6413
6414         assert(c);
6415         assert(path);
6416
6417         va_start(ap, path);
6418         l = strv_new_ap(path, ap);
6419         va_end(ap);
6420
6421         if (!l)
6422                 return -ENOMEM;
6423
6424         p = strdup(path);
6425         if (!p) {
6426                 strv_free(l);
6427                 return -ENOMEM;
6428         }
6429
6430         free_and_replace(c->path, p);
6431
6432         return strv_free_and_replace(c->argv, l);
6433 }
6434
6435 int exec_command_append(ExecCommand *c, const char *path, ...) {
6436         _cleanup_strv_free_ char **l = NULL;
6437         va_list ap;
6438         int r;
6439
6440         assert(c);
6441         assert(path);
6442
6443         va_start(ap, path);
6444         l = strv_new_ap(path, ap);
6445         va_end(ap);
6446
6447         if (!l)
6448                 return -ENOMEM;
6449
6450         r = strv_extend_strv(&c->argv, l, false);
6451         if (r < 0)
6452                 return r;
6453
6454         return 0;
6455 }
6456
6457 static void *remove_tmpdir_thread(void *p) {
6458         _cleanup_free_ char *path = p;
6459
6460         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6461         return NULL;
6462 }
6463
6464 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6465         int r;
6466
6467         if (!rt)
6468                 return NULL;
6469
6470         if (rt->manager)
6471                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6472
6473         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6474
6475         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6476                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6477
6478                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6479                 if (r < 0)
6480                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6481                 else
6482                         rt->tmp_dir = NULL;
6483         }
6484
6485         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6486                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6487
6488                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6489                 if (r < 0)
6490                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6491                 else
6492                         rt->var_tmp_dir = NULL;
6493         }
6494
6495         rt->id = mfree(rt->id);
6496         rt->tmp_dir = mfree(rt->tmp_dir);
6497         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6498         safe_close_pair(rt->netns_storage_socket);
6499         safe_close_pair(rt->ipcns_storage_socket);
6500         return mfree(rt);
6501 }
6502
6503 static void exec_runtime_freep(ExecRuntime **rt) {
6504         (void) exec_runtime_free(*rt, false);
6505 }
6506
6507 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6508         _cleanup_free_ char *id_copy = NULL;
6509         ExecRuntime *n;
6510
6511         assert(ret);
6512
6513         id_copy = strdup(id);
6514         if (!id_copy)
6515                 return -ENOMEM;
6516
6517         n = new(ExecRuntime, 1);
6518         if (!n)
6519                 return -ENOMEM;
6520
6521         *n = (ExecRuntime) {
6522                 .id = TAKE_PTR(id_copy),
6523                 .netns_storage_socket = { -1, -1 },
6524                 .ipcns_storage_socket = { -1, -1 },
6525         };
6526
6527         *ret = n;
6528         return 0;
6529 }
6530
6531 static int exec_runtime_add(
6532                 Manager *m,
6533                 const char *id,
6534                 char **tmp_dir,
6535                 char **var_tmp_dir,
6536                 int netns_storage_socket[2],
6537                 int ipcns_storage_socket[2],
6538                 ExecRuntime **ret) {
6539
6540         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6541         int r;
6542
6543         assert(m);
6544         assert(id);
6545
6546         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6547
6548         r = exec_runtime_allocate(&rt, id);
6549         if (r < 0)
6550                 return r;
6551
6552         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6553         if (r < 0)
6554                 return r;
6555
6556         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6557         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6558         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6559
6560         if (netns_storage_socket) {
6561                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6562                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6563         }
6564
6565         if (ipcns_storage_socket) {
6566                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6567                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6568         }
6569
6570         rt->manager = m;
6571
6572         if (ret)
6573                 *ret = rt;
6574         /* do not remove created ExecRuntime object when the operation succeeds. */
6575         TAKE_PTR(rt);
6576         return 0;
6577 }
6578
6579 static int exec_runtime_make(
6580                 Manager *m,
6581                 const ExecContext *c,
6582                 const char *id,
6583                 ExecRuntime **ret) {
6584
6585         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6586         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6587         int r;
6588
6589         assert(m);
6590         assert(c);
6591         assert(id);
6592
6593         /* It is not necessary to create ExecRuntime object. */
6594         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6595                 *ret = NULL;
6596                 return 0;
6597         }
6598
6599         if (c->private_tmp &&
6600             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6601               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6602                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6603                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6604                 if (r < 0)
6605                         return r;
6606         }
6607
6608         if (c->private_network || c->network_namespace_path) {
6609                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6610                         return -errno;
6611         }
6612
6613         if (c->private_ipc || c->ipc_namespace_path) {
6614                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6615                         return -errno;
6616         }
6617
6618         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6619         if (r < 0)
6620                 return r;
6621
6622         return 1;
6623 }
6624
6625 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6626         ExecRuntime *rt;
6627         int r;
6628
6629         assert(m);
6630         assert(id);
6631         assert(ret);
6632
6633         rt = hashmap_get(m->exec_runtime_by_id, id);
6634         if (rt)
6635                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6636                 goto ref;
6637
6638         if (!create) {
6639                 *ret = NULL;
6640                 return 0;
6641         }
6642
6643         /* If not found, then create a new object. */
6644         r = exec_runtime_make(m, c, id, &rt);
6645         if (r < 0)
6646                 return r;
6647         if (r == 0) {
6648                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6649                 *ret = NULL;
6650                 return 0;
6651         }
6652
6653 ref:
6654         /* increment reference counter. */
6655         rt->n_ref++;
6656         *ret = rt;
6657         return 1;
6658 }
6659
6660 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6661         if (!rt)
6662                 return NULL;
6663
6664         assert(rt->n_ref > 0);
6665
6666         rt->n_ref--;
6667         if (rt->n_ref > 0)
6668                 return NULL;
6669
6670         return exec_runtime_free(rt, destroy);
6671 }
6672
6673 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6674         ExecRuntime *rt;
6675
6676         assert(m);
6677         assert(f);
6678         assert(fds);
6679
6680         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6681                 fprintf(f, "exec-runtime=%s", rt->id);
6682
6683                 if (rt->tmp_dir)
6684                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6685
6686                 if (rt->var_tmp_dir)
6687                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6688
6689                 if (rt->netns_storage_socket[0] >= 0) {
6690                         int copy;
6691
6692                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6693                         if (copy < 0)
6694                                 return copy;
6695
6696                         fprintf(f, " netns-socket-0=%i", copy);
6697                 }
6698
6699                 if (rt->netns_storage_socket[1] >= 0) {
6700                         int copy;
6701
6702                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6703                         if (copy < 0)
6704                                 return copy;
6705
6706                         fprintf(f, " netns-socket-1=%i", copy);
6707                 }
6708
6709                 if (rt->ipcns_storage_socket[0] >= 0) {
6710                         int copy;
6711
6712                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6713                         if (copy < 0)
6714                                 return copy;
6715
6716                         fprintf(f, " ipcns-socket-0=%i", copy);
6717                 }
6718
6719                 if (rt->ipcns_storage_socket[1] >= 0) {
6720                         int copy;
6721
6722                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6723                         if (copy < 0)
6724                                 return copy;
6725
6726                         fprintf(f, " ipcns-socket-1=%i", copy);
6727                 }
6728
6729                 fputc('\n', f);
6730         }
6731
6732         return 0;
6733 }
6734
6735 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6736         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6737         ExecRuntime *rt;
6738         int r;
6739
6740         /* This is for the migration from old (v237 or earlier) deserialization text.
6741          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6742          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6743          * so or not from the serialized text, then we always creates a new object owned by this. */
6744
6745         assert(u);
6746         assert(key);
6747         assert(value);
6748
6749         /* Manager manages ExecRuntime objects by the unit id.
6750          * So, we omit the serialized text when the unit does not have id (yet?)... */
6751         if (isempty(u->id)) {
6752                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6753                 return 0;
6754         }
6755
6756         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6757                 return log_oom();
6758
6759         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6760         if (!rt) {
6761                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6762                         return log_oom();
6763
6764                 rt = rt_create;
6765         }
6766
6767         if (streq(key, "tmp-dir")) {
6768                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6769                         return -ENOMEM;
6770
6771         } else if (streq(key, "var-tmp-dir")) {
6772                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6773                         return -ENOMEM;
6774
6775         } else if (streq(key, "netns-socket-0")) {
6776                 int fd;
6777
6778                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6779                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6780                         return 0;
6781                 }
6782
6783                 safe_close(rt->netns_storage_socket[0]);
6784                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6785
6786         } else if (streq(key, "netns-socket-1")) {
6787                 int fd;
6788
6789                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6790                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6791                         return 0;
6792                 }
6793
6794                 safe_close(rt->netns_storage_socket[1]);
6795                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6796
6797         } else
6798                 return 0;
6799
6800         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6801         if (rt_create) {
6802                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6803                 if (r < 0) {
6804                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6805                         return 0;
6806                 }
6807
6808                 rt_create->manager = u->manager;
6809
6810                 /* Avoid cleanup */
6811                 TAKE_PTR(rt_create);
6812         }
6813
6814         return 1;
6815 }
6816
6817 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6818         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6819         char *id = NULL;
6820         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6821         const char *p, *v = value;
6822         size_t n;
6823
6824         assert(m);
6825         assert(value);
6826         assert(fds);
6827
6828         n = strcspn(v, " ");
6829         id = strndupa_safe(v, n);
6830         if (v[n] != ' ')
6831                 goto finalize;
6832         p = v + n + 1;
6833
6834         v = startswith(p, "tmp-dir=");
6835         if (v) {
6836                 n = strcspn(v, " ");
6837                 tmp_dir = strndup(v, n);
6838                 if (!tmp_dir)
6839                         return log_oom();
6840                 if (v[n] != ' ')
6841                         goto finalize;
6842                 p = v + n + 1;
6843         }
6844
6845         v = startswith(p, "var-tmp-dir=");
6846         if (v) {
6847                 n = strcspn(v, " ");
6848                 var_tmp_dir = strndup(v, n);
6849                 if (!var_tmp_dir)
6850                         return log_oom();
6851                 if (v[n] != ' ')
6852                         goto finalize;
6853                 p = v + n + 1;
6854         }
6855
6856         v = startswith(p, "netns-socket-0=");
6857         if (v) {
6858                 char *buf;
6859
6860                 n = strcspn(v, " ");
6861                 buf = strndupa_safe(v, n);
6862
6863                 r = safe_atoi(buf, &netns_fdpair[0]);
6864                 if (r < 0)
6865                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6866                 if (!fdset_contains(fds, netns_fdpair[0]))
6867                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6868                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6869                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6870                 if (v[n] != ' ')
6871                         goto finalize;
6872                 p = v + n + 1;
6873         }
6874
6875         v = startswith(p, "netns-socket-1=");
6876         if (v) {
6877                 char *buf;
6878
6879                 n = strcspn(v, " ");
6880                 buf = strndupa_safe(v, n);
6881
6882                 r = safe_atoi(buf, &netns_fdpair[1]);
6883                 if (r < 0)
6884                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6885                 if (!fdset_contains(fds, netns_fdpair[1]))
6886                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6887                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6888                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6889                 if (v[n] != ' ')
6890                         goto finalize;
6891                 p = v + n + 1;
6892         }
6893
6894         v = startswith(p, "ipcns-socket-0=");
6895         if (v) {
6896                 char *buf;
6897
6898                 n = strcspn(v, " ");
6899                 buf = strndupa_safe(v, n);
6900
6901                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6902                 if (r < 0)
6903                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6904                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6905                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6906                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6907                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6908                 if (v[n] != ' ')
6909                         goto finalize;
6910                 p = v + n + 1;
6911         }
6912
6913         v = startswith(p, "ipcns-socket-1=");
6914         if (v) {
6915                 char *buf;
6916
6917                 n = strcspn(v, " ");
6918                 buf = strndupa_safe(v, n);
6919
6920                 r = safe_atoi(buf, &ipcns_fdpair[1]);
6921                 if (r < 0)
6922                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6923                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6924                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6925                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6926                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6927         }
6928
6929 finalize:
6930         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6931         if (r < 0)
6932                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6933         return 0;
6934 }
6935
6936 void exec_runtime_vacuum(Manager *m) {
6937         ExecRuntime *rt;
6938
6939         assert(m);
6940
6941         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6942
6943         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6944                 if (rt->n_ref > 0)
6945                         continue;
6946
6947                 (void) exec_runtime_free(rt, false);
6948         }
6949 }
6950
6951 void exec_params_clear(ExecParameters *p) {
6952         if (!p)
6953                 return;
6954
6955         p->environment = strv_free(p->environment);
6956         p->fd_names = strv_free(p->fd_names);
6957         p->fds = mfree(p->fds);
6958         p->exec_fd = safe_close(p->exec_fd);
6959 }
6960
6961 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6962         if (!sc)
6963                 return NULL;
6964
6965         free(sc->id);
6966         free(sc->data);
6967         return mfree(sc);
6968 }
6969
6970 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6971         if (!lc)
6972                 return NULL;
6973
6974         free(lc->id);
6975         free(lc->path);
6976         return mfree(lc);
6977 }
6978
6979 void exec_directory_done(ExecDirectory *d) {
6980         if (!d)
6981                 return;
6982
6983         for (size_t i = 0; i < d->n_items; i++) {
6984                 free(d->items[i].path);
6985                 strv_free(d->items[i].symlinks);
6986         }
6987
6988         d->items = mfree(d->items);
6989         d->n_items = 0;
6990         d->mode = 0755;
6991 }
6992
6993 int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6994         _cleanup_strv_free_ char **s = NULL;
6995         _cleanup_free_ char *p = NULL;
6996
6997         assert(d);
6998         assert(n);
6999         assert(path);
7000
7001         p = strdup(path);
7002         if (!p)
7003                 return -ENOMEM;
7004
7005         if (symlinks) {
7006                 s = strv_copy(symlinks);
7007                 if (!s)
7008                         return -ENOMEM;
7009         }
7010
7011         if (!GREEDY_REALLOC(*d, *n + 1))
7012                 return -ENOMEM;
7013
7014         (*d)[(*n) ++] = (ExecDirectoryItem) {
7015                 .path = TAKE_PTR(p),
7016                 .symlinks = TAKE_PTR(s),
7017         };
7018
7019         return 0;
7020 }
7021
7022 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7023 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7024
7025 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7026         [EXEC_INPUT_NULL] = "null",
7027         [EXEC_INPUT_TTY] = "tty",
7028         [EXEC_INPUT_TTY_FORCE] = "tty-force",
7029         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7030         [EXEC_INPUT_SOCKET] = "socket",
7031         [EXEC_INPUT_NAMED_FD] = "fd",
7032         [EXEC_INPUT_DATA] = "data",
7033         [EXEC_INPUT_FILE] = "file",
7034 };
7035
7036 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7037
7038 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7039         [EXEC_OUTPUT_INHERIT] = "inherit",
7040         [EXEC_OUTPUT_NULL] = "null",
7041         [EXEC_OUTPUT_TTY] = "tty",
7042         [EXEC_OUTPUT_KMSG] = "kmsg",
7043         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7044         [EXEC_OUTPUT_JOURNAL] = "journal",
7045         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7046         [EXEC_OUTPUT_SOCKET] = "socket",
7047         [EXEC_OUTPUT_NAMED_FD] = "fd",
7048         [EXEC_OUTPUT_FILE] = "file",
7049         [EXEC_OUTPUT_FILE_APPEND] = "append",
7050         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7051 };
7052
7053 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7054
7055 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7056         [EXEC_UTMP_INIT] = "init",
7057         [EXEC_UTMP_LOGIN] = "login",
7058         [EXEC_UTMP_USER] = "user",
7059 };
7060
7061 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7062
7063 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7064         [EXEC_PRESERVE_NO] = "no",
7065         [EXEC_PRESERVE_YES] = "yes",
7066         [EXEC_PRESERVE_RESTART] = "restart",
7067 };
7068
7069 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7070
7071 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7072 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7073         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7074         [EXEC_DIRECTORY_STATE] = "StateDirectory",
7075         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7076         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7077         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7078 };
7079
7080 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7081
7082 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7083 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7084         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
7085         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
7086         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
7087         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
7088         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7089 };
7090
7091 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7092
7093 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7094  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7095  * directories, specifically .timer units with their timestamp touch file. */
7096 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7097         [EXEC_DIRECTORY_RUNTIME] = "runtime",
7098         [EXEC_DIRECTORY_STATE] = "state",
7099         [EXEC_DIRECTORY_CACHE] = "cache",
7100         [EXEC_DIRECTORY_LOGS] = "logs",
7101         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7102 };
7103
7104 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7105
7106 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7107  * the service payload in. */
7108 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7109         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7110         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7111         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7112         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7113         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7114 };
7115
7116 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7117
7118 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7119         [EXEC_KEYRING_INHERIT] = "inherit",
7120         [EXEC_KEYRING_PRIVATE] = "private",
7121         [EXEC_KEYRING_SHARED] = "shared",
7122 };
7123
7124 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);