src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "argv-util.h"
  43 #include "async.h"
  44 #include "barrier.h"
  45 #include "bpf-lsm.h"
  46 #include "cap-list.h"
  47 #include "capability-util.h"
  48 #include "cgroup-setup.h"
  49 #include "chase.h"
  50 #include "chown-recursive.h"
  51 #include "constants.h"
  52 #include "cpu-set-util.h"
  53 #include "creds-util.h"
  54 #include "data-fd-util.h"
  55 #include "env-file.h"
  56 #include "env-util.h"
  57 #include "errno-list.h"
  58 #include "escape.h"
  59 #include "execute.h"
  60 #include "exit-status.h"
  61 #include "fd-util.h"
  62 #include "fileio.h"
  63 #include "format-util.h"
  64 #include "glob-util.h"
  65 #include "hexdecoct.h"
  66 #include "io-util.h"
  67 #include "ioprio-util.h"
  68 #include "label.h"
  69 #include "log.h"
  70 #include "macro.h"
  71 #include "manager.h"
  72 #include "manager-dump.h"
  73 #include "memory-util.h"
  74 #include "missing_fs.h"
  75 #include "missing_ioprio.h"
  76 #include "missing_prctl.h"
  77 #include "mkdir-label.h"
  78 #include "mount-util.h"
  79 #include "mountpoint-util.h"
  80 #include "namespace.h"
  81 #include "parse-util.h"
  82 #include "path-util.h"
  83 #include "proc-cmdline.h"
  84 #include "process-util.h"
  85 #include "psi-util.h"
  86 #include "random-util.h"
  87 #include "recurse-dir.h"
  88 #include "rlimit-util.h"
  89 #include "rm-rf.h"
  90 #if HAVE_SECCOMP
  91 #include "seccomp-util.h"
  92 #endif
  93 #include "securebits-util.h"
  94 #include "selinux-util.h"
  95 #include "signal-util.h"
  96 #include "smack-util.h"
  97 #include "socket-util.h"
  98 #include "sort-util.h"
  99 #include "special.h"
 100 #include "stat-util.h"
 101 #include "string-table.h"
 102 #include "string-util.h"
 103 #include "strv.h"
 104 #include "syslog-util.h"
 105 #include "terminal-util.h"
 106 #include "tmpfile-util.h"
 107 #include "umask-util.h"
 108 #include "unit-serialize.h"
 109 #include "user-util.h"
 110 #include "utmp-wtmp.h"
 111
/* NOTE(review): neither idle timeout is referenced in the part of the file visible here; check their users
 * before changing the values. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested, on a best-effort basis, for the stream socket that
 * connect_logger_as() connects to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
 116
 117 static int shift_fds(int fds[], size_t n_fds) {
 118         if (n_fds <= 0)
 119                 return 0;
 120
 121         /* Modifies the fds array! (sorts it) */
 122
 123         assert(fds);
 124
 125         for (int start = 0;;) {
 126                 int restart_from = -1;
 127
 128                 for (int i = start; i < (int) n_fds; i++) {
 129                         int nfd;
 130
 131                         /* Already at right index? */
 132                         if (fds[i] == i+3)
 133                                 continue;
 134
 135                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 136                         if (nfd < 0)
 137                                 return -errno;
 138
 139                         safe_close(fds[i]);
 140                         fds[i] = nfd;
 141
 142                         /* Hmm, the fd we wanted isn't free? Then
 143                          * let's remember that and try again from here */
 144                         if (nfd != i+3 && restart_from < 0)
 145                                 restart_from = i;
 146                 }
 147
 148                 if (restart_from < 0)
 149                         break;
 150
 151                 start = restart_from;
 152         }
 153
 154         return 0;
 155 }
 156
 157 static int flags_fds(
 158                 const int fds[],
 159                 size_t n_socket_fds,
 160                 size_t n_fds,
 161                 bool nonblock) {
 162
 163         int r;
 164
 165         if (n_fds <= 0)
 166                 return 0;
 167
 168         assert(fds);
 169
 170         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 171          * O_NONBLOCK only applies to socket activation though. */
 172
 173         for (size_t i = 0; i < n_fds; i++) {
 174
 175                 if (i < n_socket_fds) {
 176                         r = fd_nonblock(fds[i], nonblock);
 177                         if (r < 0)
 178                                 return r;
 179                 }
 180
 181                 /* We unconditionally drop FD_CLOEXEC from the fds,
 182                  * since after all we want to pass these fds to our
 183                  * children */
 184
 185                 r = fd_cloexec(fds[i], false);
 186                 if (r < 0)
 187                         return r;
 188         }
 189
 190         return 0;
 191 }
 192
 193 static const char *exec_context_tty_path(const ExecContext *context) {
 194         assert(context);
 195
 196         if (context->stdio_as_fds)
 197                 return NULL;
 198
 199         if (context->tty_path)
 200                 return context->tty_path;
 201
 202         return "/dev/console";
 203 }
 204
 205 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 206         _cleanup_free_ char *rowskey = NULL, *rowsvalue = NULL, *colskey = NULL, *colsvalue = NULL;
 207         unsigned rows, cols;
 208         const char *tty;
 209         int r;
 210
 211         assert(context);
 212         assert(ret_rows);
 213         assert(ret_cols);
 214
 215         rows = context->tty_rows;
 216         cols = context->tty_cols;
 217
 218         tty = exec_context_tty_path(context);
 219         if (!tty || (rows != UINT_MAX && cols != UINT_MAX)) {
 220                 *ret_rows = rows;
 221                 *ret_cols = cols;
 222                 return 0;
 223         }
 224
 225         tty = skip_dev_prefix(tty);
 226         if (!in_charset(tty, ALPHANUMERICAL)) {
 227                 log_debug("%s contains non-alphanumeric characters, ignoring", tty);
 228                 *ret_rows = rows;
 229                 *ret_cols = cols;
 230                 return 0;
 231         }
 232
 233         rowskey = strjoin("systemd.tty.rows.", tty);
 234         if (!rowskey)
 235                 return -ENOMEM;
 236
 237         colskey = strjoin("systemd.tty.columns.", tty);
 238         if (!colskey)
 239                 return -ENOMEM;
 240
 241         r = proc_cmdline_get_key_many(/* flags = */ 0,
 242                                       rowskey, &rowsvalue,
 243                                       colskey, &colsvalue);
 244         if (r < 0)
 245                 log_debug_errno(r, "Failed to read TTY size of %s from kernel cmdline, ignoring: %m", tty);
 246
 247         if (rows == UINT_MAX && rowsvalue) {
 248                 r = safe_atou(rowsvalue, &rows);
 249                 if (r < 0)
 250                         log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", rowskey, rowsvalue);
 251         }
 252
 253         if (cols == UINT_MAX && colsvalue) {
 254                 r = safe_atou(colsvalue, &cols);
 255                 if (r < 0)
 256                         log_debug_errno(r, "Failed to parse %s=%s, ignoring: %m", colskey, colsvalue);
 257         }
 258
 259         *ret_rows = rows;
 260         *ret_cols = cols;
 261
 262         return 0;
 263 }
 264
 265 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 266         const char *path;
 267
 268         assert(context);
 269
 270         path = exec_context_tty_path(context);
 271
 272         if (context->tty_vhangup) {
 273                 if (p && p->stdin_fd >= 0)
 274                         (void) terminal_vhangup_fd(p->stdin_fd);
 275                 else if (path)
 276                         (void) terminal_vhangup(path);
 277         }
 278
 279         if (context->tty_reset) {
 280                 if (p && p->stdin_fd >= 0)
 281                         (void) reset_terminal_fd(p->stdin_fd, true);
 282                 else if (path)
 283                         (void) reset_terminal(path);
 284         }
 285
 286         if (p && p->stdin_fd >= 0) {
 287                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 288
 289                 (void) exec_context_tty_size(context, &rows, &cols);
 290                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 291         }
 292
 293         if (context->tty_vt_disallocate && path)
 294                 (void) vt_disallocate(path);
 295 }
 296
 297 static bool is_terminal_input(ExecInput i) {
 298         return IN_SET(i,
 299                       EXEC_INPUT_TTY,
 300                       EXEC_INPUT_TTY_FORCE,
 301                       EXEC_INPUT_TTY_FAIL);
 302 }
 303
 304 static bool is_terminal_output(ExecOutput o) {
 305         return IN_SET(o,
 306                       EXEC_OUTPUT_TTY,
 307                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 308                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 309 }
 310
 311 static bool is_kmsg_output(ExecOutput o) {
 312         return IN_SET(o,
 313                       EXEC_OUTPUT_KMSG,
 314                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 315 }
 316
 317 static bool exec_context_needs_term(const ExecContext *c) {
 318         assert(c);
 319
 320         /* Return true if the execution context suggests we should set $TERM to something useful. */
 321
 322         if (is_terminal_input(c->std_input))
 323                 return true;
 324
 325         if (is_terminal_output(c->std_output))
 326                 return true;
 327
 328         if (is_terminal_output(c->std_error))
 329                 return true;
 330
 331         return !!c->tty_path;
 332 }
 333
 334 static int open_null_as(int flags, int nfd) {
 335         int fd;
 336
 337         assert(nfd >= 0);
 338
 339         fd = open("/dev/null", flags|O_NOCTTY);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         return move_fd(fd, nfd, false);
 344 }
 345
 346 static int connect_journal_socket(
 347                 int fd,
 348                 const char *log_namespace,
 349                 uid_t uid,
 350                 gid_t gid) {
 351
 352         uid_t olduid = UID_INVALID;
 353         gid_t oldgid = GID_INVALID;
 354         const char *j;
 355         int r;
 356
 357         j = log_namespace ?
 358                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 359                 "/run/systemd/journal/stdout";
 360
 361         if (gid_is_valid(gid)) {
 362                 oldgid = getgid();
 363
 364                 if (setegid(gid) < 0)
 365                         return -errno;
 366         }
 367
 368         if (uid_is_valid(uid)) {
 369                 olduid = getuid();
 370
 371                 if (seteuid(uid) < 0) {
 372                         r = -errno;
 373                         goto restore_gid;
 374                 }
 375         }
 376
 377         r = connect_unix_path(fd, AT_FDCWD, j);
 378
 379         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 380            an LSM interferes. */
 381
 382         if (uid_is_valid(uid))
 383                 (void) seteuid(olduid);
 384
 385  restore_gid:
 386         if (gid_is_valid(gid))
 387                 (void) setegid(oldgid);
 388
 389         return r;
 390 }
 391
 392 static int connect_logger_as(
 393                 const Unit *unit,
 394                 const ExecContext *context,
 395                 const ExecParameters *params,
 396                 ExecOutput output,
 397                 const char *ident,
 398                 int nfd,
 399                 uid_t uid,
 400                 gid_t gid) {
 401
 402         _cleanup_close_ int fd = -EBADF;
 403         int r;
 404
 405         assert(context);
 406         assert(params);
 407         assert(output < _EXEC_OUTPUT_MAX);
 408         assert(ident);
 409         assert(nfd >= 0);
 410
 411         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 412         if (fd < 0)
 413                 return -errno;
 414
 415         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 416         if (r < 0)
 417                 return r;
 418
 419         if (shutdown(fd, SHUT_RD) < 0)
 420                 return -errno;
 421
 422         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 423
 424         if (dprintf(fd,
 425                 "%s\n"
 426                 "%s\n"
 427                 "%i\n"
 428                 "%i\n"
 429                 "%i\n"
 430                 "%i\n"
 431                 "%i\n",
 432                 context->syslog_identifier ?: ident,
 433                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 434                 context->syslog_priority,
 435                 !!context->syslog_level_prefix,
 436                 false,
 437                 is_kmsg_output(output),
 438                 is_terminal_output(output)) < 0)
 439                 return -errno;
 440
 441         return move_fd(TAKE_FD(fd), nfd, false);
 442 }
 443
 444 static int open_terminal_as(const char *path, int flags, int nfd) {
 445         int fd;
 446
 447         assert(path);
 448         assert(nfd >= 0);
 449
 450         fd = open_terminal(path, flags | O_NOCTTY);
 451         if (fd < 0)
 452                 return fd;
 453
 454         return move_fd(fd, nfd, false);
 455 }
 456
 457 static int acquire_path(const char *path, int flags, mode_t mode) {
 458         _cleanup_close_ int fd = -EBADF;
 459         int r;
 460
 461         assert(path);
 462
 463         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 464                 flags |= O_CREAT;
 465
 466         fd = open(path, flags|O_NOCTTY, mode);
 467         if (fd >= 0)
 468                 return TAKE_FD(fd);
 469
 470         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 471                 return -errno;
 472
 473         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 474
 475         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 476         if (fd < 0)
 477                 return -errno;
 478
 479         r = connect_unix_path(fd, AT_FDCWD, path);
 480         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 481                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 482                  * wasn't an AF_UNIX socket after all */
 483                 return -ENXIO;
 484         if (r < 0)
 485                 return r;
 486
 487         if ((flags & O_ACCMODE) == O_RDONLY)
 488                 r = shutdown(fd, SHUT_WR);
 489         else if ((flags & O_ACCMODE) == O_WRONLY)
 490                 r = shutdown(fd, SHUT_RD);
 491         else
 492                 r = 0;
 493         if (r < 0)
 494                 return -errno;
 495
 496         return TAKE_FD(fd);
 497 }
 498
 499 static int fixup_input(
 500                 const ExecContext *context,
 501                 int socket_fd,
 502                 bool apply_tty_stdin) {
 503
 504         ExecInput std_input;
 505
 506         assert(context);
 507
 508         std_input = context->std_input;
 509
 510         if (is_terminal_input(std_input) && !apply_tty_stdin)
 511                 return EXEC_INPUT_NULL;
 512
 513         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 514                 return EXEC_INPUT_NULL;
 515
 516         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 517                 return EXEC_INPUT_NULL;
 518
 519         return std_input;
 520 }
 521
 522 static int fixup_output(ExecOutput output, int socket_fd) {
 523
 524         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 525                 return EXEC_OUTPUT_INHERIT;
 526
 527         return output;
 528 }
 529
 530 static int setup_input(
 531                 const ExecContext *context,
 532                 const ExecParameters *params,
 533                 int socket_fd,
 534                 const int named_iofds[static 3]) {
 535
 536         ExecInput i;
 537         int r;
 538
 539         assert(context);
 540         assert(params);
 541         assert(named_iofds);
 542
 543         if (params->stdin_fd >= 0) {
 544                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 545                         return -errno;
 546
 547                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 548                 if (isatty(STDIN_FILENO)) {
 549                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 550
 551                         (void) exec_context_tty_size(context, &rows, &cols);
 552                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 553                         (void) reset_terminal_fd(STDIN_FILENO, true);
 554                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 555                 }
 556
 557                 return STDIN_FILENO;
 558         }
 559
 560         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 561
 562         switch (i) {
 563
 564         case EXEC_INPUT_NULL:
 565                 return open_null_as(O_RDONLY, STDIN_FILENO);
 566
 567         case EXEC_INPUT_TTY:
 568         case EXEC_INPUT_TTY_FORCE:
 569         case EXEC_INPUT_TTY_FAIL: {
 570                 unsigned rows, cols;
 571                 int fd;
 572
 573                 fd = acquire_terminal(exec_context_tty_path(context),
 574                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 575                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 576                                                                   ACQUIRE_TERMINAL_WAIT,
 577                                       USEC_INFINITY);
 578                 if (fd < 0)
 579                         return fd;
 580
 581                 r = exec_context_tty_size(context, &rows, &cols);
 582                 if (r < 0)
 583                         return r;
 584
 585                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 586                 if (r < 0)
 587                         return r;
 588
 589                 return move_fd(fd, STDIN_FILENO, false);
 590         }
 591
 592         case EXEC_INPUT_SOCKET:
 593                 assert(socket_fd >= 0);
 594
 595                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 596
 597         case EXEC_INPUT_NAMED_FD:
 598                 assert(named_iofds[STDIN_FILENO] >= 0);
 599
 600                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 601                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 602
 603         case EXEC_INPUT_DATA: {
 604                 int fd;
 605
 606                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 607                 if (fd < 0)
 608                         return fd;
 609
 610                 return move_fd(fd, STDIN_FILENO, false);
 611         }
 612
 613         case EXEC_INPUT_FILE: {
 614                 bool rw;
 615                 int fd;
 616
 617                 assert(context->stdio_file[STDIN_FILENO]);
 618
 619                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 620                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 621
 622                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 623                 if (fd < 0)
 624                         return fd;
 625
 626                 return move_fd(fd, STDIN_FILENO, false);
 627         }
 628
 629         default:
 630                 assert_not_reached();
 631         }
 632 }
 633
 634 static bool can_inherit_stderr_from_stdout(
 635                 const ExecContext *context,
 636                 ExecOutput o,
 637                 ExecOutput e) {
 638
 639         assert(context);
 640
 641         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 642          * stderr fd */
 643
 644         if (e == EXEC_OUTPUT_INHERIT)
 645                 return true;
 646         if (e != o)
 647                 return false;
 648
 649         if (e == EXEC_OUTPUT_NAMED_FD)
 650                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 651
 652         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 653                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 654
 655         return true;
 656 }
 657
 658 static int setup_output(
 659                 const Unit *unit,
 660                 const ExecContext *context,
 661                 const ExecParameters *params,
 662                 int fileno,
 663                 int socket_fd,
 664                 const int named_iofds[static 3],
 665                 const char *ident,
 666                 uid_t uid,
 667                 gid_t gid,
 668                 dev_t *journal_stream_dev,
 669                 ino_t *journal_stream_ino) {
 670
 671         ExecOutput o;
 672         ExecInput i;
 673         int r;
 674
 675         assert(unit);
 676         assert(context);
 677         assert(params);
 678         assert(ident);
 679         assert(journal_stream_dev);
 680         assert(journal_stream_ino);
 681
 682         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 683
 684                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 685                         return -errno;
 686
 687                 return STDOUT_FILENO;
 688         }
 689
 690         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 691                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 692                         return -errno;
 693
 694                 return STDERR_FILENO;
 695         }
 696
 697         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 698         o = fixup_output(context->std_output, socket_fd);
 699
 700         if (fileno == STDERR_FILENO) {
 701                 ExecOutput e;
 702                 e = fixup_output(context->std_error, socket_fd);
 703
 704                 /* This expects the input and output are already set up */
 705
 706                 /* Don't change the stderr file descriptor if we inherit all
 707                  * the way and are not on a tty */
 708                 if (e == EXEC_OUTPUT_INHERIT &&
 709                     o == EXEC_OUTPUT_INHERIT &&
 710                     i == EXEC_INPUT_NULL &&
 711                     !is_terminal_input(context->std_input) &&
 712                     getppid() != 1)
 713                         return fileno;
 714
 715                 /* Duplicate from stdout if possible */
 716                 if (can_inherit_stderr_from_stdout(context, o, e))
 717                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 718
 719                 o = e;
 720
 721         } else if (o == EXEC_OUTPUT_INHERIT) {
 722                 /* If input got downgraded, inherit the original value */
 723                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 724                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 725
 726                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 727                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 728                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 729
 730                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 731                 if (getppid() != 1)
 732                         return fileno;
 733
 734                 /* We need to open /dev/null here anew, to get the right access mode. */
 735                 return open_null_as(O_WRONLY, fileno);
 736         }
 737
 738         switch (o) {
 739
 740         case EXEC_OUTPUT_NULL:
 741                 return open_null_as(O_WRONLY, fileno);
 742
 743         case EXEC_OUTPUT_TTY:
 744                 if (is_terminal_input(i))
 745                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 746
 747                 /* We don't reset the terminal if this is just about output */
 748                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 749
 750         case EXEC_OUTPUT_KMSG:
 751         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 752         case EXEC_OUTPUT_JOURNAL:
 753         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 754                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 755                 if (r < 0) {
 756                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 757                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 758                         r = open_null_as(O_WRONLY, fileno);
 759                 } else {
 760                         struct stat st;
 761
 762                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 763                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 764                          * services to detect whether they are connected to the journal or not.
 765                          *
 766                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 767                          * about STDERR as that's usually the best way to do logging. */
 768
 769                         if (fstat(fileno, &st) >= 0 &&
 770                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 771                                 *journal_stream_dev = st.st_dev;
 772                                 *journal_stream_ino = st.st_ino;
 773                         }
 774                 }
 775                 return r;
 776
 777         case EXEC_OUTPUT_SOCKET:
 778                 assert(socket_fd >= 0);
 779
 780                 return RET_NERRNO(dup2(socket_fd, fileno));
 781
 782         case EXEC_OUTPUT_NAMED_FD:
 783                 assert(named_iofds[fileno] >= 0);
 784
 785                 (void) fd_nonblock(named_iofds[fileno], false);
 786                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 787
 788         case EXEC_OUTPUT_FILE:
 789         case EXEC_OUTPUT_FILE_APPEND:
 790         case EXEC_OUTPUT_FILE_TRUNCATE: {
 791                 bool rw;
 792                 int fd, flags;
 793
 794                 assert(context->stdio_file[fileno]);
 795
 796                 rw = context->std_input == EXEC_INPUT_FILE &&
 797                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 798
 799                 if (rw)
 800                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 801
 802                 flags = O_WRONLY;
 803                 if (o == EXEC_OUTPUT_FILE_APPEND)
 804                         flags |= O_APPEND;
 805                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 806                         flags |= O_TRUNC;
 807
 808                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 809                 if (fd < 0)
 810                         return fd;
 811
 812                 return move_fd(fd, fileno, 0);
 813         }
 814
 815         default:
 816                 assert_not_reached();
 817         }
 818 }
 819
 820 static int chown_terminal(int fd, uid_t uid) {
 821         int r;
 822
 823         assert(fd >= 0);
 824
 825         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 826         if (isatty(fd) < 1) {
 827                 if (IN_SET(errno, EINVAL, ENOTTY))
 828                         return 0; /* not a tty */
 829
 830                 return -errno;
 831         }
 832
 833         /* This might fail. What matters are the results. */
 834         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 835         if (r < 0)
 836                 return r;
 837
 838         return 1;
 839 }
 840
 841 static int setup_confirm_stdio(
 842                 const ExecContext *context,
 843                 const char *vc,
 844                 int *ret_saved_stdin,
 845                 int *ret_saved_stdout) {
 846
 847         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 848         unsigned rows, cols;
 849         int r;
 850
 851         assert(ret_saved_stdin);
 852         assert(ret_saved_stdout);
 853
 854         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 855         if (saved_stdin < 0)
 856                 return -errno;
 857
 858         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 859         if (saved_stdout < 0)
 860                 return -errno;
 861
 862         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 863         if (fd < 0)
 864                 return fd;
 865
 866         r = chown_terminal(fd, getuid());
 867         if (r < 0)
 868                 return r;
 869
 870         r = reset_terminal_fd(fd, true);
 871         if (r < 0)
 872                 return r;
 873
 874         r = exec_context_tty_size(context, &rows, &cols);
 875         if (r < 0)
 876                 return r;
 877
 878         r = terminal_set_size_fd(fd, vc, rows, cols);
 879         if (r < 0)
 880                 return r;
 881
 882         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 883         TAKE_FD(fd);
 884         if (r < 0)
 885                 return r;
 886
 887         *ret_saved_stdin = TAKE_FD(saved_stdin);
 888         *ret_saved_stdout = TAKE_FD(saved_stdout);
 889         return 0;
 890 }
 891
 892 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 893         assert(err < 0);
 894
 895         if (err == -ETIMEDOUT)
 896                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 897         else {
 898                 errno = -err;
 899                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 900         }
 901 }
 902
 903 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 904         _cleanup_close_ int fd = -EBADF;
 905
 906         assert(vc);
 907
 908         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 909         if (fd < 0)
 910                 return;
 911
 912         write_confirm_error_fd(err, fd, u);
 913 }
 914
 915 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 916         int r = 0;
 917
 918         assert(saved_stdin);
 919         assert(saved_stdout);
 920
 921         release_terminal();
 922
 923         if (*saved_stdin >= 0)
 924                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 925                         r = -errno;
 926
 927         if (*saved_stdout >= 0)
 928                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 929                         r = -errno;
 930
 931         *saved_stdin = safe_close(*saved_stdin);
 932         *saved_stdout = safe_close(*saved_stdout);
 933
 934         return r;
 935 }
 936
 937 enum {
 938         CONFIRM_PRETEND_FAILURE = -1,
 939         CONFIRM_PRETEND_SUCCESS =  0,
 940         CONFIRM_EXECUTE = 1,
 941 };
 942
 943 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 944         int saved_stdout = -1, saved_stdin = -1, r;
 945         _cleanup_free_ char *e = NULL;
 946         char c;
 947
 948         /* For any internal errors, assume a positive response. */
 949         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 950         if (r < 0) {
 951                 write_confirm_error(r, vc, u);
 952                 return CONFIRM_EXECUTE;
 953         }
 954
 955         /* confirm_spawn might have been disabled while we were sleeping. */
 956         if (manager_is_confirm_spawn_disabled(u->manager)) {
 957                 r = 1;
 958                 goto restore_stdio;
 959         }
 960
 961         e = ellipsize(cmdline, 60, 100);
 962         if (!e) {
 963                 log_oom();
 964                 r = CONFIRM_EXECUTE;
 965                 goto restore_stdio;
 966         }
 967
 968         for (;;) {
 969                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 970                 if (r < 0) {
 971                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 972                         r = CONFIRM_EXECUTE;
 973                         goto restore_stdio;
 974                 }
 975
 976                 switch (c) {
 977                 case 'c':
 978                         printf("Resuming normal execution.\n");
 979                         manager_disable_confirm_spawn();
 980                         r = 1;
 981                         break;
 982                 case 'D':
 983                         unit_dump(u, stdout, "  ");
 984                         continue; /* ask again */
 985                 case 'f':
 986                         printf("Failing execution.\n");
 987                         r = CONFIRM_PRETEND_FAILURE;
 988                         break;
 989                 case 'h':
 990                         printf("  c - continue, proceed without asking anymore\n"
 991                                "  D - dump, show the state of the unit\n"
 992                                "  f - fail, don't execute the command and pretend it failed\n"
 993                                "  h - help\n"
 994                                "  i - info, show a short summary of the unit\n"
 995                                "  j - jobs, show jobs that are in progress\n"
 996                                "  s - skip, don't execute the command and pretend it succeeded\n"
 997                                "  y - yes, execute the command\n");
 998                         continue; /* ask again */
 999                 case 'i':
1000                         printf("  Description: %s\n"
1001                                "  Unit:        %s\n"
1002                                "  Command:     %s\n",
1003                                u->id, u->description, cmdline);
1004                         continue; /* ask again */
1005                 case 'j':
1006                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
1007                         continue; /* ask again */
1008                 case 'n':
1009                         /* 'n' was removed in favor of 'f'. */
1010                         printf("Didn't understand 'n', did you mean 'f'?\n");
1011                         continue; /* ask again */
1012                 case 's':
1013                         printf("Skipping execution.\n");
1014                         r = CONFIRM_PRETEND_SUCCESS;
1015                         break;
1016                 case 'y':
1017                         r = CONFIRM_EXECUTE;
1018                         break;
1019                 default:
1020                         assert_not_reached();
1021                 }
1022                 break;
1023         }
1024
1025 restore_stdio:
1026         restore_confirm_stdio(&saved_stdin, &saved_stdout);
1027         return r;
1028 }
1029
1030 static int get_fixed_user(const ExecContext *c, const char **user,
1031                           uid_t *uid, gid_t *gid,
1032                           const char **home, const char **shell) {
1033         int r;
1034         const char *name;
1035
1036         assert(c);
1037
1038         if (!c->user)
1039                 return 0;
1040
1041         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1042          * (i.e. are "/" or "/bin/nologin"). */
1043
1044         name = c->user;
1045         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1046         if (r < 0)
1047                 return r;
1048
1049         *user = name;
1050         return 0;
1051 }
1052
1053 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1054         int r;
1055         const char *name;
1056
1057         assert(c);
1058
1059         if (!c->group)
1060                 return 0;
1061
1062         name = c->group;
1063         r = get_group_creds(&name, gid, 0);
1064         if (r < 0)
1065                 return r;
1066
1067         *group = name;
1068         return 0;
1069 }
1070
1071 static int get_supplementary_groups(const ExecContext *c, const char *user,
1072                                     const char *group, gid_t gid,
1073                                     gid_t **supplementary_gids, int *ngids) {
1074         int r, k = 0;
1075         int ngroups_max;
1076         bool keep_groups = false;
1077         gid_t *groups = NULL;
1078         _cleanup_free_ gid_t *l_gids = NULL;
1079
1080         assert(c);
1081
1082         /*
1083          * If user is given, then lookup GID and supplementary groups list.
1084          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1085          * here and as early as possible so we keep the list of supplementary
1086          * groups of the caller.
1087          */
1088         if (user && gid_is_valid(gid) && gid != 0) {
1089                 /* First step, initialize groups from /etc/groups */
1090                 if (initgroups(user, gid) < 0)
1091                         return -errno;
1092
1093                 keep_groups = true;
1094         }
1095
1096         if (strv_isempty(c->supplementary_groups))
1097                 return 0;
1098
1099         /*
1100          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1101          * be positive, otherwise fail.
1102          */
1103         errno = 0;
1104         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1105         if (ngroups_max <= 0)
1106                 return errno_or_else(EOPNOTSUPP);
1107
1108         l_gids = new(gid_t, ngroups_max);
1109         if (!l_gids)
1110                 return -ENOMEM;
1111
1112         if (keep_groups) {
1113                 /*
1114                  * Lookup the list of groups that the user belongs to, we
1115                  * avoid NSS lookups here too for gid=0.
1116                  */
1117                 k = ngroups_max;
1118                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1119                         return -EINVAL;
1120         } else
1121                 k = 0;
1122
1123         STRV_FOREACH(i, c->supplementary_groups) {
1124                 const char *g;
1125
1126                 if (k >= ngroups_max)
1127                         return -E2BIG;
1128
1129                 g = *i;
1130                 r = get_group_creds(&g, l_gids+k, 0);
1131                 if (r < 0)
1132                         return r;
1133
1134                 k++;
1135         }
1136
1137         /*
1138          * Sets ngids to zero to drop all supplementary groups, happens
1139          * when we are under root and SupplementaryGroups= is empty.
1140          */
1141         if (k == 0) {
1142                 *ngids = 0;
1143                 return 0;
1144         }
1145
1146         /* Otherwise get the final list of supplementary groups */
1147         groups = memdup(l_gids, sizeof(gid_t) * k);
1148         if (!groups)
1149                 return -ENOMEM;
1150
1151         *supplementary_gids = groups;
1152         *ngids = k;
1153
1154         groups = NULL;
1155
1156         return 0;
1157 }
1158
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * 'ngids' is positive), then the real, effective and saved GID (only if 'gid' is valid). Returns 0 on
 * success or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1177
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set. Returns 1 if the securebits were changed, 0 if they already had the requested value,
 * and a negative errno if reading or writing them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        unsigned wanted = ((unsigned) before & ~mask) | bits;

        if (wanted != (unsigned) before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        /* Nothing to do, skip the write. */
        return 0;
}
1196
1197 static int enforce_user(
1198                 const ExecContext *context,
1199                 uid_t uid,
1200                 uint64_t capability_ambient_set) {
1201         assert(context);
1202         int r;
1203
1204         if (!uid_is_valid(uid))
1205                 return 0;
1206
1207         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1208          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1209          * case. */
1210
1211         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1212
1213                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1214                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1215                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1216                 if (r < 0)
1217                         return r;
1218         }
1219
1220         /* Second step: actually set the uids */
1221         if (setresuid(uid, uid, uid) < 0)
1222                 return -errno;
1223
1224         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1225          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1226          * outside of this call. */
1227         return 0;
1228 }
1229
1230 #if HAVE_PAM
1231
1232 static int null_conv(
1233                 int num_msg,
1234                 const struct pam_message **msg,
1235                 struct pam_response **resp,
1236                 void *appdata_ptr) {
1237
1238         /* We don't support conversations */
1239
1240         return PAM_CONV_ERR;
1241 }
1242
1243 #endif
1244
/* Opens a PAM session for the PAM service 'name' and the user 'user', and forks off a helper process
 * ("(sd-pam)") which waits for SIGTERM and then tears the PAM state down again.
 *
 * 'tty' is handed to PAM as PAM_TTY if set; otherwise the TTY name of stdin is used, if one can be
 * determined. 'uid'/'gid' are the credentials the helper process tries to drop to. 'fds'/'n_fds' are
 * file descriptors the helper process closes right after being forked off. The entries of '*env' are
 * exported into the PAM environment before the session is opened; on success '*env' is replaced by the
 * environment list reported by PAM.
 *
 * On failure a negative errno-style error is returned; failures reported by PAM itself are mapped to
 * -EPERM. When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Ask the PAM modules to stay quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path below does not try to end a handle that was never set up. */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we were passed into the PAM environment. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect via getppid() whether we are still around. */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying if the wait is interrupted. */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list acquired from PAM over to the caller. */
        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM failures (pam_code is set) and errno-style failures
         * (r is set). */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1461
1462 static void rename_process_from_path(const char *path) {
1463         _cleanup_free_ char *buf = NULL;
1464         const char *p;
1465
1466         assert(path);
1467
1468         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1469          * /bin/ps */
1470
1471         if (path_extract_filename(path, &buf) < 0) {
1472                 rename_process("(...)");
1473                 return;
1474         }
1475
1476         size_t l = strlen(buf);
1477         if (l > 8) {
1478                 /* The end of the process name is usually more interesting, since the first bit might just be
1479                  * "systemd-" */
1480                 p = buf + l - 8;
1481                 l = 8;
1482         } else
1483                 p = buf;
1484
1485         char process_name[11];
1486         process_name[0] = '(';
1487         memcpy(process_name+1, p, l);
1488         process_name[1+l] = ')';
1489         process_name[1+l+1] = 0;
1490
1491         rename_process(process_name);
1492 }
1493
1494 static bool context_has_address_families(const ExecContext *c) {
1495         assert(c);
1496
1497         return c->address_families_allow_list ||
1498                 !set_isempty(c->address_families);
1499 }
1500
1501 static bool context_has_syscall_filters(const ExecContext *c) {
1502         assert(c);
1503
1504         return c->syscall_allow_list ||
1505                 !hashmap_isempty(c->syscall_filter);
1506 }
1507
1508 static bool context_has_syscall_logs(const ExecContext *c) {
1509         assert(c);
1510
1511         return c->syscall_log_allow_list ||
1512                 !hashmap_isempty(c->syscall_log);
1513 }
1514
1515 static bool context_has_no_new_privileges(const ExecContext *c) {
1516         assert(c);
1517
1518         if (c->no_new_privileges)
1519                 return true;
1520
1521         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1522                 return false;
1523
1524         /* We need NNP if we have any form of seccomp and are unprivileged */
1525         return c->lock_personality ||
1526                 c->memory_deny_write_execute ||
1527                 c->private_devices ||
1528                 c->protect_clock ||
1529                 c->protect_hostname ||
1530                 c->protect_kernel_tunables ||
1531                 c->protect_kernel_modules ||
1532                 c->protect_kernel_logs ||
1533                 context_has_address_families(c) ||
1534                 exec_context_restrict_namespaces_set(c) ||
1535                 c->restrict_realtime ||
1536                 c->restrict_suid_sgid ||
1537                 !set_isempty(c->syscall_archs) ||
1538                 context_has_syscall_filters(c) ||
1539                 context_has_syscall_logs(c);
1540 }
1541
1542 static bool exec_context_has_credentials(const ExecContext *context) {
1543
1544         assert(context);
1545
1546         return !hashmap_isempty(context->set_credentials) ||
1547                 !hashmap_isempty(context->load_credentials);
1548 }
1549
1550 #if HAVE_SECCOMP
1551
1552 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1553
1554         if (is_seccomp_available())
1555                 return false;
1556
1557         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1558         return true;
1559 }
1560
1561 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1562         uint32_t negative_action, default_action, action;
1563         int r;
1564
1565         assert(u);
1566         assert(c);
1567
1568         if (!context_has_syscall_filters(c))
1569                 return 0;
1570
1571         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1572                 return 0;
1573
1574         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1575
1576         if (c->syscall_allow_list) {
1577                 default_action = negative_action;
1578                 action = SCMP_ACT_ALLOW;
1579         } else {
1580                 default_action = SCMP_ACT_ALLOW;
1581                 action = negative_action;
1582         }
1583
1584         if (needs_ambient_hack) {
1585                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1586                 if (r < 0)
1587                         return r;
1588         }
1589
1590         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1591 }
1592
1593 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1594 #ifdef SCMP_ACT_LOG
1595         uint32_t default_action, action;
1596 #endif
1597
1598         assert(u);
1599         assert(c);
1600
1601         if (!context_has_syscall_logs(c))
1602                 return 0;
1603
1604 #ifdef SCMP_ACT_LOG
1605         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1606                 return 0;
1607
1608         if (c->syscall_log_allow_list) {
1609                 /* Log nothing but the ones listed */
1610                 default_action = SCMP_ACT_ALLOW;
1611                 action = SCMP_ACT_LOG;
1612         } else {
1613                 /* Log everything but the ones listed */
1614                 default_action = SCMP_ACT_LOG;
1615                 action = SCMP_ACT_ALLOW;
1616         }
1617
1618         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1619 #else
1620         /* old libseccomp */
1621         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1622         return 0;
1623 #endif
1624 }
1625
1626 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1627         assert(u);
1628         assert(c);
1629
1630         if (set_isempty(c->syscall_archs))
1631                 return 0;
1632
1633         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1634                 return 0;
1635
1636         return seccomp_restrict_archs(c->syscall_archs);
1637 }
1638
1639 static int apply_address_families(const Unit* u, const ExecContext *c) {
1640         assert(u);
1641         assert(c);
1642
1643         if (!context_has_address_families(c))
1644                 return 0;
1645
1646         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1647                 return 0;
1648
1649         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1650 }
1651
1652 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1653         int r;
1654
1655         assert(u);
1656         assert(c);
1657
1658         if (!c->memory_deny_write_execute)
1659                 return 0;
1660
1661         /* use prctl() if kernel supports it (6.3) */
1662         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1663         if (r == 0) {
1664                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1665                 return 0;
1666         }
1667         if (r < 0 && errno != EINVAL)
1668                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1669         /* else use seccomp */
1670         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1671
1672         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1673                 return 0;
1674
1675         return seccomp_memory_deny_write_execute();
1676 }
1677
1678 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1679         assert(u);
1680         assert(c);
1681
1682         if (!c->restrict_realtime)
1683                 return 0;
1684
1685         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1686                 return 0;
1687
1688         return seccomp_restrict_realtime();
1689 }
1690
1691 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1692         assert(u);
1693         assert(c);
1694
1695         if (!c->restrict_suid_sgid)
1696                 return 0;
1697
1698         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1699                 return 0;
1700
1701         return seccomp_restrict_suid_sgid();
1702 }
1703
1704 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1705         assert(u);
1706         assert(c);
1707
1708         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1709          * let's protect even those systems where this is left on in the kernel. */
1710
1711         if (!c->protect_kernel_tunables)
1712                 return 0;
1713
1714         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1715                 return 0;
1716
1717         return seccomp_protect_sysctl();
1718 }
1719
1720 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1721         assert(u);
1722         assert(c);
1723
1724         /* Turn off module syscalls on ProtectKernelModules=yes */
1725
1726         if (!c->protect_kernel_modules)
1727                 return 0;
1728
1729         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1730                 return 0;
1731
1732         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1733 }
1734
1735 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1736         assert(u);
1737         assert(c);
1738
1739         if (!c->protect_kernel_logs)
1740                 return 0;
1741
1742         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1743                 return 0;
1744
1745         return seccomp_protect_syslog();
1746 }
1747
1748 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1749         assert(u);
1750         assert(c);
1751
1752         if (!c->protect_clock)
1753                 return 0;
1754
1755         if (skip_seccomp_unavailable(u, "ProtectClock="))
1756                 return 0;
1757
1758         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1759 }
1760
1761 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1762         assert(u);
1763         assert(c);
1764
1765         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1766
1767         if (!c->private_devices)
1768                 return 0;
1769
1770         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1771                 return 0;
1772
1773         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1774 }
1775
1776 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1777         assert(u);
1778         assert(c);
1779
1780         if (!exec_context_restrict_namespaces_set(c))
1781                 return 0;
1782
1783         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1784                 return 0;
1785
1786         return seccomp_restrict_namespaces(c->restrict_namespaces);
1787 }
1788
1789 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1790         unsigned long personality;
1791         int r;
1792
1793         assert(u);
1794         assert(c);
1795
1796         if (!c->lock_personality)
1797                 return 0;
1798
1799         if (skip_seccomp_unavailable(u, "LockPersonality="))
1800                 return 0;
1801
1802         personality = c->personality;
1803
1804         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1805         if (personality == PERSONALITY_INVALID) {
1806
1807                 r = opinionated_personality(&personality);
1808                 if (r < 0)
1809                         return r;
1810         }
1811
1812         return seccomp_lock_personality(personality);
1813 }
1814
1815 #endif
1816
1817 #if HAVE_LIBBPF
1818 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1819         assert(u);
1820         assert(c);
1821
1822         if (!exec_context_restrict_filesystems_set(c))
1823                 return 0;
1824
1825         if (!u->manager->restrict_fs) {
1826                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1827                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1828                 return 0;
1829         }
1830
1831         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1832 }
1833 #endif
1834
1835 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1836         assert(u);
1837         assert(c);
1838
1839         if (!c->protect_hostname)
1840                 return 0;
1841
1842         if (ns_type_supported(NAMESPACE_UTS)) {
1843                 if (unshare(CLONE_NEWUTS) < 0) {
1844                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1845                                 *ret_exit_status = EXIT_NAMESPACE;
1846                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1847                         }
1848
1849                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1850                 }
1851         } else
1852                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1853
1854 #if HAVE_SECCOMP
1855         int r;
1856
1857         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1858                 return 0;
1859
1860         r = seccomp_protect_hostname();
1861         if (r < 0) {
1862                 *ret_exit_status = EXIT_SECCOMP;
1863                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1864         }
1865 #endif
1866
1867         return 0;
1868 }
1869
1870 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1871         assert(idle_pipe);
1872
1873         idle_pipe[1] = safe_close(idle_pipe[1]);
1874         idle_pipe[2] = safe_close(idle_pipe[2]);
1875
1876         if (idle_pipe[0] >= 0) {
1877                 int r;
1878
1879                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1880
1881                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1882                         ssize_t n;
1883
1884                         /* Signal systemd that we are bored and want to continue. */
1885                         n = write(idle_pipe[3], "x", 1);
1886                         if (n > 0)
1887                                 /* Wait for systemd to react to the signal above. */
1888                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1889                 }
1890
1891                 idle_pipe[0] = safe_close(idle_pipe[0]);
1892
1893         }
1894
1895         idle_pipe[3] = safe_close(idle_pipe[3]);
1896 }
1897
/* Forward declaration (defined elsewhere in this file): build_environment() below calls this to look up
 * the environment variable name for each ExecDirectoryType. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1899
/* Builds the set of environment variables that the manager itself adds for an executed process.
 *
 * Depending on the parameters this may include: LISTEN_PID/LISTEN_FDS/LISTEN_FDNAMES (if n_fds > 0),
 * WATCHDOG_PID/WATCHDOG_USEC, SYSTEMD_NSS_DYNAMIC_BYPASS, HOME, LOGNAME/USER, SHELL, INVOCATION_ID, TERM,
 * JOURNAL_STREAM, LOG_NAMESPACE, one variable per configured directory type, CREDENTIALS_DIRECTORY,
 * SYSTEMD_EXEC_PID (always), and MEMORY_PRESSURE_WATCH/MEMORY_PRESSURE_WRITE.
 *
 * On success returns 0 and stores the newly allocated string array in *ret (ownership passes to the
 * caller). Returns -ENOMEM if any allocation fails; in that case *ret is left untouched and everything
 * collected so far is released through the _cleanup_strv_free_ attribute on our_env. */
static int build_environment(
                const Unit *u,
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                char **fdnames,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(u);
        assert(c);
        assert(p);
        assert(ret);

        /* Number of slots needed besides the per-directory-type variables: the code below stores at most
         * 18 such entries, and one more slot stays unused to act as the terminating NULL. No explicit
         * terminator is written; NOTE(review): this relies on new0() returning zero-filled memory. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* Describe the file descriptors passed to the process: our PID, their count, and their names
         * joined with ":". */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(fdnames, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog variables are only set if requested via the flag and a non-zero timeout is configured. */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (home) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Skip the 5 characters of "HOME=" so that only the path part is simplified, in place. */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        /* The user name is exported under both LOGNAME and USER. */
        if (username) {
                x = strjoin("LOGNAME=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (shell) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Skip the 6 characters of "SHELL=" so that only the path part is simplified, in place. */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(u->invocation_id)) {
                if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* Otherwise, for TTY names that are purely alphanumerical, look for a per-TTY
                         * "systemd.tty.term.<tty>" key on the kernel command line. */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Nothing found by either method: use the default for this TTY. */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* JOURNAL_STREAM is only set if both the device and the inode number are non-zero. */
        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* One variable per directory type that has a prefix, at least one configured item and an
         * environment variable name: its value is the ":"-separated list of prefix-joined paths. */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* The credentials directory is placed below the runtime directory prefix, named after the unit. */
        if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
                x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* This one is set unconditionally. */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* Unless the watch path is /dev/null, also pass along what to write to it: a string of
                 * the form "<type> <threshold> <window>". An unset (infinite) threshold is replaced by
                 * the default, any other value is clamped to the range 1…window. */
                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* The string is Base64-encoded including its trailing NUL byte (hence the + 1). */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly smaller: at least one slot must remain unused after the last entry. */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2138
2139 static int build_pass_environment(const ExecContext *c, char ***ret) {
2140         _cleanup_strv_free_ char **pass_env = NULL;
2141         size_t n_env = 0;
2142
2143         STRV_FOREACH(i, c->pass_environment) {
2144                 _cleanup_free_ char *x = NULL;
2145                 char *v;
2146
2147                 v = getenv(*i);
2148                 if (!v)
2149                         continue;
2150                 x = strjoin(*i, "=", v);
2151                 if (!x)
2152                         return -ENOMEM;
2153
2154                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2155                         return -ENOMEM;
2156
2157                 pass_env[n_env++] = TAKE_PTR(x);
2158                 pass_env[n_env] = NULL;
2159         }
2160
2161         *ret = TAKE_PTR(pass_env);
2162
2163         return 0;
2164 }
2165
2166 bool exec_needs_network_namespace(const ExecContext *context) {
2167         assert(context);
2168
2169         return context->private_network || context->network_namespace_path;
2170 }
2171
2172 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2173         assert(context);
2174
2175         return context->private_ipc || context->ipc_namespace_path;
2176 }
2177
2178 bool exec_needs_mount_namespace(
2179                 const ExecContext *context,
2180                 const ExecParameters *params,
2181                 const ExecRuntime *runtime) {
2182
2183         assert(context);
2184
2185         if (context->root_image)
2186                 return true;
2187
2188         if (!strv_isempty(context->read_write_paths) ||
2189             !strv_isempty(context->read_only_paths) ||
2190             !strv_isempty(context->inaccessible_paths) ||
2191             !strv_isempty(context->exec_paths) ||
2192             !strv_isempty(context->no_exec_paths))
2193                 return true;
2194
2195         if (context->n_bind_mounts > 0)
2196                 return true;
2197
2198         if (context->n_temporary_filesystems > 0)
2199                 return true;
2200
2201         if (context->n_mount_images > 0)
2202                 return true;
2203
2204         if (context->n_extension_images > 0)
2205                 return true;
2206
2207         if (!strv_isempty(context->extension_directories))
2208                 return true;
2209
2210         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2211                 return true;
2212
2213         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2214                 return true;
2215
2216         if (context->private_devices ||
2217             context->private_mounts > 0 ||
2218             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2219             context->protect_system != PROTECT_SYSTEM_NO ||
2220             context->protect_home != PROTECT_HOME_NO ||
2221             context->protect_kernel_tunables ||
2222             context->protect_kernel_modules ||
2223             context->protect_kernel_logs ||
2224             context->protect_control_groups ||
2225             context->protect_proc != PROTECT_PROC_DEFAULT ||
2226             context->proc_subset != PROC_SUBSET_ALL ||
2227             exec_needs_ipc_namespace(context))
2228                 return true;
2229
2230         if (context->root_directory) {
2231                 if (exec_context_get_effective_mount_apivfs(context))
2232                         return true;
2233
2234                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2235                         if (params && !params->prefix[t])
2236                                 continue;
2237
2238                         if (context->directories[t].n_items > 0)
2239                                 return true;
2240                 }
2241         }
2242
2243         if (context->dynamic_user &&
2244             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2245              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2246              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2247                 return true;
2248
2249         if (context->log_namespace)
2250                 return true;
2251
2252         return false;
2253 }
2254
/* Moves the calling process into a new user namespace and has a temporary child process write the
 * UID/GID maps for it.
 *
 * ouid/ogid are the original IDs (from before any user or group change), uid/gid the selected ones.
 * Returns 0 on success and a negative errno-style value on failure: either an error from a local call,
 * the error code reported by the child through the error pipe, or -EIO if the child misbehaved. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* counter value written to the eventfd to wake up the child */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                /* The child only ever writes to the error pipe, so drop the read end. */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent; this is best-effort, hence the result of
                 * the write is ignored. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read below sees EOF once the child's copy
         * is closed too. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO; /* a non-negative value is not a valid error code */
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() hands the PID over to the wait call, so the sigkill_waitp cleanup no longer acts on it. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2413
2414 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2415         assert(context);
2416
2417         if (!context->dynamic_user)
2418                 return false;
2419
2420         if (type == EXEC_DIRECTORY_CONFIGURATION)
2421                 return false;
2422
2423         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2424                 return false;
2425
2426         return true;
2427 }
2428
2429 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2430         _cleanup_free_ char *src_abs = NULL;
2431         int r;
2432
2433         assert(source);
2434
2435         src_abs = path_join(root, source);
2436         if (!src_abs)
2437                 return -ENOMEM;
2438
2439         STRV_FOREACH(dst, symlinks) {
2440                 _cleanup_free_ char *dst_abs = NULL;
2441
2442                 dst_abs = path_join(root, *dst);
2443                 if (!dst_abs)
2444                         return -ENOMEM;
2445
2446                 r = mkdir_parents_label(dst_abs, 0755);
2447                 if (r < 0)
2448                         return r;
2449
2450                 r = symlink_idempotent(src_abs, dst_abs, true);
2451                 if (r < 0)
2452                         return r;
2453         }
2454
2455         return 0;
2456 }
2457
2458 static int setup_exec_directory(
2459                 const ExecContext *context,
2460                 const ExecParameters *params,
2461                 uid_t uid,
2462                 gid_t gid,
2463                 ExecDirectoryType type,
2464                 bool needs_mount_namespace,
2465                 int *exit_status) {
2466
2467         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2468                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2469                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2470                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2471                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2472                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2473         };
2474         int r;
2475
2476         assert(context);
2477         assert(params);
2478         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2479         assert(exit_status);
2480
2481         if (!params->prefix[type])
2482                 return 0;
2483
2484         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2485                 if (!uid_is_valid(uid))
2486                         uid = 0;
2487                 if (!gid_is_valid(gid))
2488                         gid = 0;
2489         }
2490
2491         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2492                 _cleanup_free_ char *p = NULL, *pp = NULL;
2493
2494                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2495                 if (!p) {
2496                         r = -ENOMEM;
2497                         goto fail;
2498                 }
2499
2500                 r = mkdir_parents_label(p, 0755);
2501                 if (r < 0)
2502                         goto fail;
2503
2504                 if (exec_directory_is_private(context, type)) {
2505                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2506                          * case we want to avoid leaving a directory around fully accessible that is owned by
2507                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2508                          * trick used by container managers to prohibit host users to get access to files of
2509                          * the same UID in containers: we place everything inside a directory that has an
2510                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2511                          * for unprivileged host code. We then use fs namespacing to make this directory
2512                          * permeable for the service itself.
2513                          *
2514                          * Specifically: for a service which wants a special directory "foo/" we first create
2515                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2516                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2517                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2518                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2519                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2520                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2521                          * for the service and making sure it only gets access to the dirs it needs but no
2522                          * others. Tricky? Yes, absolutely, but it works!
2523                          *
2524                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2525                          * to be owned by the service itself.
2526                          *
2527                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2528                          * for sharing files or sockets with other services. */
2529
2530                         pp = path_join(params->prefix[type], "private");
2531                         if (!pp) {
2532                                 r = -ENOMEM;
2533                                 goto fail;
2534                         }
2535
2536                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2537                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2538                         if (r < 0)
2539                                 goto fail;
2540
2541                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2542                                 r = -ENOMEM;
2543                                 goto fail;
2544                         }
2545
2546                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2547                         r = mkdir_parents_label(pp, 0755);
2548                         if (r < 0)
2549                                 goto fail;
2550
2551                         if (is_dir(p, false) > 0 &&
2552                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2553
2554                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2555                                  * it over. Most likely the service has been upgraded from one that didn't use
2556                                  * DynamicUser=1, to one that does. */
2557
2558                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2559                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2560                                          exec_directory_type_to_string(type), p, pp);
2561
2562                                 if (rename(p, pp) < 0) {
2563                                         r = -errno;
2564                                         goto fail;
2565                                 }
2566                         } else {
2567                                 /* Otherwise, create the actual directory for the service */
2568
2569                                 r = mkdir_label(pp, context->directories[type].mode);
2570                                 if (r < 0 && r != -EEXIST)
2571                                         goto fail;
2572                         }
2573
2574                         if (!context->directories[type].items[i].only_create) {
2575                                 /* And link it up from the original place.
2576                                  * Notes
2577                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2578                                  *    the host, and a new one for the child namespace will be created later.
2579                                  * 2) It is not necessary to create this symlink when one of its parent
2580                                  *    directories is specified and already created. E.g.
2581                                  *        StateDirectory=foo foo/bar
2582                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2583                                  *        pp = "/var/lib/private/foo/bar"
2584                                  *        p = "/var/lib/foo/bar"
2585                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2586                                  *    we do not need to create the symlink, but we cannot create the symlink.
2587                                  *    See issue #24783. */
2588                                 r = symlink_idempotent(pp, p, true);
2589                                 if (r < 0)
2590                                         goto fail;
2591                         }
2592
2593                 } else {
2594                         _cleanup_free_ char *target = NULL;
2595
2596                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2597                             readlink_and_make_absolute(p, &target) >= 0) {
2598                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2599
2600                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2601                                  * by DynamicUser=1 (see above)?
2602                                  *
2603                                  * We do this for all directory types except for ConfigurationDirectory=,
2604                                  * since they all support the private/ symlink logic at least in some
2605                                  * configurations, see above. */
2606
2607                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2608                                 if (r < 0)
2609                                         goto fail;
2610
2611                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2612                                 if (!q) {
2613                                         r = -ENOMEM;
2614                                         goto fail;
2615                                 }
2616
2617                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2618                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2619                                 if (r < 0)
2620                                         goto fail;
2621
2622                                 if (path_equal(q_resolved, target_resolved)) {
2623
2624                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2625                                          * but is no longer. Let's move the directory back up. */
2626
2627                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2628                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2629                                                  exec_directory_type_to_string(type), q, p);
2630
2631                                         if (unlink(p) < 0) {
2632                                                 r = -errno;
2633                                                 goto fail;
2634                                         }
2635
2636                                         if (rename(q, p) < 0) {
2637                                                 r = -errno;
2638                                                 goto fail;
2639                                         }
2640                                 }
2641                         }
2642
2643                         r = mkdir_label(p, context->directories[type].mode);
2644                         if (r < 0) {
2645                                 if (r != -EEXIST)
2646                                         goto fail;
2647
2648                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2649                                         struct stat st;
2650
2651                                         /* Don't change the owner/access mode of the configuration directory,
2652                                          * as in the common case it is not written to by a service, and shall
2653                                          * not be writable. */
2654
2655                                         if (stat(p, &st) < 0) {
2656                                                 r = -errno;
2657                                                 goto fail;
2658                                         }
2659
2660                                         /* Still complain if the access mode doesn't match */
2661                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2662                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2663                                                             "(File system: %o %sMode: %o)",
2664                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2665                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2666
2667                                         continue;
2668                                 }
2669                         }
2670                 }
2671
2672                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2673                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2674                  * current UID/GID ownership.) */
2675                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2676                 if (r < 0)
2677                         goto fail;
2678
2679                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2680                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2681                  * assignments to exist. */
2682                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2683                 if (r < 0)
2684                         goto fail;
2685         }
2686
2687         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2688          * they are set up later, to allow configuring empty var/run/etc. */
2689         if (!needs_mount_namespace)
2690                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2691                         r = create_many_symlinks(params->prefix[type],
2692                                                  context->directories[type].items[i].path,
2693                                                  context->directories[type].items[i].symlinks);
2694                         if (r < 0)
2695                                 goto fail;
2696                 }
2697
2698         return 0;
2699
2700 fail:
2701         *exit_status = exit_status_table[type];
2702         return r;
2703 }
2704
2705 static int write_credential(
2706                 int dfd,
2707                 const char *id,
2708                 const void *data,
2709                 size_t size,
2710                 uid_t uid,
2711                 bool ownership_ok) {
2712
2713         _cleanup_(unlink_and_freep) char *tmp = NULL;
2714         _cleanup_close_ int fd = -EBADF;
2715         int r;
2716
2717         r = tempfn_random_child("", "cred", &tmp);
2718         if (r < 0)
2719                 return r;
2720
2721         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2722         if (fd < 0) {
2723                 tmp = mfree(tmp);
2724                 return -errno;
2725         }
2726
2727         r = loop_write(fd, data, size, /* do_poll = */ false);
2728         if (r < 0)
2729                 return r;
2730
2731         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2732                 return -errno;
2733
2734         if (uid_is_valid(uid) && uid != getuid()) {
2735                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2736                 if (r < 0) {
2737                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2738                                 return r;
2739
2740                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2741                                             * to express: that the user gets read access and nothing
2742                                             * else. But if the backing fs can't support that (e.g. ramfs)
2743                                             * then we can use file ownership instead. But that's only safe if
2744                                             * we can then re-mount the whole thing read-only, so that the
2745                                             * user can no longer chmod() the file to gain write access. */
2746                                 return r;
2747
2748                         if (fchown(fd, uid, GID_INVALID) < 0)
2749                                 return -errno;
2750                 }
2751         }
2752
2753         if (renameat(dfd, tmp, dfd, id) < 0)
2754                 return -errno;
2755
2756         tmp = mfree(tmp);
2757         return 0;
2758 }
2759
2760 static char **credential_search_path(
2761                 const ExecParameters *params,
2762                 bool encrypted) {
2763
2764         _cleanup_strv_free_ char **l = NULL;
2765
2766         assert(params);
2767
2768         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2769          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2770          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2771
2772         if (encrypted) {
2773                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2774                         return NULL;
2775
2776                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2777                         return NULL;
2778         }
2779
2780         if (params->received_credentials_directory)
2781                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2782                         return NULL;
2783
2784         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2785                 return NULL;
2786
2787         if (DEBUG_LOGGING) {
2788                 _cleanup_free_ char *t = strv_join(l, ":");
2789
2790                 log_debug("Credential search path is: %s", t);
2791         }
2792
2793         return TAKE_PTR(l);
2794 }
2795
2796 static int load_credential(
2797                 const ExecContext *context,
2798                 const ExecParameters *params,
2799                 const char *id,
2800                 const char *path,
2801                 bool encrypted,
2802                 const char *unit,
2803                 int read_dfd,
2804                 int write_dfd,
2805                 uid_t uid,
2806                 bool ownership_ok,
2807                 uint64_t *left) {
2808
2809         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2810         _cleanup_strv_free_ char **search_path = NULL;
2811         _cleanup_(erase_and_freep) char *data = NULL;
2812         _cleanup_free_ char *bindname = NULL;
2813         const char *source = NULL;
2814         bool missing_ok = true;
2815         size_t size, add, maxsz;
2816         int r;
2817
2818         assert(context);
2819         assert(params);
2820         assert(id);
2821         assert(path);
2822         assert(unit);
2823         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2824         assert(write_dfd >= 0);
2825         assert(left);
2826
2827         if (read_dfd >= 0) {
2828                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2829                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2830                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2831                  * open it. */
2832
2833                 if (!filename_is_valid(path)) /* safety check */
2834                         return -EINVAL;
2835
2836                 missing_ok = true;
2837                 source = path;
2838
2839         } else if (path_is_absolute(path)) {
2840                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2841                  * sockets */
2842
2843                 if (!path_is_valid(path)) /* safety check */
2844                         return -EINVAL;
2845
2846                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2847
2848                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2849                  * via the source socket address in case we read off an AF_UNIX socket. */
2850                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2851                         return -ENOMEM;
2852
2853                 missing_ok = false;
2854                 source = path;
2855
2856         } else if (credential_name_valid(path)) {
2857                 /* If this is a relative path, take it as credential name relative to the credentials
2858                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2859                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2860
2861                 search_path = credential_search_path(params, encrypted);
2862                 if (!search_path)
2863                         return -ENOMEM;
2864
2865                 missing_ok = true;
2866         } else
2867                 source = NULL;
2868
2869         if (encrypted)
2870                 flags |= READ_FULL_FILE_UNBASE64;
2871
2872         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2873
2874         if (search_path) {
2875                 STRV_FOREACH(d, search_path) {
2876                         _cleanup_free_ char *j = NULL;
2877
2878                         j = path_join(*d, path);
2879                         if (!j)
2880                                 return -ENOMEM;
2881
2882                         r = read_full_file_full(
2883                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2884                                         UINT64_MAX,
2885                                         maxsz,
2886                                         flags,
2887                                         NULL,
2888                                         &data, &size);
2889                         if (r != -ENOENT)
2890                                 break;
2891                 }
2892         } else if (source)
2893                 r = read_full_file_full(
2894                                 read_dfd, source,
2895                                 UINT64_MAX,
2896                                 maxsz,
2897                                 flags,
2898                                 bindname,
2899                                 &data, &size);
2900         else
2901                 r = -ENOENT;
2902
2903         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2904                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2905                  * will get clear errors if we don't pass such a missing credential on as they
2906                  * themselves will get ENOENT when trying to read them, which should not be much
2907                  * worse than when we handle the error here and make it fatal.
2908                  *
2909                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2910                  * we are fine, too. */
2911                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2912                 return 0;
2913         }
2914         if (r < 0)
2915                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2916
2917         if (encrypted) {
2918                 _cleanup_free_ void *plaintext = NULL;
2919                 size_t plaintext_size = 0;
2920
2921                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2922                 if (r < 0)
2923                         return r;
2924
2925                 free_and_replace(data, plaintext);
2926                 size = plaintext_size;
2927         }
2928
2929         add = strlen(id) + size;
2930         if (add > *left)
2931                 return -E2BIG;
2932
2933         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2934         if (r < 0)
2935                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2936
2937         *left -= add;
2938         return 0;
2939 }
2940
2941 struct load_cred_args {
2942         const ExecContext *context;
2943         const ExecParameters *params;
2944         bool encrypted;
2945         const char *unit;
2946         int dfd;
2947         uid_t uid;
2948         bool ownership_ok;
2949         uint64_t *left;
2950 };
2951
2952 static int load_cred_recurse_dir_cb(
2953                 RecurseDirEvent event,
2954                 const char *path,
2955                 int dir_fd,
2956                 int inode_fd,
2957                 const struct dirent *de,
2958                 const struct statx *sx,
2959                 void *userdata) {
2960
2961         struct load_cred_args *args = ASSERT_PTR(userdata);
2962         _cleanup_free_ char *sub_id = NULL;
2963         int r;
2964
2965         if (event != RECURSE_DIR_ENTRY)
2966                 return RECURSE_DIR_CONTINUE;
2967
2968         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2969                 return RECURSE_DIR_CONTINUE;
2970
2971         sub_id = strreplace(path, "/", "_");
2972         if (!sub_id)
2973                 return -ENOMEM;
2974
2975         if (!credential_name_valid(sub_id))
2976                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2977
2978         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2979                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2980                 return RECURSE_DIR_CONTINUE;
2981         }
2982         if (errno != ENOENT)
2983                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2984
2985         r = load_credential(
2986                         args->context,
2987                         args->params,
2988                         sub_id,
2989                         de->d_name,
2990                         args->encrypted,
2991                         args->unit,
2992                         dir_fd,
2993                         args->dfd,
2994                         args->uid,
2995                         args->ownership_ok,
2996                         args->left);
2997         if (r < 0)
2998                 return r;
2999
3000         return RECURSE_DIR_CONTINUE;
3001 }
3002
3003 static int acquire_credentials(
3004                 const ExecContext *context,
3005                 const ExecParameters *params,
3006                 const char *unit,
3007                 const char *p,
3008                 uid_t uid,
3009                 bool ownership_ok) {
3010
3011         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3012         _cleanup_close_ int dfd = -EBADF;
3013         ExecLoadCredential *lc;
3014         ExecSetCredential *sc;
3015         int r;
3016
3017         assert(context);
3018         assert(p);
3019
3020         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3021         if (dfd < 0)
3022                 return -errno;
3023
3024         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3025         HASHMAP_FOREACH(lc, context->load_credentials) {
3026                 _cleanup_close_ int sub_fd = -EBADF;
3027
3028                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3029                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3030                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
3031                  * propagate a credential passed to us from further up. */
3032
3033                 if (path_is_absolute(lc->path)) {
3034                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3035                         if (sub_fd < 0 && !IN_SET(errno,
3036                                                   ENOTDIR,  /* Not a directory */
3037                                                   ENOENT))  /* Doesn't exist? */
3038                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3039                 }
3040
3041                 if (sub_fd < 0)
3042                         /* Regular file (incl. a credential passed in from higher up) */
3043                         r = load_credential(
3044                                         context,
3045                                         params,
3046                                         lc->id,
3047                                         lc->path,
3048                                         lc->encrypted,
3049                                         unit,
3050                                         AT_FDCWD,
3051                                         dfd,
3052                                         uid,
3053                                         ownership_ok,
3054                                         &left);
3055                 else
3056                         /* Directory */
3057                         r = recurse_dir(
3058                                         sub_fd,
3059                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3060                                         /* statx_mask= */ 0,
3061                                         /* n_depth_max= */ UINT_MAX,
3062                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3063                                         load_cred_recurse_dir_cb,
3064                                         &(struct load_cred_args) {
3065                                                 .context = context,
3066                                                 .params = params,
3067                                                 .encrypted = lc->encrypted,
3068                                                 .unit = unit,
3069                                                 .dfd = dfd,
3070                                                 .uid = uid,
3071                                                 .ownership_ok = ownership_ok,
3072                                                 .left = &left,
3073                                         });
3074                 if (r < 0)
3075                         return r;
3076         }
3077
3078         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
3079          * them, so that they can act as a "default" if the same credential is specified multiple times. */
3080         HASHMAP_FOREACH(sc, context->set_credentials) {
3081                 _cleanup_(erase_and_freep) void *plaintext = NULL;
3082                 const char *data;
3083                 size_t size, add;
3084
3085                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3086                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3087                  * slow and involved, hence it's nice to be able to skip that if the credential already
3088                  * exists anyway. */
3089                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3090                         continue;
3091                 if (errno != ENOENT)
3092                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3093
3094                 if (sc->encrypted) {
3095                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3096                         if (r < 0)
3097                                 return r;
3098
3099                         data = plaintext;
3100                 } else {
3101                         data = sc->data;
3102                         size = sc->size;
3103                 }
3104
3105                 add = strlen(sc->id) + size;
3106                 if (add > left)
3107                         return -E2BIG;
3108
3109                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3110                 if (r < 0)
3111                         return r;
3112
3113                 left -= add;
3114         }
3115
3116         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
3117                 return -errno;
3118
3119         /* After we created all keys with the right perms, also make sure the credential store as a whole is
3120          * accessible */
3121
3122         if (uid_is_valid(uid) && uid != getuid()) {
3123                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3124                 if (r < 0) {
3125                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3126                                 return r;
3127
3128                         if (!ownership_ok)
3129                                 return r;
3130
3131                         if (fchown(dfd, uid, GID_INVALID) < 0)
3132                                 return -errno;
3133                 }
3134         }
3135
3136         return 0;
3137 }
3138
3139 static int setup_credentials_internal(
3140                 const ExecContext *context,
3141                 const ExecParameters *params,
3142                 const char *unit,
3143                 const char *final,        /* This is where the credential store shall eventually end up at */
3144                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
3145                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
3146                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3147                 uid_t uid) {
3148
3149         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3150                                    * if we mounted something; false if we definitely can't mount anything */
3151         bool final_mounted;
3152         const char *where;
3153
3154         assert(context);
3155         assert(final);
3156         assert(workspace);
3157
3158         if (reuse_workspace) {
3159                 r = path_is_mount_point(workspace, NULL, 0);
3160                 if (r < 0)
3161                         return r;
3162                 if (r > 0)
3163                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3164                 else
3165                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3166         } else
3167                 workspace_mounted = -1; /* ditto */
3168
3169         r = path_is_mount_point(final, NULL, 0);
3170         if (r < 0)
3171                 return r;
3172         if (r > 0) {
3173                 /* If the final place already has something mounted, we use that. If the workspace also has
3174                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3175                  * different). */
3176                 final_mounted = true;
3177
3178                 if (workspace_mounted < 0) {
3179                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
3180                          * the final version to the workspace, and make it writable, so that we can make
3181                          * changes */
3182
3183                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3184                         if (r < 0)
3185                                 return r;
3186
3187                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3188                         if (r < 0)
3189                                 return r;
3190
3191                         workspace_mounted = true;
3192                 }
3193         } else
3194                 final_mounted = false;
3195
3196         if (workspace_mounted < 0) {
3197                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3198                 for (int try = 0;; try++) {
3199
3200                         if (try == 0) {
3201                                 /* Try "ramfs" first, since it's not swap backed */
3202                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3203                                 if (r >= 0) {
3204                                         workspace_mounted = true;
3205                                         break;
3206                                 }
3207
3208                         } else if (try == 1) {
3209                                 _cleanup_free_ char *opts = NULL;
3210
3211                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3212                                         return -ENOMEM;
3213
3214                                 /* Fall back to "tmpfs" otherwise */
3215                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3216                                 if (r >= 0) {
3217                                         workspace_mounted = true;
3218                                         break;
3219                                 }
3220
3221                         } else {
3222                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3223                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3224                                 if (r < 0) {
3225                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3226                                                 return r;
3227
3228                                         if (must_mount) /* If we it's not OK to use the plain directory
3229                                                          * fallback, propagate all errors too */
3230                                                 return r;
3231
3232                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3233                                          * proceed for compat with container envs, and just use the final dir
3234                                          * as is. */
3235
3236                                         workspace_mounted = false;
3237                                         break;
3238                                 }
3239
3240                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3241                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3242                                 if (r < 0)
3243                                         return r;
3244
3245                                 workspace_mounted = true;
3246                                 break;
3247                         }
3248                 }
3249         }
3250
3251         assert(!must_mount || workspace_mounted > 0);
3252         where = workspace_mounted ? workspace : final;
3253
3254         (void) label_fix_full(AT_FDCWD, where, final, 0);
3255
3256         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3257         if (r < 0)
3258                 return r;
3259
3260         if (workspace_mounted) {
3261                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3262                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3263                 if (r < 0)
3264                         return r;
3265
3266                 /* And mount it to the final place, read-only */
3267                 if (final_mounted)
3268                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3269                 else
3270                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3271                 if (r < 0)
3272                         return r;
3273         } else {
3274                 _cleanup_free_ char *parent = NULL;
3275
3276                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3277                  * open access to the top-level credential directory and the per-service directory now */
3278
3279                 r = path_extract_directory(final, &parent);
3280                 if (r < 0)
3281                         return r;
3282                 if (chmod(parent, 0755) < 0)
3283                         return -errno;
3284         }
3285
3286         return 0;
3287 }
3288
3289 static int setup_credentials(
3290                 const ExecContext *context,
3291                 const ExecParameters *params,
3292                 const char *unit,
3293                 uid_t uid) {
3294
3295         _cleanup_free_ char *p = NULL, *q = NULL;
3296         int r;
3297
3298         assert(context);
3299         assert(params);
3300
3301         if (!exec_context_has_credentials(context))
3302                 return 0;
3303
3304         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3305                 return -EINVAL;
3306
3307         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3308          * and the subdir we mount over with a read-only file system readable by the service's user */
3309         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3310         if (!q)
3311                 return -ENOMEM;
3312
3313         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3314         if (r < 0 && r != -EEXIST)
3315                 return r;
3316
3317         p = path_join(q, unit);
3318         if (!p)
3319                 return -ENOMEM;
3320
3321         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3322         if (r < 0 && r != -EEXIST)
3323                 return r;
3324
3325         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3326         if (r < 0) {
3327                 _cleanup_free_ char *t = NULL, *u = NULL;
3328
3329                 /* If this is not a privilege or support issue then propagate the error */
3330                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3331                         return r;
3332
3333                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3334                  * it into place, so that users can't access half-initialized credential stores. */
3335                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3336                 if (!t)
3337                         return -ENOMEM;
3338
3339                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3340                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3341                  * after it is fully set up */
3342                 u = path_join(t, unit);
3343                 if (!u)
3344                         return -ENOMEM;
3345
3346                 FOREACH_STRING(i, t, u) {
3347                         r = mkdir_label(i, 0700);
3348                         if (r < 0 && r != -EEXIST)
3349                                 return r;
3350                 }
3351
3352                 r = setup_credentials_internal(
3353                                 context,
3354                                 params,
3355                                 unit,
3356                                 p,       /* final mount point */
3357                                 u,       /* temporary workspace to overmount */
3358                                 true,    /* reuse the workspace if it is already a mount */
3359                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3360                                 uid);
3361
3362                 (void) rmdir(u); /* remove the workspace again if we can. */
3363
3364                 if (r < 0)
3365                         return r;
3366
3367         } else if (r == 0) {
3368
3369                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3370                  * we can use the same directory for all cases, after turning off propagation. Question
3371                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3372                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3373                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3374                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3375                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3376                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3377                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3378                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3379                  * propagation on the former, and then overmount the latter.
3380                  *
3381                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3382                  * for this purpose, but there are few other candidates that work equally well for us, and
3383                  * given that the we do this in a privately namespaced short-lived single-threaded process
3384                  * that no one else sees this should be OK to do. */
3385
3386                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3387                 if (r < 0)
3388                         goto child_fail;
3389
3390                 r = setup_credentials_internal(
3391                                 context,
3392                                 params,
3393                                 unit,
3394                                 p,           /* final mount point */
3395                                 "/dev/shm",  /* temporary workspace to overmount */
3396                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3397                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3398                                 uid);
3399                 if (r < 0)
3400                         goto child_fail;
3401
3402                 _exit(EXIT_SUCCESS);
3403
3404         child_fail:
3405                 _exit(EXIT_FAILURE);
3406         }
3407
3408         return 0;
3409 }
3410
3411 #if ENABLE_SMACK
3412 static int setup_smack(
3413                 const Manager *manager,
3414                 const ExecContext *context,
3415                 int executable_fd) {
3416         int r;
3417
3418         assert(context);
3419         assert(executable_fd >= 0);
3420
3421         if (context->smack_process_label) {
3422                 r = mac_smack_apply_pid(0, context->smack_process_label);
3423                 if (r < 0)
3424                         return r;
3425         } else if (manager->default_smack_process_label) {
3426                 _cleanup_free_ char *exec_label = NULL;
3427
3428                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3429                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3430                         return r;
3431
3432                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3433                 if (r < 0)
3434                         return r;
3435         }
3436
3437         return 0;
3438 }
3439 #endif
3440
3441 static int compile_bind_mounts(
3442                 const ExecContext *context,
3443                 const ExecParameters *params,
3444                 BindMount **ret_bind_mounts,
3445                 size_t *ret_n_bind_mounts,
3446                 char ***ret_empty_directories) {
3447
3448         _cleanup_strv_free_ char **empty_directories = NULL;
3449         BindMount *bind_mounts = NULL;
3450         size_t n, h = 0;
3451         int r;
3452
3453         assert(context);
3454         assert(params);
3455         assert(ret_bind_mounts);
3456         assert(ret_n_bind_mounts);
3457         assert(ret_empty_directories);
3458
3459         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3460
3461         n = context->n_bind_mounts;
3462         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3463                 if (!params->prefix[t])
3464                         continue;
3465
3466                 for (size_t i = 0; i < context->directories[t].n_items; i++)
3467                         n += !context->directories[t].items[i].only_create;
3468         }
3469
3470         if (n <= 0) {
3471                 *ret_bind_mounts = NULL;
3472                 *ret_n_bind_mounts = 0;
3473                 *ret_empty_directories = NULL;
3474                 return 0;
3475         }
3476
3477         bind_mounts = new(BindMount, n);
3478         if (!bind_mounts)
3479                 return -ENOMEM;
3480
3481         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3482                 BindMount *item = context->bind_mounts + i;
3483                 _cleanup_free_ char *s = NULL, *d = NULL;
3484
3485                 s = strdup(item->source);
3486                 if (!s)
3487                         return -ENOMEM;
3488
3489                 d = strdup(item->destination);
3490                 if (!d)
3491                         return -ENOMEM;
3492
3493                 bind_mounts[h++] = (BindMount) {
3494                         .source = TAKE_PTR(s),
3495                         .destination = TAKE_PTR(d),
3496                         .read_only = item->read_only,
3497                         .recursive = item->recursive,
3498                         .ignore_enoent = item->ignore_enoent,
3499                 };
3500         }
3501
3502         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3503                 if (!params->prefix[t])
3504                         continue;
3505
3506                 if (context->directories[t].n_items == 0)
3507                         continue;
3508
3509                 if (exec_directory_is_private(context, t) &&
3510                     !exec_context_with_rootfs(context)) {
3511                         char *private_root;
3512
3513                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3514                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3515                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3516
3517                         private_root = path_join(params->prefix[t], "private");
3518                         if (!private_root)
3519                                 return -ENOMEM;
3520
3521                         r = strv_consume(&empty_directories, private_root);
3522                         if (r < 0)
3523                                 return r;
3524                 }
3525
3526                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3527                         _cleanup_free_ char *s = NULL, *d = NULL;
3528
3529                         /* When one of the parent directories is in the list, we cannot create the symlink
3530                          * for the child directory. See also the comments in setup_exec_directory(). */
3531                         if (context->directories[t].items[i].only_create)
3532                                 continue;
3533
3534                         if (exec_directory_is_private(context, t))
3535                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3536                         else
3537                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3538                         if (!s)
3539                                 return -ENOMEM;
3540
3541                         if (exec_directory_is_private(context, t) &&
3542                             exec_context_with_rootfs(context))
3543                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3544                                  * directory is not created on the root directory. So, let's bind-mount the directory
3545                                  * on the 'non-private' place. */
3546                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3547                         else
3548                                 d = strdup(s);
3549                         if (!d)
3550                                 return -ENOMEM;
3551
3552                         bind_mounts[h++] = (BindMount) {
3553                                 .source = TAKE_PTR(s),
3554                                 .destination = TAKE_PTR(d),
3555                                 .read_only = false,
3556                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3557                                 .recursive = true,
3558                                 .ignore_enoent = false,
3559                         };
3560                 }
3561         }
3562
3563         assert(h == n);
3564
3565         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3566         *ret_n_bind_mounts = n;
3567         *ret_empty_directories = TAKE_PTR(empty_directories);
3568
3569         return (int) n;
3570 }
3571
3572 /* ret_symlinks will contain a list of pairs src:dest that describes
3573  * the symlinks to create later on. For example, the symlinks needed
3574  * to safely give private directories to DynamicUser=1 users. */
3575 static int compile_symlinks(
3576                 const ExecContext *context,
3577                 const ExecParameters *params,
3578                 char ***ret_symlinks) {
3579
3580         _cleanup_strv_free_ char **symlinks = NULL;
3581         int r;
3582
3583         assert(context);
3584         assert(params);
3585         assert(ret_symlinks);
3586
3587         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3588                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3589                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3590
3591                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3592                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3593
3594                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3595                                 dst_abs = path_join(params->prefix[dt], *symlink);
3596                                 if (!src_abs || !dst_abs)
3597                                         return -ENOMEM;
3598
3599                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3600                                 if (r < 0)
3601                                         return r;
3602                         }
3603
3604                         if (!exec_directory_is_private(context, dt) ||
3605                             exec_context_with_rootfs(context) ||
3606                             context->directories[dt].items[i].only_create)
3607                                 continue;
3608
3609                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3610                         if (!private_path)
3611                                 return -ENOMEM;
3612
3613                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3614                         if (!path)
3615                                 return -ENOMEM;
3616
3617                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3618                         if (r < 0)
3619                                 return r;
3620                 }
3621         }
3622
3623         *ret_symlinks = TAKE_PTR(symlinks);
3624
3625         return 0;
3626 }
3627
3628 static bool insist_on_sandboxing(
3629                 const ExecContext *context,
3630                 const char *root_dir,
3631                 const char *root_image,
3632                 const BindMount *bind_mounts,
3633                 size_t n_bind_mounts) {
3634
3635         assert(context);
3636         assert(n_bind_mounts == 0 || bind_mounts);
3637
3638         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3639          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3640          * rearrange stuff in a way we cannot ignore gracefully. */
3641
3642         if (context->n_temporary_filesystems > 0)
3643                 return true;
3644
3645         if (root_dir || root_image)
3646                 return true;
3647
3648         if (context->n_mount_images > 0)
3649                 return true;
3650
3651         if (context->dynamic_user)
3652                 return true;
3653
3654         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3655                 return true;
3656
3657         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3658          * essential. */
3659         for (size_t i = 0; i < n_bind_mounts; i++)
3660                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3661                         return true;
3662
3663         if (context->log_namespace)
3664                 return true;
3665
3666         return false;
3667 }
3668
3669 static int apply_mount_namespace(
3670                 const Unit *u,
3671                 ExecCommandFlags command_flags,
3672                 const ExecContext *context,
3673                 const ExecParameters *params,
3674                 const ExecRuntime *runtime,
3675                 const char *memory_pressure_path,
3676                 char **error_path) {
3677
3678         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3679                         **read_write_paths_cleanup = NULL;
3680         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3681         const char *root_dir = NULL, *root_image = NULL;
3682         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3683                         *extension_dir = NULL;
3684         char **read_write_paths;
3685         NamespaceInfo ns_info;
3686         bool needs_sandboxing;
3687         BindMount *bind_mounts = NULL;
3688         size_t n_bind_mounts = 0;
3689         int r;
3690
3691         assert(context);
3692
3693         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3694
3695         if (params->flags & EXEC_APPLY_CHROOT) {
3696                 root_image = context->root_image;
3697
3698                 if (!root_image)
3699                         root_dir = context->root_directory;
3700         }
3701
3702         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3703         if (r < 0)
3704                 return r;
3705
3706         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3707         r = compile_symlinks(context, params, &symlinks);
3708         if (r < 0)
3709                 return r;
3710
3711         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3712          * service will need to write to it in order to start the notifications. */
3713         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3714                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3715                 if (!read_write_paths_cleanup)
3716                         return -ENOMEM;
3717
3718                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3719                 if (r < 0)
3720                         return r;
3721
3722                 read_write_paths = read_write_paths_cleanup;
3723         } else
3724                 read_write_paths = context->read_write_paths;
3725
3726         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3727         if (needs_sandboxing) {
3728                 /* The runtime struct only contains the parent of the private /tmp,
3729                  * which is non-accessible to world users. Inside of it there's a /tmp
3730                  * that is sticky, and that's the one we want to use here.
3731                  * This does not apply when we are using /run/systemd/empty as fallback. */
3732
3733                 if (context->private_tmp && runtime && runtime->shared) {
3734                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3735                                 tmp_dir = runtime->shared->tmp_dir;
3736                         else if (runtime->shared->tmp_dir)
3737                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3738
3739                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3740                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3741                         else if (runtime->shared->var_tmp_dir)
3742                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3743                 }
3744
3745                 ns_info = (NamespaceInfo) {
3746                         .ignore_protect_paths = false,
3747                         .private_dev = context->private_devices,
3748                         .protect_control_groups = context->protect_control_groups,
3749                         .protect_kernel_tunables = context->protect_kernel_tunables,
3750                         .protect_kernel_modules = context->protect_kernel_modules,
3751                         .protect_kernel_logs = context->protect_kernel_logs,
3752                         .protect_hostname = context->protect_hostname,
3753                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3754                         .protect_home = context->protect_home,
3755                         .protect_system = context->protect_system,
3756                         .protect_proc = context->protect_proc,
3757                         .proc_subset = context->proc_subset,
3758                         .private_network = exec_needs_network_namespace(context),
3759                         .private_ipc = exec_needs_ipc_namespace(context),
3760                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3761                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3762                 };
3763         } else if (!context->dynamic_user && root_dir)
3764                 /*
3765                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3766                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3767                  * fail if we are enable to apply the sandbox inside the mount namespace.
3768                  */
3769                 ns_info = (NamespaceInfo) {
3770                         .ignore_protect_paths = true,
3771                 };
3772         else
3773                 ns_info = (NamespaceInfo) {};
3774
3775         if (context->mount_propagation_flag == MS_SHARED)
3776                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3777
3778         if (exec_context_has_credentials(context) &&
3779             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3780             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3781                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3782                 if (!creds_path)
3783                         return -ENOMEM;
3784         }
3785
3786         if (MANAGER_IS_SYSTEM(u->manager)) {
3787                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3788                 if (!propagate_dir)
3789                         return -ENOMEM;
3790
3791                 incoming_dir = strdup("/run/systemd/incoming");
3792                 if (!incoming_dir)
3793                         return -ENOMEM;
3794
3795                 extension_dir = strdup("/run/systemd/unit-extensions");
3796                 if (!extension_dir)
3797                         return -ENOMEM;
3798         } else
3799                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3800                         return -ENOMEM;
3801
3802         r = setup_namespace(
3803                         root_dir,
3804                         root_image,
3805                         context->root_image_options,
3806                         context->root_image_policy ?: &image_policy_service,
3807                         &ns_info,
3808                         read_write_paths,
3809                         needs_sandboxing ? context->read_only_paths : NULL,
3810                         needs_sandboxing ? context->inaccessible_paths : NULL,
3811                         needs_sandboxing ? context->exec_paths : NULL,
3812                         needs_sandboxing ? context->no_exec_paths : NULL,
3813                         empty_directories,
3814                         symlinks,
3815                         bind_mounts,
3816                         n_bind_mounts,
3817                         context->temporary_filesystems,
3818                         context->n_temporary_filesystems,
3819                         context->mount_images,
3820                         context->n_mount_images,
3821                         context->mount_image_policy ?: &image_policy_service,
3822                         tmp_dir,
3823                         var_tmp_dir,
3824                         creds_path,
3825                         context->log_namespace,
3826                         context->mount_propagation_flag,
3827                         context->root_hash, context->root_hash_size, context->root_hash_path,
3828                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3829                         context->root_verity,
3830                         context->extension_images,
3831                         context->n_extension_images,
3832                         context->extension_image_policy ?: &image_policy_sysext,
3833                         context->extension_directories,
3834                         propagate_dir,
3835                         incoming_dir,
3836                         extension_dir,
3837                         root_dir || root_image ? params->notify_socket : NULL,
3838                         error_path);
3839
3840         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3841          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3842          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3843          * completely different execution environment. */
3844         if (r == -ENOANO) {
3845                 if (insist_on_sandboxing(
3846                                     context,
3847                                     root_dir, root_image,
3848                                     bind_mounts,
3849                                     n_bind_mounts))
3850                         return log_unit_debug_errno(u,
3851                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3852                                                     "Failed to set up namespace, and refusing to continue since "
3853                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3854                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3855                                                     n_bind_mounts,
3856                                                     context->n_temporary_filesystems,
3857                                                     yes_no(root_dir),
3858                                                     yes_no(root_image),
3859                                                     yes_no(context->dynamic_user));
3860
3861                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3862                 return 0;
3863         }
3864
3865         return r;
3866 }
3867
3868 static int apply_working_directory(
3869                 const ExecContext *context,
3870                 const ExecParameters *params,
3871                 const char *home,
3872                 int *exit_status) {
3873
3874         const char *d, *wd;
3875
3876         assert(context);
3877         assert(exit_status);
3878
3879         if (context->working_directory_home) {
3880
3881                 if (!home) {
3882                         *exit_status = EXIT_CHDIR;
3883                         return -ENXIO;
3884                 }
3885
3886                 wd = home;
3887
3888         } else
3889                 wd = empty_to_root(context->working_directory);
3890
3891         if (params->flags & EXEC_APPLY_CHROOT)
3892                 d = wd;
3893         else
3894                 d = prefix_roota(context->root_directory, wd);
3895
3896         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3897                 *exit_status = EXIT_CHDIR;
3898                 return -errno;
3899         }
3900
3901         return 0;
3902 }
3903
3904 static int apply_root_directory(
3905                 const ExecContext *context,
3906                 const ExecParameters *params,
3907                 const bool needs_mount_ns,
3908                 int *exit_status) {
3909
3910         assert(context);
3911         assert(exit_status);
3912
3913         if (params->flags & EXEC_APPLY_CHROOT)
3914                 if (!needs_mount_ns && context->root_directory)
3915                         if (chroot(context->root_directory) < 0) {
3916                                 *exit_status = EXIT_CHROOT;
3917                                 return -errno;
3918                         }
3919
3920         return 0;
3921 }
3922
3923 static int setup_keyring(
3924                 const Unit *u,
3925                 const ExecContext *context,
3926                 const ExecParameters *p,
3927                 uid_t uid, gid_t gid) {
3928
3929         key_serial_t keyring;
3930         int r = 0;
3931         uid_t saved_uid;
3932         gid_t saved_gid;
3933
3934         assert(u);
3935         assert(context);
3936         assert(p);
3937
3938         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3939          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3940          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3941          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3942          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3943          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3944
3945         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3946                 return 0;
3947
3948         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3949          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3950          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3951          * & group is just as nasty as acquiring a reference to the user keyring. */
3952
3953         saved_uid = getuid();
3954         saved_gid = getgid();
3955
3956         if (gid_is_valid(gid) && gid != saved_gid) {
3957                 if (setregid(gid, -1) < 0)
3958                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3959         }
3960
3961         if (uid_is_valid(uid) && uid != saved_uid) {
3962                 if (setreuid(uid, -1) < 0) {
3963                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3964                         goto out;
3965                 }
3966         }
3967
3968         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3969         if (keyring == -1) {
3970                 if (errno == ENOSYS)
3971                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3972                 else if (ERRNO_IS_PRIVILEGE(errno))
3973                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3974                 else if (errno == EDQUOT)
3975                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3976                 else
3977                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3978
3979                 goto out;
3980         }
3981
3982         /* When requested link the user keyring into the session keyring. */
3983         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3984
3985                 if (keyctl(KEYCTL_LINK,
3986                            KEY_SPEC_USER_KEYRING,
3987                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3988                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3989                         goto out;
3990                 }
3991         }
3992
3993         /* Restore uid/gid back */
3994         if (uid_is_valid(uid) && uid != saved_uid) {
3995                 if (setreuid(saved_uid, -1) < 0) {
3996                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3997                         goto out;
3998                 }
3999         }
4000
4001         if (gid_is_valid(gid) && gid != saved_gid) {
4002                 if (setregid(saved_gid, -1) < 0)
4003                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4004         }
4005
4006         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4007         if (!sd_id128_is_null(u->invocation_id)) {
4008                 key_serial_t key;
4009
4010                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4011                 if (key == -1)
4012                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4013                 else {
4014                         if (keyctl(KEYCTL_SETPERM, key,
4015                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4016                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4017                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4018                 }
4019         }
4020
4021 out:
4022         /* Revert back uid & gid for the last time, and exit */
4023         /* no extra logging, as only the first already reported error matters */
4024         if (getuid() != saved_uid)
4025                 (void) setreuid(saved_uid, -1);
4026
4027         if (getgid() != saved_gid)
4028                 (void) setregid(saved_gid, -1);
4029
4030         return r;
4031 }
4032
/* Appends the valid (i.e. non-negative) fds of the two-element 'pair' to 'array', in order, bumping the
 * element counter *n for each fd stored. The caller must ensure that 'array' has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
4043
4044 static int close_remaining_fds(
4045                 const ExecParameters *params,
4046                 const ExecRuntime *runtime,
4047                 int user_lookup_fd,
4048                 int socket_fd,
4049                 const int *fds, size_t n_fds) {
4050
4051         size_t n_dont_close = 0;
4052         int dont_close[n_fds + 12];
4053
4054         assert(params);
4055
4056         if (params->stdin_fd >= 0)
4057                 dont_close[n_dont_close++] = params->stdin_fd;
4058         if (params->stdout_fd >= 0)
4059                 dont_close[n_dont_close++] = params->stdout_fd;
4060         if (params->stderr_fd >= 0)
4061                 dont_close[n_dont_close++] = params->stderr_fd;
4062
4063         if (socket_fd >= 0)
4064                 dont_close[n_dont_close++] = socket_fd;
4065         if (n_fds > 0) {
4066                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4067                 n_dont_close += n_fds;
4068         }
4069
4070         if (runtime && runtime->shared) {
4071                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4072                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4073         }
4074
4075         if (runtime && runtime->dynamic_creds) {
4076                 if (runtime->dynamic_creds->user)
4077                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4078                 if (runtime->dynamic_creds->group)
4079                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4080         }
4081
4082         if (user_lookup_fd >= 0)
4083                 dont_close[n_dont_close++] = user_lookup_fd;
4084
4085         return close_all_fds(dont_close, n_dont_close);
4086 }
4087
4088 static int send_user_lookup(
4089                 Unit *unit,
4090                 int user_lookup_fd,
4091                 uid_t uid,
4092                 gid_t gid) {
4093
4094         assert(unit);
4095
4096         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4097          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4098          * specified. */
4099
4100         if (user_lookup_fd < 0)
4101                 return 0;
4102
4103         if (!uid_is_valid(uid) && !gid_is_valid(gid))
4104                 return 0;
4105
4106         if (writev(user_lookup_fd,
4107                (struct iovec[]) {
4108                            IOVEC_MAKE(&uid, sizeof(uid)),
4109                            IOVEC_MAKE(&gid, sizeof(gid)),
4110                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4111                 return -errno;
4112
4113         return 0;
4114 }
4115
4116 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4117         int r;
4118
4119         assert(c);
4120         assert(home);
4121         assert(buf);
4122
4123         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4124
4125         if (*home)
4126                 return 0;
4127
4128         if (!c->working_directory_home)
4129                 return 0;
4130
4131         r = get_home_dir(buf);
4132         if (r < 0)
4133                 return r;
4134
4135         *home = *buf;
4136         return 1;
4137 }
4138
4139 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4140         _cleanup_strv_free_ char ** list = NULL;
4141         int r;
4142
4143         assert(c);
4144         assert(p);
4145         assert(ret);
4146
4147         assert(c->dynamic_user);
4148
4149         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4150          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4151          * directories. */
4152
4153         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4154                 if (t == EXEC_DIRECTORY_CONFIGURATION)
4155                         continue;
4156
4157                 if (!p->prefix[t])
4158                         continue;
4159
4160                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4161                         char *e;
4162
4163                         if (exec_directory_is_private(c, t))
4164                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4165                         else
4166                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4167                         if (!e)
4168                                 return -ENOMEM;
4169
4170                         r = strv_consume(&list, e);
4171                         if (r < 0)
4172                                 return r;
4173                 }
4174         }
4175
4176         *ret = TAKE_PTR(list);
4177
4178         return 0;
4179 }
4180
4181 static int exec_parameters_get_cgroup_path(
4182                 const ExecParameters *params,
4183                 const CGroupContext *c,
4184                 char **ret) {
4185
4186         const char *subgroup = NULL;
4187         char *p;
4188
4189         assert(params);
4190         assert(ret);
4191
4192         if (!params->cgroup_path)
4193                 return -EINVAL;
4194
4195         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4196          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4197          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4198          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4199          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4200          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4201          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4202          * flag, which is only passed for the former statements, not for the latter. */
4203
4204         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4205                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4206                         subgroup = ".control";
4207                 else
4208                         subgroup = c->delegate_subgroup;
4209         }
4210
4211         if (subgroup)
4212                 p = path_join(params->cgroup_path, subgroup);
4213         else
4214                 p = strdup(params->cgroup_path);
4215         if (!p)
4216                 return -ENOMEM;
4217
4218         *ret = p;
4219         return !!subgroup;
4220 }
4221
4222 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4223         _cleanup_(cpu_set_reset) CPUSet s = {};
4224         int r;
4225
4226         assert(c);
4227         assert(ret);
4228
4229         if (!c->numa_policy.nodes.set) {
4230                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4231                 return 0;
4232         }
4233
4234         r = numa_to_cpu_set(&c->numa_policy, &s);
4235         if (r < 0)
4236                 return r;
4237
4238         cpu_set_reset(ret);
4239
4240         return cpu_set_add_all(ret, &s);
4241 }
4242
4243 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4244         assert(c);
4245
4246         return c->cpu_affinity_from_numa;
4247 }
4248
4249 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4250         int r;
4251
4252         assert(fds);
4253         assert(n_fds);
4254         assert(*n_fds < fds_size);
4255         assert(ret_fd);
4256
4257         if (fd < 0) {
4258                 *ret_fd = -EBADF;
4259                 return 0;
4260         }
4261
4262         if (fd < 3 + (int) *n_fds) {
4263                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4264                  * the fds we pass to the process (or which are closed only during execve). */
4265
4266                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4267                 if (r < 0)
4268                         return -errno;
4269
4270                 close_and_replace(fd, r);
4271         }
4272
4273         *ret_fd = fds[*n_fds] = fd;
4274         (*n_fds) ++;
4275         return 1;
4276 }
4277
4278 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4279         union sockaddr_union addr = {
4280                 .un.sun_family = AF_UNIX,
4281         };
4282         socklen_t sa_len;
4283         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4284         int r;
4285
4286         assert(u);
4287         assert(of);
4288         assert(ofd >= 0);
4289
4290         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4291         if (r < 0)
4292                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4293
4294         sa_len = r;
4295
4296         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4297                 _cleanup_close_ int fd = -EBADF;
4298
4299                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4300                 if (fd < 0)
4301                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4302
4303                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4304                 if (r == -EPROTOTYPE)
4305                         continue;
4306                 if (r < 0)
4307                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4308
4309                 return TAKE_FD(fd);
4310         }
4311
4312         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4313 }
4314
4315 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4316         struct stat st;
4317         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4318
4319         assert(u);
4320         assert(of);
4321
4322         ofd = open(of->path, O_PATH | O_CLOEXEC);
4323         if (ofd < 0)
4324                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4325
4326         if (fstat(ofd, &st) < 0)
4327                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4328
4329         if (S_ISSOCK(st.st_mode)) {
4330                 fd = connect_unix_harder(u, of, ofd);
4331                 if (fd < 0)
4332                         return fd;
4333
4334                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4335                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4336                                                     of->path);
4337
4338                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4339         } else {
4340                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4341                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4342                         flags |= O_APPEND;
4343                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4344                         flags |= O_TRUNC;
4345
4346                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4347                 if (fd < 0)
4348                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4349
4350                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4351         }
4352
4353         return TAKE_FD(fd);
4354 }
4355
4356 static int collect_open_file_fds(
4357                 Unit *u,
4358                 OpenFile* open_files,
4359                 int **fds,
4360                 char ***fdnames,
4361                 size_t *n_fds) {
4362         int r;
4363
4364         assert(u);
4365         assert(fds);
4366         assert(fdnames);
4367         assert(n_fds);
4368
4369         LIST_FOREACH(open_files, of, open_files) {
4370                 _cleanup_close_ int fd = -EBADF;
4371
4372                 fd = get_open_file_fd(u, of);
4373                 if (fd < 0) {
4374                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4375                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4376                                 continue;
4377                         }
4378
4379                         return fd;
4380                 }
4381
4382                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4383                         return -ENOMEM;
4384
4385                 r = strv_extend(fdnames, of->fdname);
4386                 if (r < 0)
4387                         return r;
4388
4389                 (*fds)[*n_fds] = TAKE_FD(fd);
4390
4391                 (*n_fds)++;
4392         }
4393
4394         return 0;
4395 }
4396
4397 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4398         assert(unit);
4399         assert(msg);
4400         assert(executable);
4401
4402         if (!DEBUG_LOGGING)
4403                 return;
4404
4405         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4406
4407         log_unit_struct(unit, LOG_DEBUG,
4408                         "EXECUTABLE=%s", executable,
4409                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4410                         LOG_UNIT_INVOCATION_ID(unit));
4411 }
4412
4413 static bool exec_context_need_unprivileged_private_users(const ExecContext *context, const Manager *manager) {
4414         assert(context);
4415         assert(manager);
4416
4417         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4418          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4419          * (system manager) then we have privileges and don't need this. */
4420         if (MANAGER_IS_SYSTEM(manager))
4421                 return false;
4422
4423         return context->private_users ||
4424                context->private_tmp ||
4425                context->private_devices ||
4426                context->private_network ||
4427                context->network_namespace_path ||
4428                context->private_ipc ||
4429                context->ipc_namespace_path ||
4430                context->private_mounts ||
4431                context->mount_apivfs ||
4432                context->n_bind_mounts > 0 ||
4433                context->n_temporary_filesystems > 0 ||
4434                context->root_directory ||
4435                !strv_isempty(context->extension_directories) ||
4436                context->protect_system != PROTECT_SYSTEM_NO ||
4437                context->protect_home != PROTECT_HOME_NO ||
4438                context->protect_kernel_tunables ||
4439                context->protect_kernel_modules ||
4440                context->protect_kernel_logs ||
4441                context->protect_control_groups ||
4442                context->protect_clock ||
4443                context->protect_hostname ||
4444                !strv_isempty(context->read_write_paths) ||
4445                !strv_isempty(context->read_only_paths) ||
4446                !strv_isempty(context->inaccessible_paths) ||
4447                !strv_isempty(context->exec_paths) ||
4448                !strv_isempty(context->no_exec_paths);
4449 }
4450
4451 static int exec_child(
4452                 Unit *unit,
4453                 const ExecCommand *command,
4454                 const ExecContext *context,
4455                 const ExecParameters *params,
4456                 ExecRuntime *runtime,
4457                 const CGroupContext *cgroup_context,
4458                 int socket_fd,
4459                 const int named_iofds[static 3],
4460                 int *params_fds,
4461                 size_t n_socket_fds,
4462                 size_t n_storage_fds,
4463                 char **files_env,
4464                 int user_lookup_fd,
4465                 int *exit_status) {
4466
4467         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4468         int r, ngids = 0, exec_fd;
4469         _cleanup_free_ gid_t *supplementary_gids = NULL;
4470         const char *username = NULL, *groupname = NULL;
4471         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4472         const char *home = NULL, *shell = NULL;
4473         char **final_argv = NULL;
4474         dev_t journal_stream_dev = 0;
4475         ino_t journal_stream_ino = 0;
4476         bool userns_set_up = false;
4477         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4478                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4479                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4480                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4481 #if HAVE_SELINUX
4482         _cleanup_free_ char *mac_selinux_context_net = NULL;
4483         bool use_selinux = false;
4484 #endif
4485 #if ENABLE_SMACK
4486         bool use_smack = false;
4487 #endif
4488 #if HAVE_APPARMOR
4489         bool use_apparmor = false;
4490 #endif
4491         uid_t saved_uid = getuid();
4492         gid_t saved_gid = getgid();
4493         uid_t uid = UID_INVALID;
4494         gid_t gid = GID_INVALID;
4495         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4496                n_keep_fds; /* total number of fds not to close */
4497         int secure_bits;
4498         _cleanup_free_ gid_t *gids_after_pam = NULL;
4499         int ngids_after_pam = 0;
4500         _cleanup_free_ int *fds = NULL;
4501         _cleanup_strv_free_ char **fdnames = NULL;
4502
4503         assert(unit);
4504         assert(command);
4505         assert(context);
4506         assert(params);
4507         assert(exit_status);
4508
4509         /* Explicitly test for CVE-2021-4034 inspired invocations */
4510         assert(command->path);
4511         assert(!strv_isempty(command->argv));
4512
4513         rename_process_from_path(command->path);
4514
4515         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4516          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4517          * both of which will be demoted to SIG_DFL. */
4518         (void) default_signals(SIGNALS_CRASH_HANDLER,
4519                                SIGNALS_IGNORE);
4520
4521         if (context->ignore_sigpipe)
4522                 (void) ignore_signals(SIGPIPE);
4523
4524         r = reset_signal_mask();
4525         if (r < 0) {
4526                 *exit_status = EXIT_SIGNAL_MASK;
4527                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4528         }
4529
4530         if (params->idle_pipe)
4531                 do_idle_pipe_dance(params->idle_pipe);
4532
4533         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4534          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4535          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4536          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4537
4538         log_forget_fds();
4539         log_set_open_when_needed(true);
4540         log_settle_target();
4541
4542         /* In case anything used libc syslog(), close this here, too */
4543         closelog();
4544
4545         fds = newdup(int, params_fds, n_fds);
4546         if (!fds) {
4547                 *exit_status = EXIT_MEMORY;
4548                 return log_oom();
4549         }
4550
4551         fdnames = strv_copy((char**) params->fd_names);
4552         if (!fdnames) {
4553                 *exit_status = EXIT_MEMORY;
4554                 return log_oom();
4555         }
4556
4557         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4558         if (r < 0) {
4559                 *exit_status = EXIT_FDS;
4560                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4561         }
4562
4563         int keep_fds[n_fds + 3];
4564         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4565         n_keep_fds = n_fds;
4566
4567         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4568         if (r < 0) {
4569                 *exit_status = EXIT_FDS;
4570                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4571         }
4572
4573 #if HAVE_LIBBPF
4574         if (unit->manager->restrict_fs) {
4575                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4576                 if (bpf_map_fd < 0) {
4577                         *exit_status = EXIT_FDS;
4578                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4579                 }
4580
4581                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4582                 if (r < 0) {
4583                         *exit_status = EXIT_FDS;
4584                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4585                 }
4586         }
4587 #endif
4588
4589         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4590         if (r < 0) {
4591                 *exit_status = EXIT_FDS;
4592                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4593         }
4594
4595         if (!context->same_pgrp &&
4596             setsid() < 0) {
4597                 *exit_status = EXIT_SETSID;
4598                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4599         }
4600
4601         exec_context_tty_reset(context, params);
4602
4603         if (unit_shall_confirm_spawn(unit)) {
4604                 _cleanup_free_ char *cmdline = NULL;
4605
4606                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4607                 if (!cmdline) {
4608                         *exit_status = EXIT_MEMORY;
4609                         return log_oom();
4610                 }
4611
4612                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4613                 if (r != CONFIRM_EXECUTE) {
4614                         if (r == CONFIRM_PRETEND_SUCCESS) {
4615                                 *exit_status = EXIT_SUCCESS;
4616                                 return 0;
4617                         }
4618                         *exit_status = EXIT_CONFIRM;
4619                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4620                                                     "Execution cancelled by the user");
4621                 }
4622         }
4623
4624         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4625          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4626          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4627          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4628          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4629         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4630             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(unit->manager->runtime_scope), true) != 0) {
4631                 *exit_status = EXIT_MEMORY;
4632                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4633         }
4634
4635         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4636                 _cleanup_strv_free_ char **suggested_paths = NULL;
4637
4638                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4639                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4640                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4641                         *exit_status = EXIT_USER;
4642                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4643                 }
4644
4645                 r = compile_suggested_paths(context, params, &suggested_paths);
4646                 if (r < 0) {
4647                         *exit_status = EXIT_MEMORY;
4648                         return log_oom();
4649                 }
4650
4651                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4652                 if (r < 0) {
4653                         *exit_status = EXIT_USER;
4654                         if (r == -EILSEQ)
4655                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4656                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4657                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4658                 }
4659
4660                 if (!uid_is_valid(uid)) {
4661                         *exit_status = EXIT_USER;
4662                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4663                 }
4664
4665                 if (!gid_is_valid(gid)) {
4666                         *exit_status = EXIT_USER;
4667                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4668                 }
4669
4670                 if (runtime->dynamic_creds->user)
4671                         username = runtime->dynamic_creds->user->name;
4672
4673         } else {
4674                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4675                 if (r < 0) {
4676                         *exit_status = EXIT_USER;
4677                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4678                 }
4679
4680                 r = get_fixed_group(context, &groupname, &gid);
4681                 if (r < 0) {
4682                         *exit_status = EXIT_GROUP;
4683                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4684                 }
4685         }
4686
4687         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4688         r = get_supplementary_groups(context, username, groupname, gid,
4689                                      &supplementary_gids, &ngids);
4690         if (r < 0) {
4691                 *exit_status = EXIT_GROUP;
4692                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4693         }
4694
4695         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4696         if (r < 0) {
4697                 *exit_status = EXIT_USER;
4698                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4699         }
4700
4701         user_lookup_fd = safe_close(user_lookup_fd);
4702
4703         r = acquire_home(context, uid, &home, &home_buffer);
4704         if (r < 0) {
4705                 *exit_status = EXIT_CHDIR;
4706                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4707         }
4708
4709         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4710         if (socket_fd >= 0)
4711                 (void) fd_nonblock(socket_fd, false);
4712
4713         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4714          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4715         if (params->cgroup_path) {
4716                 _cleanup_free_ char *p = NULL;
4717
4718                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4719                 if (r < 0) {
4720                         *exit_status = EXIT_CGROUP;
4721                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4722                 }
4723
4724                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4725                 if (r == -EUCLEAN) {
4726                         *exit_status = EXIT_CGROUP;
4727                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4728                                                     "because the cgroup or one of its parents or "
4729                                                     "siblings is in the threaded mode: %m", p);
4730                 }
4731                 if (r < 0) {
4732                         *exit_status = EXIT_CGROUP;
4733                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4734                 }
4735         }
4736
4737         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4738                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4739                 if (r < 0) {
4740                         *exit_status = EXIT_NETWORK;
4741                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4742                 }
4743         }
4744
4745         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4746                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4747                 if (r < 0) {
4748                         *exit_status = EXIT_NAMESPACE;
4749                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4750                 }
4751         }
4752
4753         r = setup_input(context, params, socket_fd, named_iofds);
4754         if (r < 0) {
4755                 *exit_status = EXIT_STDIN;
4756                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4757         }
4758
4759         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4760         if (r < 0) {
4761                 *exit_status = EXIT_STDOUT;
4762                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4763         }
4764
4765         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4766         if (r < 0) {
4767                 *exit_status = EXIT_STDERR;
4768                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4769         }
4770
4771         if (context->oom_score_adjust_set) {
4772                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4773                  * prohibit write access to this file, and we shouldn't trip up over that. */
4774                 r = set_oom_score_adjust(context->oom_score_adjust);
4775                 if (ERRNO_IS_PRIVILEGE(r))
4776                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4777                 else if (r < 0) {
4778                         *exit_status = EXIT_OOM_ADJUST;
4779                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4780                 }
4781         }
4782
4783         if (context->coredump_filter_set) {
4784                 r = set_coredump_filter(context->coredump_filter);
4785                 if (ERRNO_IS_PRIVILEGE(r))
4786                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4787                 else if (r < 0)
4788                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4789         }
4790
4791         if (context->nice_set) {
4792                 r = setpriority_closest(context->nice);
4793                 if (r < 0)
4794                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4795         }
4796
4797         if (context->cpu_sched_set) {
4798                 struct sched_param param = {
4799                         .sched_priority = context->cpu_sched_priority,
4800                 };
4801
4802                 r = sched_setscheduler(0,
4803                                        context->cpu_sched_policy |
4804                                        (context->cpu_sched_reset_on_fork ?
4805                                         SCHED_RESET_ON_FORK : 0),
4806                                        &param);
4807                 if (r < 0) {
4808                         *exit_status = EXIT_SETSCHEDULER;
4809                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4810                 }
4811         }
4812
4813         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4814                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4815                 const CPUSet *cpu_set;
4816
4817                 if (context->cpu_affinity_from_numa) {
4818                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4819                         if (r < 0) {
4820                                 *exit_status = EXIT_CPUAFFINITY;
4821                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4822                         }
4823
4824                         cpu_set = &converted_cpu_set;
4825                 } else
4826                         cpu_set = &context->cpu_set;
4827
4828                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4829                         *exit_status = EXIT_CPUAFFINITY;
4830                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4831                 }
4832         }
4833
4834         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4835                 r = apply_numa_policy(&context->numa_policy);
4836                 if (r < 0) {
4837                         if (ERRNO_IS_NOT_SUPPORTED(r))
4838                                 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4839                         else {
4840                                 *exit_status = EXIT_NUMA_POLICY;
4841                                 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4842                         }
4843                 }
4844         }
4845
4846         if (context->ioprio_set)
4847                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4848                         *exit_status = EXIT_IOPRIO;
4849                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4850                 }
4851
4852         if (context->timer_slack_nsec != NSEC_INFINITY)
4853                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4854                         *exit_status = EXIT_TIMERSLACK;
4855                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4856                 }
4857
4858         if (context->personality != PERSONALITY_INVALID) {
4859                 r = safe_personality(context->personality);
4860                 if (r < 0) {
4861                         *exit_status = EXIT_PERSONALITY;
4862                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4863                 }
4864         }
4865
4866         if (context->utmp_id) {
4867                 const char *line = context->tty_path ?
4868                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4869                         NULL;
4870                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4871                                       line,
4872                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4873                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4874                                       USER_PROCESS,
4875                                       username);
4876         }
4877
4878         if (uid_is_valid(uid)) {
4879                 r = chown_terminal(STDIN_FILENO, uid);
4880                 if (r < 0) {
4881                         *exit_status = EXIT_STDIN;
4882                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4883                 }
4884         }
4885
4886         if (params->cgroup_path) {
4887                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4888                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4889                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4890                  * touch a single hierarchy too. */
4891
4892                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4893                         _cleanup_free_ char *p = NULL;
4894
4895                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4896                         if (r < 0) {
4897                                 *exit_status = EXIT_CGROUP;
4898                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4899                         }
4900
4901                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4902                         if (r < 0) {
4903                                 *exit_status = EXIT_CGROUP;
4904                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4905                         }
4906                         if (r > 0) {
4907                                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4908                                 if (r < 0) {
4909                                         *exit_status = EXIT_CGROUP;
4910                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4911                                 }
4912                         }
4913                 }
4914
4915                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4916                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4917                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4918                                 if (r < 0) {
4919                                         *exit_status = EXIT_MEMORY;
4920                                         return log_oom();
4921                                 }
4922
4923                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4924                                 if (r < 0) {
4925                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4926                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4927                                         memory_pressure_path = mfree(memory_pressure_path);
4928                                 }
4929                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4930                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4931                                 if (!memory_pressure_path) {
4932                                         *exit_status = EXIT_MEMORY;
4933                                         return log_oom();
4934                                 }
4935                         }
4936                 }
4937         }
4938
4939         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4940
4941         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4942                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4943                 if (r < 0)
4944                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4945         }
4946
4947         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4948                 r = setup_credentials(context, params, unit->id, uid);
4949                 if (r < 0) {
4950                         *exit_status = EXIT_CREDENTIALS;
4951                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4952                 }
4953         }
4954
4955         r = build_environment(
4956                         unit,
4957                         context,
4958                         params,
4959                         cgroup_context,
4960                         n_fds,
4961                         fdnames,
4962                         home,
4963                         username,
4964                         shell,
4965                         journal_stream_dev,
4966                         journal_stream_ino,
4967                         memory_pressure_path,
4968                         &our_env);
4969         if (r < 0) {
4970                 *exit_status = EXIT_MEMORY;
4971                 return log_oom();
4972         }
4973
4974         r = build_pass_environment(context, &pass_env);
4975         if (r < 0) {
4976                 *exit_status = EXIT_MEMORY;
4977                 return log_oom();
4978         }
4979
4980         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4981          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4982          * not specify PATH but the unit has ExecSearchPath. */
4983         if (!strv_isempty(context->exec_search_path)) {
4984                 _cleanup_free_ char *joined = NULL;
4985
4986                 joined = strv_join(context->exec_search_path, ":");
4987                 if (!joined) {
4988                         *exit_status = EXIT_MEMORY;
4989                         return log_oom();
4990                 }
4991
4992                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4993                 if (r < 0) {
4994                         *exit_status = EXIT_MEMORY;
4995                         return log_oom();
4996                 }
4997         }
4998
4999         accum_env = strv_env_merge(params->environment,
5000                                    our_env,
5001                                    joined_exec_search_path,
5002                                    pass_env,
5003                                    context->environment,
5004                                    files_env);
5005         if (!accum_env) {
5006                 *exit_status = EXIT_MEMORY;
5007                 return log_oom();
5008         }
5009         accum_env = strv_env_clean(accum_env);
5010
5011         (void) umask(context->umask);
5012
5013         r = setup_keyring(unit, context, params, uid, gid);
5014         if (r < 0) {
5015                 *exit_status = EXIT_KEYRING;
5016                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5017         }
5018
5019         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5020          * from it. */
5021         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5022
5023         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5024          * for it, and the kernel doesn't actually support ambient caps. */
5025         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5026
5027         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5028          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5029          * desired. */
5030         if (needs_ambient_hack)
5031                 needs_setuid = false;
5032         else
5033                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5034
5035         uint64_t capability_ambient_set = context->capability_ambient_set;
5036
5037         if (needs_sandboxing) {
5038                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5039                  * /sys being present. The actual MAC context application will happen later, as late as
5040                  * possible, to avoid impacting our own code paths. */
5041
5042 #if HAVE_SELINUX
5043                 use_selinux = mac_selinux_use();
5044 #endif
5045 #if ENABLE_SMACK
5046                 use_smack = mac_smack_use();
5047 #endif
5048 #if HAVE_APPARMOR
5049                 use_apparmor = mac_apparmor_use();
5050 #endif
5051         }
5052
5053         if (needs_sandboxing) {
5054                 int which_failed;
5055
5056                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5057                  * is set here. (See below.) */
5058
5059                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5060                 if (r < 0) {
5061                         *exit_status = EXIT_LIMITS;
5062                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5063                 }
5064         }
5065
5066         if (needs_setuid && context->pam_name && username) {
5067                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5068                  * wins here. (See above.) */
5069
5070                 /* All fds passed in the fds array will be closed in the pam child process. */
5071                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5072                 if (r < 0) {
5073                         *exit_status = EXIT_PAM;
5074                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5075                 }
5076
5077                 if (ambient_capabilities_supported()) {
5078                         uint64_t ambient_after_pam;
5079
5080                         /* PAM modules might have set some ambient caps. Query them here and merge them into
5081                          * the caps we want to set in the end, so that we don't end up unsetting them. */
5082                         r = capability_get_ambient(&ambient_after_pam);
5083                         if (r < 0) {
5084                                 *exit_status = EXIT_CAPABILITIES;
5085                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5086                         }
5087
5088                         capability_ambient_set |= ambient_after_pam;
5089                 }
5090
5091                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5092                 if (ngids_after_pam < 0) {
5093                         *exit_status = EXIT_MEMORY;
5094                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5095                 }
5096         }
5097
5098         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, unit->manager)) {
5099                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5100                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5101                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5102
5103                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5104                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5105                  * the actual requested operations fail (or silently continue). */
5106                 if (r < 0 && context->private_users) {
5107                         *exit_status = EXIT_USER;
5108                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5109                 }
5110                 if (r < 0)
5111                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5112                 else
5113                         userns_set_up = true;
5114         }
5115
5116         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5117
5118                 if (ns_type_supported(NAMESPACE_NET)) {
5119                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5120                         if (r < 0) {
5121                                 if (ERRNO_IS_PRIVILEGE(r))
5122                                         log_unit_warning_errno(unit, r,
5123                                                                "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
5124                                 else {
5125                                         *exit_status = EXIT_NETWORK;
5126                                         return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5127                                 }
5128                         }
5129                 } else if (context->network_namespace_path) {
5130                         *exit_status = EXIT_NETWORK;
5131                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5132                                                     "NetworkNamespacePath= is not supported, refusing.");
5133                 } else
5134                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
5135         }
5136
5137         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5138
5139                 if (ns_type_supported(NAMESPACE_IPC)) {
5140                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5141                         if (r == -EPERM)
5142                                 log_unit_warning_errno(unit, r,
5143                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5144                         else if (r < 0) {
5145                                 *exit_status = EXIT_NAMESPACE;
5146                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5147                         }
5148                 } else if (context->ipc_namespace_path) {
5149                         *exit_status = EXIT_NAMESPACE;
5150                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5151                                                     "IPCNamespacePath= is not supported, refusing.");
5152                 } else
5153                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5154         }
5155
5156         if (needs_mount_namespace) {
5157                 _cleanup_free_ char *error_path = NULL;
5158
5159                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5160                 if (r < 0) {
5161                         *exit_status = EXIT_NAMESPACE;
5162                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5163                                                     error_path ? ": " : "", strempty(error_path));
5164                 }
5165         }
5166
5167         if (needs_sandboxing) {
5168                 r = apply_protect_hostname(unit, context, exit_status);
5169                 if (r < 0)
5170                         return r;
5171         }
5172
5173         /* Drop groups as early as possible.
5174          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5175          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5176         if (needs_setuid) {
5177                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5178                 int ngids_to_enforce = 0;
5179
5180                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5181                                                    ngids,
5182                                                    gids_after_pam,
5183                                                    ngids_after_pam,
5184                                                    &gids_to_enforce);
5185                 if (ngids_to_enforce < 0) {
5186                         *exit_status = EXIT_MEMORY;
5187                         return log_unit_error_errno(unit,
5188                                                     ngids_to_enforce,
5189                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
5190                 }
5191
5192                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5193                 if (r < 0) {
5194                         *exit_status = EXIT_GROUP;
5195                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5196                 }
5197         }
5198
5199         /* If the user namespace was not set up above, try to do it now.
5200          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5201          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5202          * case of mount namespaces being less privileged when the mount point list is copied from a
5203          * different user namespace). */
5204
5205         if (needs_sandboxing && context->private_users && !userns_set_up) {
5206                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5207                 if (r < 0) {
5208                         *exit_status = EXIT_USER;
5209                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5210                 }
5211         }
5212
5213         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5214          * shall execute. */
5215
5216         _cleanup_free_ char *executable = NULL;
5217         _cleanup_close_ int executable_fd = -EBADF;
5218         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5219         if (r < 0) {
5220                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5221                         log_unit_struct_errno(unit, LOG_INFO, r,
5222                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5223                                               LOG_UNIT_INVOCATION_ID(unit),
5224                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5225                                                                command->path),
5226                                               "EXECUTABLE=%s", command->path);
5227                         return 0;
5228                 }
5229
5230                 *exit_status = EXIT_EXEC;
5231
5232                 return log_unit_struct_errno(unit, LOG_INFO, r,
5233                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5234                                              LOG_UNIT_INVOCATION_ID(unit),
5235                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5236                                                               command->path),
5237                                              "EXECUTABLE=%s", command->path);
5238         }
5239
5240         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5241         if (r < 0) {
5242                 *exit_status = EXIT_FDS;
5243                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5244         }
5245
5246 #if HAVE_SELINUX
5247         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5248                 int fd = -EBADF;
5249
5250                 if (socket_fd >= 0)
5251                         fd = socket_fd;
5252                 else if (params->n_socket_fds == 1)
5253                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5254                          * use context from that fd to compute the label. */
5255                         fd = params->fds[0];
5256
5257                 if (fd >= 0) {
5258                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5259                         if (r < 0) {
5260                                 if (!context->selinux_context_ignore) {
5261                                         *exit_status = EXIT_SELINUX_CONTEXT;
5262                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5263                                 }
5264                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5265                         }
5266                 }
5267         }
5268 #endif
5269
5270         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5271          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5272          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5273          * execve(). */
5274
5275         r = close_all_fds(keep_fds, n_keep_fds);
5276         if (r >= 0)
5277                 r = shift_fds(fds, n_fds);
5278         if (r >= 0)
5279                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5280         if (r < 0) {
5281                 *exit_status = EXIT_FDS;
5282                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5283         }
5284
5285         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5286          * and at the right fd numbers. The are no other fds open, with one exception: the exec_fd if it is defined,
5287          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5288          * came this far. */
5289
5290         secure_bits = context->secure_bits;
5291
5292         if (needs_sandboxing) {
5293                 uint64_t bset;
5294
5295                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5296                  * (Note this is placed after the general resource limit initialization, see above, in order
5297                  * to take precedence.) */
5298                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5299                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5300                                 *exit_status = EXIT_LIMITS;
5301                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5302                         }
5303                 }
5304
5305 #if ENABLE_SMACK
5306                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5307                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5308                 if (use_smack) {
5309                         r = setup_smack(unit->manager, context, executable_fd);
5310                         if (r < 0 && !context->smack_process_label_ignore) {
5311                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5312                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5313                         }
5314                 }
5315 #endif
5316
5317                 bset = context->capability_bounding_set;
5318                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5319                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5320                  * instead of us doing that */
5321                 if (needs_ambient_hack)
5322                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
5323                                 (UINT64_C(1) << CAP_SETUID) |
5324                                 (UINT64_C(1) << CAP_SETGID);
5325
5326                 if (!cap_test_all(bset)) {
5327                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5328                         if (r < 0) {
5329                                 *exit_status = EXIT_CAPABILITIES;
5330                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5331                         }
5332                 }
5333
5334                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5335                  * keep-caps set.
5336                  *
5337                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5338                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5339                  * the ambient capabilities can be raised as they are present in the permitted and
5340                  * inhertiable set. However it is possible that someone wants to set ambient capabilities
5341                  * without changing the user, so we also set the ambient capabilities here.
5342                  *
5343                  * The requested ambient capabilities are raised in the inheritable set if the second
5344                  * argument is true. */
5345                 if (!needs_ambient_hack) {
5346                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5347                         if (r < 0) {
5348                                 *exit_status = EXIT_CAPABILITIES;
5349                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5350                         }
5351                 }
5352         }
5353
5354         /* chroot to root directory first, before we lose the ability to chroot */
5355         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5356         if (r < 0)
5357                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5358
5359         if (needs_setuid) {
5360                 if (uid_is_valid(uid)) {
5361                         r = enforce_user(context, uid, capability_ambient_set);
5362                         if (r < 0) {
5363                                 *exit_status = EXIT_USER;
5364                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5365                         }
5366
5367                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5368
5369                                 /* Raise the ambient capabilities after user change. */
5370                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5371                                 if (r < 0) {
5372                                         *exit_status = EXIT_CAPABILITIES;
5373                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5374                                 }
5375                         }
5376                 }
5377         }
5378
5379         /* Apply working directory here, because the working directory might be on NFS and only the user running
5380          * this service might have the correct privilege to change to the working directory */
5381         r = apply_working_directory(context, params, home, exit_status);
5382         if (r < 0)
5383                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5384
5385         if (needs_sandboxing) {
5386                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5387                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5388                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5389                  * are restricted. */
5390
5391 #if HAVE_SELINUX
5392                 if (use_selinux) {
5393                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5394
5395                         if (exec_context) {
5396                                 r = setexeccon(exec_context);
5397                                 if (r < 0) {
5398                                         if (!context->selinux_context_ignore) {
5399                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5400                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5401                                         }
5402                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5403                                 }
5404                         }
5405                 }
5406 #endif
5407
5408 #if HAVE_APPARMOR
5409                 if (use_apparmor && context->apparmor_profile) {
5410                         r = aa_change_onexec(context->apparmor_profile);
5411                         if (r < 0 && !context->apparmor_profile_ignore) {
5412                                 *exit_status = EXIT_APPARMOR_PROFILE;
5413                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5414                         }
5415                 }
5416 #endif
5417
5418                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5419                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5420                  * requires CAP_SETPCAP. */
5421                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5422                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5423                          * effective set here.
5424                          *
5425                          * The effective set is overwritten during execve() with the following values:
5426                          *
5427                          * - ambient set (for non-root processes)
5428                          *
5429                          * - (inheritable | bounding) set for root processes)
5430                          *
5431                          * Hence there is no security impact to raise it in the effective set before execve
5432                          */
5433                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5434                         if (r < 0) {
5435                                 *exit_status = EXIT_CAPABILITIES;
5436                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5437                         }
5438                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5439                                 *exit_status = EXIT_SECUREBITS;
5440                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5441                         }
5442                 }
5443
5444                 if (context_has_no_new_privileges(context))
5445                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5446                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5447                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5448                         }
5449
5450 #if HAVE_SECCOMP
5451                 r = apply_address_families(unit, context);
5452                 if (r < 0) {
5453                         *exit_status = EXIT_ADDRESS_FAMILIES;
5454                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5455                 }
5456
5457                 r = apply_memory_deny_write_execute(unit, context);
5458                 if (r < 0) {
5459                         *exit_status = EXIT_SECCOMP;
5460                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5461                 }
5462
5463                 r = apply_restrict_realtime(unit, context);
5464                 if (r < 0) {
5465                         *exit_status = EXIT_SECCOMP;
5466                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5467                 }
5468
5469                 r = apply_restrict_suid_sgid(unit, context);
5470                 if (r < 0) {
5471                         *exit_status = EXIT_SECCOMP;
5472                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5473                 }
5474
5475                 r = apply_restrict_namespaces(unit, context);
5476                 if (r < 0) {
5477                         *exit_status = EXIT_SECCOMP;
5478                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5479                 }
5480
5481                 r = apply_protect_sysctl(unit, context);
5482                 if (r < 0) {
5483                         *exit_status = EXIT_SECCOMP;
5484                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5485                 }
5486
5487                 r = apply_protect_kernel_modules(unit, context);
5488                 if (r < 0) {
5489                         *exit_status = EXIT_SECCOMP;
5490                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5491                 }
5492
5493                 r = apply_protect_kernel_logs(unit, context);
5494                 if (r < 0) {
5495                         *exit_status = EXIT_SECCOMP;
5496                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5497                 }
5498
5499                 r = apply_protect_clock(unit, context);
5500                 if (r < 0) {
5501                         *exit_status = EXIT_SECCOMP;
5502                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5503                 }
5504
5505                 r = apply_private_devices(unit, context);
5506                 if (r < 0) {
5507                         *exit_status = EXIT_SECCOMP;
5508                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5509                 }
5510
5511                 r = apply_syscall_archs(unit, context);
5512                 if (r < 0) {
5513                         *exit_status = EXIT_SECCOMP;
5514                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5515                 }
5516
5517                 r = apply_lock_personality(unit, context);
5518                 if (r < 0) {
5519                         *exit_status = EXIT_SECCOMP;
5520                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5521                 }
5522
5523                 r = apply_syscall_log(unit, context);
5524                 if (r < 0) {
5525                         *exit_status = EXIT_SECCOMP;
5526                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5527                 }
5528
5529                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5530                  * by the filter as little as possible. */
5531                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5532                 if (r < 0) {
5533                         *exit_status = EXIT_SECCOMP;
5534                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5535                 }
5536 #endif
5537
5538 #if HAVE_LIBBPF
5539                 r = apply_restrict_filesystems(unit, context);
5540                 if (r < 0) {
5541                         *exit_status = EXIT_BPF;
5542                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5543                 }
5544 #endif
5545
5546         }
5547
5548         if (!strv_isempty(context->unset_environment)) {
5549                 char **ee = NULL;
5550
5551                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5552                 if (!ee) {
5553                         *exit_status = EXIT_MEMORY;
5554                         return log_oom();
5555                 }
5556
5557                 strv_free_and_replace(accum_env, ee);
5558         }
5559
5560         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5561                 replaced_argv = replace_env_argv(command->argv, accum_env);
5562                 if (!replaced_argv) {
5563                         *exit_status = EXIT_MEMORY;
5564                         return log_oom();
5565                 }
5566                 final_argv = replaced_argv;
5567         } else
5568                 final_argv = command->argv;
5569
5570         log_command_line(unit, "Executing", executable, final_argv);
5571
5572         if (exec_fd >= 0) {
5573                 uint8_t hot = 1;
5574
5575                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5576                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5577
5578                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5579                         *exit_status = EXIT_EXEC;
5580                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5581                 }
5582         }
5583
5584         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5585
5586         if (exec_fd >= 0) {
5587                 uint8_t hot = 0;
5588
5589                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5590                  * that POLLHUP on it no longer means execve() succeeded. */
5591
5592                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5593                         *exit_status = EXIT_EXEC;
5594                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5595                 }
5596         }
5597
5598         *exit_status = EXIT_EXEC;
5599         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5600 }
5601
5602 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5603 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5604
5605 int exec_spawn(Unit *unit,
5606                ExecCommand *command,
5607                const ExecContext *context,
5608                const ExecParameters *params,
5609                ExecRuntime *runtime,
5610                const CGroupContext *cgroup_context,
5611                pid_t *ret) {
5612
5613         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5614         _cleanup_free_ char *subcgroup_path = NULL;
5615         _cleanup_strv_free_ char **files_env = NULL;
5616         size_t n_storage_fds = 0, n_socket_fds = 0;
5617         pid_t pid;
5618
5619         assert(unit);
5620         assert(command);
5621         assert(context);
5622         assert(ret);
5623         assert(params);
5624         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5625
5626         LOG_CONTEXT_PUSH_UNIT(unit);
5627
5628         if (context->std_input == EXEC_INPUT_SOCKET ||
5629             context->std_output == EXEC_OUTPUT_SOCKET ||
5630             context->std_error == EXEC_OUTPUT_SOCKET) {
5631
5632                 if (params->n_socket_fds > 1)
5633                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5634
5635                 if (params->n_socket_fds == 0)
5636                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5637
5638                 socket_fd = params->fds[0];
5639         } else {
5640                 socket_fd = -EBADF;
5641                 fds = params->fds;
5642                 n_socket_fds = params->n_socket_fds;
5643                 n_storage_fds = params->n_storage_fds;
5644         }
5645
5646         r = exec_context_named_iofds(context, params, named_iofds);
5647         if (r < 0)
5648                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5649
5650         r = exec_context_load_environment(unit, context, &files_env);
5651         if (r < 0)
5652                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5653
5654         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5655            and, until the next SELinux policy changes, we save further reloads in future children. */
5656         mac_selinux_maybe_reload();
5657
5658         /* We won't know the real executable path until we create the mount namespace in the child, but we
5659            want to log from the parent, so we use the possibly inaccurate path here. */
5660         log_command_line(unit, "About to execute", command->path, command->argv);
5661
5662         if (params->cgroup_path) {
5663                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5664                 if (r < 0)
5665                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5666                 if (r > 0) { /* We are using a child cgroup */
5667                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5668                         if (r < 0)
5669                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5670
5671                         /* Normally we would not propagate the xattrs to children but since we created this
5672                          * sub-cgroup internally we should do it. */
5673                         cgroup_oomd_xattr_apply(unit, subcgroup_path);
5674                         cgroup_log_xattr_apply(unit, subcgroup_path);
5675                 }
5676         }
5677
5678         pid = fork();
5679         if (pid < 0)
5680                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5681
5682         if (pid == 0) {
5683                 int exit_status = EXIT_SUCCESS;
5684
5685                 r = exec_child(unit,
5686                                command,
5687                                context,
5688                                params,
5689                                runtime,
5690                                cgroup_context,
5691                                socket_fd,
5692                                named_iofds,
5693                                fds,
5694                                n_socket_fds,
5695                                n_storage_fds,
5696                                files_env,
5697                                unit->manager->user_lookup_fds[1],
5698                                &exit_status);
5699
5700                 if (r < 0) {
5701                         const char *status =
5702                                 exit_status_to_string(exit_status,
5703                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5704
5705                         log_unit_struct_errno(unit, LOG_ERR, r,
5706                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5707                                               LOG_UNIT_INVOCATION_ID(unit),
5708                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5709                                                                status, command->path),
5710                                               "EXECUTABLE=%s", command->path);
5711                 }
5712
5713                 _exit(exit_status);
5714         }
5715
5716         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5717
5718         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5719          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5720          * process will be killed too). */
5721         if (subcgroup_path)
5722                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5723
5724         exec_status_start(&command->exec_status, pid);
5725
5726         *ret = pid;
5727         return 0;
5728 }
5729
5730 void exec_context_init(ExecContext *c) {
5731         assert(c);
5732
5733         c->umask = 0022;
5734         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5735         c->cpu_sched_policy = SCHED_OTHER;
5736         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5737         c->syslog_level_prefix = true;
5738         c->ignore_sigpipe = true;
5739         c->timer_slack_nsec = NSEC_INFINITY;
5740         c->personality = PERSONALITY_INVALID;
5741         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5742                 c->directories[t].mode = 0755;
5743         c->timeout_clean_usec = USEC_INFINITY;
5744         c->capability_bounding_set = CAP_MASK_UNSET;
5745         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5746         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5747         c->log_level_max = -1;
5748 #if HAVE_SECCOMP
5749         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5750 #endif
5751         c->tty_rows = UINT_MAX;
5752         c->tty_cols = UINT_MAX;
5753         numa_policy_reset(&c->numa_policy);
5754         c->private_mounts = -1;
5755 }
5756
5757 void exec_context_done(ExecContext *c) {
5758         assert(c);
5759
5760         c->environment = strv_free(c->environment);
5761         c->environment_files = strv_free(c->environment_files);
5762         c->pass_environment = strv_free(c->pass_environment);
5763         c->unset_environment = strv_free(c->unset_environment);
5764
5765         rlimit_free_all(c->rlimit);
5766
5767         for (size_t l = 0; l < 3; l++) {
5768                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5769                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5770         }
5771
5772         c->working_directory = mfree(c->working_directory);
5773         c->root_directory = mfree(c->root_directory);
5774         c->root_image = mfree(c->root_image);
5775         c->root_image_options = mount_options_free_all(c->root_image_options);
5776         c->root_hash = mfree(c->root_hash);
5777         c->root_hash_size = 0;
5778         c->root_hash_path = mfree(c->root_hash_path);
5779         c->root_hash_sig = mfree(c->root_hash_sig);
5780         c->root_hash_sig_size = 0;
5781         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5782         c->root_verity = mfree(c->root_verity);
5783         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5784         c->extension_directories = strv_free(c->extension_directories);
5785         c->tty_path = mfree(c->tty_path);
5786         c->syslog_identifier = mfree(c->syslog_identifier);
5787         c->user = mfree(c->user);
5788         c->group = mfree(c->group);
5789
5790         c->supplementary_groups = strv_free(c->supplementary_groups);
5791
5792         c->pam_name = mfree(c->pam_name);
5793
5794         c->read_only_paths = strv_free(c->read_only_paths);
5795         c->read_write_paths = strv_free(c->read_write_paths);
5796         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5797         c->exec_paths = strv_free(c->exec_paths);
5798         c->no_exec_paths = strv_free(c->no_exec_paths);
5799         c->exec_search_path = strv_free(c->exec_search_path);
5800
5801         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5802         c->bind_mounts = NULL;
5803         c->n_bind_mounts = 0;
5804         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5805         c->temporary_filesystems = NULL;
5806         c->n_temporary_filesystems = 0;
5807         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5808
5809         cpu_set_reset(&c->cpu_set);
5810         numa_policy_reset(&c->numa_policy);
5811
5812         c->utmp_id = mfree(c->utmp_id);
5813         c->selinux_context = mfree(c->selinux_context);
5814         c->apparmor_profile = mfree(c->apparmor_profile);
5815         c->smack_process_label = mfree(c->smack_process_label);
5816
5817         c->restrict_filesystems = set_free(c->restrict_filesystems);
5818
5819         c->syscall_filter = hashmap_free(c->syscall_filter);
5820         c->syscall_archs = set_free(c->syscall_archs);
5821         c->address_families = set_free(c->address_families);
5822
5823         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5824                 exec_directory_done(&c->directories[t]);
5825
5826         c->log_level_max = -1;
5827
5828         exec_context_free_log_extra_fields(c);
5829         c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5830         c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
5831
5832         c->log_ratelimit_interval_usec = 0;
5833         c->log_ratelimit_burst = 0;
5834
5835         c->stdin_data = mfree(c->stdin_data);
5836         c->stdin_data_size = 0;
5837
5838         c->network_namespace_path = mfree(c->network_namespace_path);
5839         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5840
5841         c->log_namespace = mfree(c->log_namespace);
5842
5843         c->load_credentials = hashmap_free(c->load_credentials);
5844         c->set_credentials = hashmap_free(c->set_credentials);
5845
5846         c->root_image_policy = image_policy_free(c->root_image_policy);
5847         c->mount_image_policy = image_policy_free(c->mount_image_policy);
5848         c->extension_image_policy = image_policy_free(c->extension_image_policy);
5849 }
5850
5851 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5852         assert(c);
5853
5854         if (!runtime_prefix)
5855                 return 0;
5856
5857         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5858                 _cleanup_free_ char *p = NULL;
5859
5860                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5861                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5862                 else
5863                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5864                 if (!p)
5865                         return -ENOMEM;
5866
5867                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5868                  * service next. */
5869                 (void) rm_rf(p, REMOVE_ROOT);
5870
5871                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5872                         _cleanup_free_ char *symlink_abs = NULL;
5873
5874                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5875                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5876                         else
5877                                 symlink_abs = path_join(runtime_prefix, *symlink);
5878                         if (!symlink_abs)
5879                                 return -ENOMEM;
5880
5881                         (void) unlink(symlink_abs);
5882                 }
5883         }
5884
5885         return 0;
5886 }
5887
5888 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5889         _cleanup_free_ char *p = NULL;
5890
5891         assert(c);
5892
5893         if (!runtime_prefix || !unit)
5894                 return 0;
5895
5896         p = path_join(runtime_prefix, "credentials", unit);
5897         if (!p)
5898                 return -ENOMEM;
5899
5900         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5901          * unmount it, and afterwards remove the mount point */
5902         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5903         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5904
5905         return 0;
5906 }
5907
5908 int exec_context_destroy_mount_ns_dir(Unit *u) {
5909         _cleanup_free_ char *p = NULL;
5910
5911         if (!u || !MANAGER_IS_SYSTEM(u->manager))
5912                 return 0;
5913
5914         p = path_join("/run/systemd/propagate/", u->id);
5915         if (!p)
5916                 return -ENOMEM;
5917
5918         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5919         if (rmdir(p) < 0 && errno != ENOENT)
5920                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5921
5922         return 0;
5923 }
5924
5925 static void exec_command_done(ExecCommand *c) {
5926         assert(c);
5927
5928         c->path = mfree(c->path);
5929         c->argv = strv_free(c->argv);
5930 }
5931
5932 void exec_command_done_array(ExecCommand *c, size_t n) {
5933         for (size_t i = 0; i < n; i++)
5934                 exec_command_done(c+i);
5935 }
5936
5937 ExecCommand* exec_command_free_list(ExecCommand *c) {
5938         ExecCommand *i;
5939
5940         while ((i = c)) {
5941                 LIST_REMOVE(command, c, i);
5942                 exec_command_done(i);
5943                 free(i);
5944         }
5945
5946         return NULL;
5947 }
5948
5949 void exec_command_free_array(ExecCommand **c, size_t n) {
5950         for (size_t i = 0; i < n; i++)
5951                 c[i] = exec_command_free_list(c[i]);
5952 }
5953
5954 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5955         for (size_t i = 0; i < n; i++)
5956                 exec_status_reset(&c[i].exec_status);
5957 }
5958
5959 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5960         for (size_t i = 0; i < n; i++)
5961                 LIST_FOREACH(command, z, c[i])
5962                         exec_status_reset(&z->exec_status);
5963 }
5964
/* Context handed to invalid_env() through its userdata pointer, so that the log message about a rejected
 * environment assignment can name its origin. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the message is logged for */
        const char *path; /* environment file the assignment was read from */
} InvalidEnvInfo;
5969
5970 static void invalid_env(const char *p, void *userdata) {
5971         InvalidEnvInfo *info = userdata;
5972
5973         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5974 }
5975
5976 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5977         assert(c);
5978
5979         switch (fd_index) {
5980
5981         case STDIN_FILENO:
5982                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5983                         return NULL;
5984
5985                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5986
5987         case STDOUT_FILENO:
5988                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5989                         return NULL;
5990
5991                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5992
5993         case STDERR_FILENO:
5994                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5995                         return NULL;
5996
5997                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5998
5999         default:
6000                 return NULL;
6001         }
6002 }
6003
6004 static int exec_context_named_iofds(
6005                 const ExecContext *c,
6006                 const ExecParameters *p,
6007                 int named_iofds[static 3]) {
6008
6009         size_t targets;
6010         const char* stdio_fdname[3];
6011         size_t n_fds;
6012
6013         assert(c);
6014         assert(p);
6015         assert(named_iofds);
6016
6017         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6018                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6019                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
6020
6021         for (size_t i = 0; i < 3; i++)
6022                 stdio_fdname[i] = exec_context_fdname(c, i);
6023
6024         n_fds = p->n_storage_fds + p->n_socket_fds;
6025
6026         for (size_t i = 0; i < n_fds  && targets > 0; i++)
6027                 if (named_iofds[STDIN_FILENO] < 0 &&
6028                     c->std_input == EXEC_INPUT_NAMED_FD &&
6029                     stdio_fdname[STDIN_FILENO] &&
6030                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6031
6032                         named_iofds[STDIN_FILENO] = p->fds[i];
6033                         targets--;
6034
6035                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6036                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
6037                            stdio_fdname[STDOUT_FILENO] &&
6038                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6039
6040                         named_iofds[STDOUT_FILENO] = p->fds[i];
6041                         targets--;
6042
6043                 } else if (named_iofds[STDERR_FILENO] < 0 &&
6044                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
6045                            stdio_fdname[STDERR_FILENO] &&
6046                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6047
6048                         named_iofds[STDERR_FILENO] = p->fds[i];
6049                         targets--;
6050                 }
6051
6052         return targets == 0 ? 0 : -ENOENT;
6053 }
6054
6055 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6056         _cleanup_strv_free_ char **v = NULL;
6057         int r;
6058
6059         assert(c);
6060         assert(ret);
6061
6062         STRV_FOREACH(i, c->environment_files) {
6063                 _cleanup_globfree_ glob_t pglob = {};
6064                 bool ignore = false;
6065                 char *fn = *i;
6066
6067                 if (fn[0] == '-') {
6068                         ignore = true;
6069                         fn++;
6070                 }
6071
6072                 if (!path_is_absolute(fn)) {
6073                         if (ignore)
6074                                 continue;
6075                         return -EINVAL;
6076                 }
6077
6078                 /* Filename supports globbing, take all matching files */
6079                 r = safe_glob(fn, 0, &pglob);
6080                 if (r < 0) {
6081                         if (ignore)
6082                                 continue;
6083                         return r;
6084                 }
6085
6086                 /* When we don't match anything, -ENOENT should be returned */
6087                 assert(pglob.gl_pathc > 0);
6088
6089                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
6090                         _cleanup_strv_free_ char **p = NULL;
6091
6092                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6093                         if (r < 0) {
6094                                 if (ignore)
6095                                         continue;
6096                                 return r;
6097                         }
6098
6099                         /* Log invalid environment variables with filename */
6100                         if (p) {
6101                                 InvalidEnvInfo info = {
6102                                         .unit = unit,
6103                                         .path = pglob.gl_pathv[n]
6104                                 };
6105
6106                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
6107                         }
6108
6109                         if (!v)
6110                                 v = TAKE_PTR(p);
6111                         else {
6112                                 char **m = strv_env_merge(v, p);
6113                                 if (!m)
6114                                         return -ENOMEM;
6115
6116                                 strv_free_and_replace(v, m);
6117                         }
6118                 }
6119         }
6120
6121         *ret = TAKE_PTR(v);
6122
6123         return 0;
6124 }
6125
6126 static bool tty_may_match_dev_console(const char *tty) {
6127         _cleanup_free_ char *resolved = NULL;
6128
6129         if (!tty)
6130                 return true;
6131
6132         tty = skip_dev_prefix(tty);
6133
6134         /* trivial identity? */
6135         if (streq(tty, "console"))
6136                 return true;
6137
6138         if (resolve_dev_console(&resolved) < 0)
6139                 return true; /* if we could not resolve, assume it may */
6140
6141         /* "tty0" means the active VC, so it may be the same sometimes */
6142         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6143 }
6144
6145 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6146         assert(ec);
6147
6148         return ec->tty_reset ||
6149                 ec->tty_vhangup ||
6150                 ec->tty_vt_disallocate ||
6151                 is_terminal_input(ec->std_input) ||
6152                 is_terminal_output(ec->std_output) ||
6153                 is_terminal_output(ec->std_error);
6154 }
6155
6156 bool exec_context_may_touch_console(const ExecContext *ec) {
6157
6158         return exec_context_may_touch_tty(ec) &&
6159                tty_may_match_dev_console(exec_context_tty_path(ec));
6160 }
6161
6162 static void strv_fprintf(FILE *f, char **l) {
6163         assert(f);
6164
6165         STRV_FOREACH(g, l)
6166                 fprintf(f, " %s", *g);
6167 }
6168
/* Writes one "<prefix><name>: item item ..." line for the string vector to 'f'. Nothing is written if
 * the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
6180
6181 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6182         int r;
6183
6184         assert(c);
6185         assert(f);
6186
6187         prefix = strempty(prefix);
6188
6189         fprintf(f,
6190                 "%sUMask: %04o\n"
6191                 "%sWorkingDirectory: %s\n"
6192                 "%sRootDirectory: %s\n"
6193                 "%sNonBlocking: %s\n"
6194                 "%sPrivateTmp: %s\n"
6195                 "%sPrivateDevices: %s\n"
6196                 "%sProtectKernelTunables: %s\n"
6197                 "%sProtectKernelModules: %s\n"
6198                 "%sProtectKernelLogs: %s\n"
6199                 "%sProtectClock: %s\n"
6200                 "%sProtectControlGroups: %s\n"
6201                 "%sPrivateNetwork: %s\n"
6202                 "%sPrivateUsers: %s\n"
6203                 "%sProtectHome: %s\n"
6204                 "%sProtectSystem: %s\n"
6205                 "%sMountAPIVFS: %s\n"
6206                 "%sIgnoreSIGPIPE: %s\n"
6207                 "%sMemoryDenyWriteExecute: %s\n"
6208                 "%sRestrictRealtime: %s\n"
6209                 "%sRestrictSUIDSGID: %s\n"
6210                 "%sKeyringMode: %s\n"
6211                 "%sProtectHostname: %s\n"
6212                 "%sProtectProc: %s\n"
6213                 "%sProcSubset: %s\n",
6214                 prefix, c->umask,
6215                 prefix, empty_to_root(c->working_directory),
6216                 prefix, empty_to_root(c->root_directory),
6217                 prefix, yes_no(c->non_blocking),
6218                 prefix, yes_no(c->private_tmp),
6219                 prefix, yes_no(c->private_devices),
6220                 prefix, yes_no(c->protect_kernel_tunables),
6221                 prefix, yes_no(c->protect_kernel_modules),
6222                 prefix, yes_no(c->protect_kernel_logs),
6223                 prefix, yes_no(c->protect_clock),
6224                 prefix, yes_no(c->protect_control_groups),
6225                 prefix, yes_no(c->private_network),
6226                 prefix, yes_no(c->private_users),
6227                 prefix, protect_home_to_string(c->protect_home),
6228                 prefix, protect_system_to_string(c->protect_system),
6229                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6230                 prefix, yes_no(c->ignore_sigpipe),
6231                 prefix, yes_no(c->memory_deny_write_execute),
6232                 prefix, yes_no(c->restrict_realtime),
6233                 prefix, yes_no(c->restrict_suid_sgid),
6234                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6235                 prefix, yes_no(c->protect_hostname),
6236                 prefix, protect_proc_to_string(c->protect_proc),
6237                 prefix, proc_subset_to_string(c->proc_subset));
6238
6239         if (c->root_image)
6240                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6241
6242         if (c->root_image_options) {
6243                 fprintf(f, "%sRootImageOptions:", prefix);
6244                 LIST_FOREACH(mount_options, o, c->root_image_options)
6245                         if (!isempty(o->options))
6246                                 fprintf(f, " %s:%s",
6247                                         partition_designator_to_string(o->partition_designator),
6248                                         o->options);
6249                 fprintf(f, "\n");
6250         }
6251
6252         if (c->root_hash) {
6253                 _cleanup_free_ char *encoded = NULL;
6254                 encoded = hexmem(c->root_hash, c->root_hash_size);
6255                 if (encoded)
6256                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6257         }
6258
6259         if (c->root_hash_path)
6260                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6261
6262         if (c->root_hash_sig) {
6263                 _cleanup_free_ char *encoded = NULL;
6264                 ssize_t len;
6265                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6266                 if (len)
6267                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6268         }
6269
6270         if (c->root_hash_sig_path)
6271                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6272
6273         if (c->root_verity)
6274                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6275
6276         STRV_FOREACH(e, c->environment)
6277                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6278
6279         STRV_FOREACH(e, c->environment_files)
6280                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6281
6282         STRV_FOREACH(e, c->pass_environment)
6283                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6284
6285         STRV_FOREACH(e, c->unset_environment)
6286                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6287
6288         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6289
6290         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6291                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6292
6293                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6294                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6295
6296                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6297                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6298                 }
6299         }
6300
6301         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6302
6303         if (c->nice_set)
6304                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6305
6306         if (c->oom_score_adjust_set)
6307                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6308
6309         if (c->coredump_filter_set)
6310                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6311
6312         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6313                 if (c->rlimit[i]) {
6314                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6315                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6316                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6317                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6318                 }
6319
6320         if (c->ioprio_set) {
6321                 _cleanup_free_ char *class_str = NULL;
6322
6323                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6324                 if (r >= 0)
6325                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6326
6327                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6328         }
6329
6330         if (c->cpu_sched_set) {
6331                 _cleanup_free_ char *policy_str = NULL;
6332
6333                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6334                 if (r >= 0)
6335                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6336
6337                 fprintf(f,
6338                         "%sCPUSchedulingPriority: %i\n"
6339                         "%sCPUSchedulingResetOnFork: %s\n",
6340                         prefix, c->cpu_sched_priority,
6341                         prefix, yes_no(c->cpu_sched_reset_on_fork));
6342         }
6343
6344         if (c->cpu_set.set) {
6345                 _cleanup_free_ char *affinity = NULL;
6346
6347                 affinity = cpu_set_to_range_string(&c->cpu_set);
6348                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6349         }
6350
6351         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6352                 _cleanup_free_ char *nodes = NULL;
6353
6354                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6355                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6356                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6357         }
6358
6359         if (c->timer_slack_nsec != NSEC_INFINITY)
6360                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6361
6362         fprintf(f,
6363                 "%sStandardInput: %s\n"
6364                 "%sStandardOutput: %s\n"
6365                 "%sStandardError: %s\n",
6366                 prefix, exec_input_to_string(c->std_input),
6367                 prefix, exec_output_to_string(c->std_output),
6368                 prefix, exec_output_to_string(c->std_error));
6369
6370         if (c->std_input == EXEC_INPUT_NAMED_FD)
6371                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6372         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6373                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6374         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6375                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6376
6377         if (c->std_input == EXEC_INPUT_FILE)
6378                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6379         if (c->std_output == EXEC_OUTPUT_FILE)
6380                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6381         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6382                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6383         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6384                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6385         if (c->std_error == EXEC_OUTPUT_FILE)
6386                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6387         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6388                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6389         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6390                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6391
6392         if (c->tty_path)
6393                 fprintf(f,
6394                         "%sTTYPath: %s\n"
6395                         "%sTTYReset: %s\n"
6396                         "%sTTYVHangup: %s\n"
6397                         "%sTTYVTDisallocate: %s\n"
6398                         "%sTTYRows: %u\n"
6399                         "%sTTYColumns: %u\n",
6400                         prefix, c->tty_path,
6401                         prefix, yes_no(c->tty_reset),
6402                         prefix, yes_no(c->tty_vhangup),
6403                         prefix, yes_no(c->tty_vt_disallocate),
6404                         prefix, c->tty_rows,
6405                         prefix, c->tty_cols);
6406
6407         if (IN_SET(c->std_output,
6408                    EXEC_OUTPUT_KMSG,
6409                    EXEC_OUTPUT_JOURNAL,
6410                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6411                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6412             IN_SET(c->std_error,
6413                    EXEC_OUTPUT_KMSG,
6414                    EXEC_OUTPUT_JOURNAL,
6415                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
6416                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6417
6418                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6419
6420                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6421                 if (r >= 0)
6422                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6423
6424                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6425                 if (r >= 0)
6426                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6427         }
6428
6429         if (c->log_level_max >= 0) {
6430                 _cleanup_free_ char *t = NULL;
6431
6432                 (void) log_level_to_string_alloc(c->log_level_max, &t);
6433
6434                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6435         }
6436
6437         if (c->log_ratelimit_interval_usec > 0)
6438                 fprintf(f,
6439                         "%sLogRateLimitIntervalSec: %s\n",
6440                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6441
6442         if (c->log_ratelimit_burst > 0)
6443                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6444
6445         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6446                 fprintf(f, "%sLogFilterPatterns:", prefix);
6447
6448                 char *pattern;
6449                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6450                         fprintf(f, " %s", pattern);
6451                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6452                         fprintf(f, " ~%s", pattern);
6453                 fputc('\n', f);
6454         }
6455
6456         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6457                 fprintf(f, "%sLogExtraFields: ", prefix);
6458                 fwrite(c->log_extra_fields[j].iov_base,
6459                        1, c->log_extra_fields[j].iov_len,
6460                        f);
6461                 fputc('\n', f);
6462         }
6463
6464         if (c->log_namespace)
6465                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6466
6467         if (c->secure_bits) {
6468                 _cleanup_free_ char *str = NULL;
6469
6470                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6471                 if (r >= 0)
6472                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6473         }
6474
6475         if (c->capability_bounding_set != CAP_MASK_UNSET) {
6476                 _cleanup_free_ char *str = NULL;
6477
6478                 r = capability_set_to_string(c->capability_bounding_set, &str);
6479                 if (r >= 0)
6480                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6481         }
6482
6483         if (c->capability_ambient_set != 0) {
6484                 _cleanup_free_ char *str = NULL;
6485
6486                 r = capability_set_to_string(c->capability_ambient_set, &str);
6487                 if (r >= 0)
6488                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6489         }
6490
6491         if (c->user)
6492                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6493         if (c->group)
6494                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6495
6496         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6497
6498         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6499
6500         if (c->pam_name)
6501                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6502
6503         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6504         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6505         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6506         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6507         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6508         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6509
6510         for (size_t i = 0; i < c->n_bind_mounts; i++)
6511                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6512                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6513                         c->bind_mounts[i].ignore_enoent ? "-": "",
6514                         c->bind_mounts[i].source,
6515                         c->bind_mounts[i].destination,
6516                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6517
6518         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6519                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6520
6521                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6522                         t->path,
6523                         isempty(t->options) ? "" : ":",
6524                         strempty(t->options));
6525         }
6526
6527         if (c->utmp_id)
6528                 fprintf(f,
6529                         "%sUtmpIdentifier: %s\n",
6530                         prefix, c->utmp_id);
6531
6532         if (c->selinux_context)
6533                 fprintf(f,
6534                         "%sSELinuxContext: %s%s\n",
6535                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6536
6537         if (c->apparmor_profile)
6538                 fprintf(f,
6539                         "%sAppArmorProfile: %s%s\n",
6540                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6541
6542         if (c->smack_process_label)
6543                 fprintf(f,
6544                         "%sSmackProcessLabel: %s%s\n",
6545                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6546
6547         if (c->personality != PERSONALITY_INVALID)
6548                 fprintf(f,
6549                         "%sPersonality: %s\n",
6550                         prefix, strna(personality_to_string(c->personality)));
6551
6552         fprintf(f,
6553                 "%sLockPersonality: %s\n",
6554                 prefix, yes_no(c->lock_personality));
6555
6556         if (c->syscall_filter) {
6557                 fprintf(f,
6558                         "%sSystemCallFilter: ",
6559                         prefix);
6560
6561                 if (!c->syscall_allow_list)
6562                         fputc('~', f);
6563
6564 #if HAVE_SECCOMP
6565                 void *id, *val;
6566                 bool first = true;
6567                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6568                         _cleanup_free_ char *name = NULL;
6569                         const char *errno_name = NULL;
6570                         int num = PTR_TO_INT(val);
6571
6572                         if (first)
6573                                 first = false;
6574                         else
6575                                 fputc(' ', f);
6576
6577                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6578                         fputs(strna(name), f);
6579
6580                         if (num >= 0) {
6581                                 errno_name = seccomp_errno_or_action_to_string(num);
6582                                 if (errno_name)
6583                                         fprintf(f, ":%s", errno_name);
6584                                 else
6585                                         fprintf(f, ":%d", num);
6586                         }
6587                 }
6588 #endif
6589
6590                 fputc('\n', f);
6591         }
6592
6593         if (c->syscall_archs) {
6594                 fprintf(f,
6595                         "%sSystemCallArchitectures:",
6596                         prefix);
6597
6598 #if HAVE_SECCOMP
6599                 void *id;
6600                 SET_FOREACH(id, c->syscall_archs)
6601                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6602 #endif
6603                 fputc('\n', f);
6604         }
6605
6606         if (exec_context_restrict_namespaces_set(c)) {
6607                 _cleanup_free_ char *s = NULL;
6608
6609                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6610                 if (r >= 0)
6611                         fprintf(f, "%sRestrictNamespaces: %s\n",
6612                                 prefix, strna(s));
6613         }
6614
6615 #if HAVE_LIBBPF
6616         if (exec_context_restrict_filesystems_set(c)) {
6617                 char *fs;
6618                 SET_FOREACH(fs, c->restrict_filesystems)
6619                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6620         }
6621 #endif
6622
6623         if (c->network_namespace_path)
6624                 fprintf(f,
6625                         "%sNetworkNamespacePath: %s\n",
6626                         prefix, c->network_namespace_path);
6627
6628         if (c->syscall_errno > 0) {
6629                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6630
6631 #if HAVE_SECCOMP
6632                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6633                 if (errno_name)
6634                         fputs(errno_name, f);
6635                 else
6636                         fprintf(f, "%d", c->syscall_errno);
6637 #endif
6638                 fputc('\n', f);
6639         }
6640
6641         for (size_t i = 0; i < c->n_mount_images; i++) {
6642                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6643                         c->mount_images[i].ignore_enoent ? "-": "",
6644                         c->mount_images[i].source,
6645                         c->mount_images[i].destination);
6646                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6647                         fprintf(f, ":%s:%s",
6648                                 partition_designator_to_string(o->partition_designator),
6649                                 strempty(o->options));
6650                 fprintf(f, "\n");
6651         }
6652
6653         for (size_t i = 0; i < c->n_extension_images; i++) {
6654                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6655                         c->extension_images[i].ignore_enoent ? "-": "",
6656                         c->extension_images[i].source);
6657                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6658                         fprintf(f, ":%s:%s",
6659                                 partition_designator_to_string(o->partition_designator),
6660                                 strempty(o->options));
6661                 fprintf(f, "\n");
6662         }
6663
6664         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6665 }
6666
6667 bool exec_context_maintains_privileges(const ExecContext *c) {
6668         assert(c);
6669
6670         /* Returns true if the process forked off would run under
6671          * an unchanged UID or as root. */
6672
6673         if (!c->user)
6674                 return true;
6675
6676         if (streq(c->user, "root") || streq(c->user, "0"))
6677                 return true;
6678
6679         return false;
6680 }
6681
6682 int exec_context_get_effective_ioprio(const ExecContext *c) {
6683         int p;
6684
6685         assert(c);
6686
6687         if (c->ioprio_set)
6688                 return c->ioprio;
6689
6690         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6691         if (p < 0)
6692                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6693
6694         return ioprio_normalize(p);
6695 }
6696
6697 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6698         assert(c);
6699
6700         /* Explicit setting wins */
6701         if (c->mount_apivfs_set)
6702                 return c->mount_apivfs;
6703
6704         /* Default to "yes" if root directory or image are specified */
6705         if (exec_context_with_rootfs(c))
6706                 return true;
6707
6708         return false;
6709 }
6710
6711 void exec_context_free_log_extra_fields(ExecContext *c) {
6712         assert(c);
6713
6714         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6715                 free(c->log_extra_fields[l].iov_base);
6716         c->log_extra_fields = mfree(c->log_extra_fields);
6717         c->n_log_extra_fields = 0;
6718 }
6719
6720 void exec_context_revert_tty(ExecContext *c) {
6721         _cleanup_close_ int fd = -EBADF;
6722         const char *path;
6723         struct stat st;
6724         int r;
6725
6726         assert(c);
6727
6728         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6729         exec_context_tty_reset(c, NULL);
6730
6731         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6732          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6733          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6734         if (!exec_context_may_touch_tty(c))
6735                 return;
6736
6737         path = exec_context_tty_path(c);
6738         if (!path)
6739                 return;
6740
6741         fd = open(path, O_PATH|O_CLOEXEC);
6742         if (fd < 0)
6743                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6744                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6745                                              path);
6746
6747         if (fstat(fd, &st) < 0)
6748                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6749
6750         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6751          * if things are a character device, since a proper check either means we'd have to open the TTY and
6752          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6753          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6754          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6755         if (!S_ISCHR(st.st_mode))
6756                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6757
6758         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6759         if (r < 0)
6760                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6761 }
6762
6763 int exec_context_get_clean_directories(
6764                 ExecContext *c,
6765                 char **prefix,
6766                 ExecCleanMask mask,
6767                 char ***ret) {
6768
6769         _cleanup_strv_free_ char **l = NULL;
6770         int r;
6771
6772         assert(c);
6773         assert(prefix);
6774         assert(ret);
6775
6776         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6777                 if (!FLAGS_SET(mask, 1U << t))
6778                         continue;
6779
6780                 if (!prefix[t])
6781                         continue;
6782
6783                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6784                         char *j;
6785
6786                         j = path_join(prefix[t], c->directories[t].items[i].path);
6787                         if (!j)
6788                                 return -ENOMEM;
6789
6790                         r = strv_consume(&l, j);
6791                         if (r < 0)
6792                                 return r;
6793
6794                         /* Also remove private directories unconditionally. */
6795                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6796                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6797                                 if (!j)
6798                                         return -ENOMEM;
6799
6800                                 r = strv_consume(&l, j);
6801                                 if (r < 0)
6802                                         return r;
6803                         }
6804
6805                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6806                                 j = path_join(prefix[t], *symlink);
6807                                 if (!j)
6808                                         return -ENOMEM;
6809
6810                                 r = strv_consume(&l, j);
6811                                 if (r < 0)
6812                                         return r;
6813                         }
6814                 }
6815         }
6816
6817         *ret = TAKE_PTR(l);
6818         return 0;
6819 }
6820
6821 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6822         ExecCleanMask mask = 0;
6823
6824         assert(c);
6825         assert(ret);
6826
6827         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6828                 if (c->directories[t].n_items > 0)
6829                         mask |= 1U << t;
6830
6831         *ret = mask;
6832         return 0;
6833 }
6834
6835 bool exec_context_has_encrypted_credentials(ExecContext *c) {
6836         ExecLoadCredential *load_cred;
6837         ExecSetCredential *set_cred;
6838
6839         assert(c);
6840
6841         HASHMAP_FOREACH(load_cred, c->load_credentials)
6842                 if (load_cred->encrypted)
6843                         return true;
6844
6845         HASHMAP_FOREACH(set_cred, c->set_credentials)
6846                 if (set_cred->encrypted)
6847                         return true;
6848
6849         return false;
6850 }
6851
6852 void exec_status_start(ExecStatus *s, pid_t pid) {
6853         assert(s);
6854
6855         *s = (ExecStatus) {
6856                 .pid = pid,
6857         };
6858
6859         dual_timestamp_get(&s->start_timestamp);
6860 }
6861
6862 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6863         assert(s);
6864
6865         if (s->pid != pid)
6866                 *s = (ExecStatus) {
6867                         .pid = pid,
6868                 };
6869
6870         dual_timestamp_get(&s->exit_timestamp);
6871
6872         s->code = code;
6873         s->status = status;
6874
6875         if (context && context->utmp_id)
6876                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6877 }
6878
6879 void exec_status_reset(ExecStatus *s) {
6880         assert(s);
6881
6882         *s = (ExecStatus) {};
6883 }
6884
6885 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6886         assert(s);
6887         assert(f);
6888
6889         if (s->pid <= 0)
6890                 return;
6891
6892         prefix = strempty(prefix);
6893
6894         fprintf(f,
6895                 "%sPID: "PID_FMT"\n",
6896                 prefix, s->pid);
6897
6898         if (dual_timestamp_is_set(&s->start_timestamp))
6899                 fprintf(f,
6900                         "%sStart Timestamp: %s\n",
6901                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6902
6903         if (dual_timestamp_is_set(&s->exit_timestamp))
6904                 fprintf(f,
6905                         "%sExit Timestamp: %s\n"
6906                         "%sExit Code: %s\n"
6907                         "%sExit Status: %i\n",
6908                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6909                         prefix, sigchld_code_to_string(s->code),
6910                         prefix, s->status);
6911 }
6912
6913 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6914         _cleanup_free_ char *cmd = NULL;
6915         const char *prefix2;
6916
6917         assert(c);
6918         assert(f);
6919
6920         prefix = strempty(prefix);
6921         prefix2 = strjoina(prefix, "\t");
6922
6923         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6924
6925         fprintf(f,
6926                 "%sCommand Line: %s\n",
6927                 prefix, strnull(cmd));
6928
6929         exec_status_dump(&c->exec_status, f, prefix2);
6930 }
6931
6932 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6933         assert(f);
6934
6935         prefix = strempty(prefix);
6936
6937         LIST_FOREACH(command, i, c)
6938                 exec_command_dump(i, f, prefix);
6939 }
6940
6941 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6942         ExecCommand *end;
6943
6944         assert(l);
6945         assert(e);
6946
6947         if (*l) {
6948                 /* It's kind of important, that we keep the order here */
6949                 end = LIST_FIND_TAIL(command, *l);
6950                 LIST_INSERT_AFTER(command, *l, end, e);
6951         } else
6952                 *l = e;
6953 }
6954
6955 int exec_command_set(ExecCommand *c, const char *path, ...) {
6956         va_list ap;
6957         char **l, *p;
6958
6959         assert(c);
6960         assert(path);
6961
6962         va_start(ap, path);
6963         l = strv_new_ap(path, ap);
6964         va_end(ap);
6965
6966         if (!l)
6967                 return -ENOMEM;
6968
6969         p = strdup(path);
6970         if (!p) {
6971                 strv_free(l);
6972                 return -ENOMEM;
6973         }
6974
6975         free_and_replace(c->path, p);
6976
6977         return strv_free_and_replace(c->argv, l);
6978 }
6979
6980 int exec_command_append(ExecCommand *c, const char *path, ...) {
6981         _cleanup_strv_free_ char **l = NULL;
6982         va_list ap;
6983         int r;
6984
6985         assert(c);
6986         assert(path);
6987
6988         va_start(ap, path);
6989         l = strv_new_ap(path, ap);
6990         va_end(ap);
6991
6992         if (!l)
6993                 return -ENOMEM;
6994
6995         r = strv_extend_strv(&c->argv, l, false);
6996         if (r < 0)
6997                 return r;
6998
6999         return 0;
7000 }
7001
7002 static void *remove_tmpdir_thread(void *p) {
7003         _cleanup_free_ char *path = p;
7004
7005         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
7006         return NULL;
7007 }
7008
7009 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7010         if (!rt)
7011                 return NULL;
7012
7013         if (rt->manager)
7014                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7015
7016         rt->id = mfree(rt->id);
7017         rt->tmp_dir = mfree(rt->tmp_dir);
7018         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7019         safe_close_pair(rt->netns_storage_socket);
7020         safe_close_pair(rt->ipcns_storage_socket);
7021         return mfree(rt);
7022 }
7023
7024 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
7025 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7026
7027 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7028         int r;
7029
7030         if (!rt)
7031                 return NULL;
7032
7033         assert(rt->n_ref > 0);
7034         rt->n_ref--;
7035
7036         if (rt->n_ref > 0)
7037                 return NULL;
7038
7039         if (rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
7040                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
7041
7042                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
7043                 if (r < 0)
7044                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
7045                 else
7046                         rt->tmp_dir = NULL;
7047         }
7048
7049         if (rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
7050                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
7051
7052                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
7053                 if (r < 0)
7054                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
7055                 else
7056                         rt->var_tmp_dir = NULL;
7057         }
7058
7059         return exec_shared_runtime_free(rt);
7060 }
7061
7062 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7063         _cleanup_free_ char *id_copy = NULL;
7064         ExecSharedRuntime *n;
7065
7066         assert(ret);
7067
7068         id_copy = strdup(id);
7069         if (!id_copy)
7070                 return -ENOMEM;
7071
7072         n = new(ExecSharedRuntime, 1);
7073         if (!n)
7074                 return -ENOMEM;
7075
7076         *n = (ExecSharedRuntime) {
7077                 .id = TAKE_PTR(id_copy),
7078                 .netns_storage_socket = PIPE_EBADF,
7079                 .ipcns_storage_socket = PIPE_EBADF,
7080         };
7081
7082         *ret = n;
7083         return 0;
7084 }
7085
7086 static int exec_shared_runtime_add(
7087                 Manager *m,
7088                 const char *id,
7089                 char **tmp_dir,
7090                 char **var_tmp_dir,
7091                 int netns_storage_socket[2],
7092                 int ipcns_storage_socket[2],
7093                 ExecSharedRuntime **ret) {
7094
7095         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7096         int r;
7097
7098         assert(m);
7099         assert(id);
7100
7101         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7102
7103         r = exec_shared_runtime_allocate(&rt, id);
7104         if (r < 0)
7105                 return r;
7106
7107         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7108         if (r < 0)
7109                 return r;
7110
7111         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7112         rt->tmp_dir = TAKE_PTR(*tmp_dir);
7113         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7114
7115         if (netns_storage_socket) {
7116                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7117                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7118         }
7119
7120         if (ipcns_storage_socket) {
7121                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7122                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7123         }
7124
7125         rt->manager = m;
7126
7127         if (ret)
7128                 *ret = rt;
7129         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7130         TAKE_PTR(rt);
7131         return 0;
7132 }
7133
7134 static int exec_shared_runtime_make(
7135                 Manager *m,
7136                 const ExecContext *c,
7137                 const char *id,
7138                 ExecSharedRuntime **ret) {
7139
7140         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7141         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7142         int r;
7143
7144         assert(m);
7145         assert(c);
7146         assert(id);
7147
7148         /* It is not necessary to create ExecSharedRuntime object. */
7149         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7150                 *ret = NULL;
7151                 return 0;
7152         }
7153
7154         if (c->private_tmp &&
7155             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7156               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7157                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7158                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7159                 if (r < 0)
7160                         return r;
7161         }
7162
7163         if (exec_needs_network_namespace(c)) {
7164                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7165                         return -errno;
7166         }
7167
7168         if (exec_needs_ipc_namespace(c)) {
7169                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7170                         return -errno;
7171         }
7172
7173         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7174         if (r < 0)
7175                 return r;
7176
7177         return 1;
7178 }
7179
7180 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7181         ExecSharedRuntime *rt;
7182         int r;
7183
7184         assert(m);
7185         assert(id);
7186         assert(ret);
7187
7188         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7189         if (rt)
7190                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7191                 goto ref;
7192
7193         if (!create) {
7194                 *ret = NULL;
7195                 return 0;
7196         }
7197
7198         /* If not found, then create a new object. */
7199         r = exec_shared_runtime_make(m, c, id, &rt);
7200         if (r < 0)
7201                 return r;
7202         if (r == 0) {
7203                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7204                 *ret = NULL;
7205                 return 0;
7206         }
7207
7208 ref:
7209         /* increment reference counter. */
7210         rt->n_ref++;
7211         *ret = rt;
7212         return 1;
7213 }
7214
7215 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7216         ExecSharedRuntime *rt;
7217
7218         assert(m);
7219         assert(f);
7220         assert(fds);
7221
7222         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7223                 fprintf(f, "exec-runtime=%s", rt->id);
7224
7225                 if (rt->tmp_dir)
7226                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7227
7228                 if (rt->var_tmp_dir)
7229                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7230
7231                 if (rt->netns_storage_socket[0] >= 0) {
7232                         int copy;
7233
7234                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7235                         if (copy < 0)
7236                                 return copy;
7237
7238                         fprintf(f, " netns-socket-0=%i", copy);
7239                 }
7240
7241                 if (rt->netns_storage_socket[1] >= 0) {
7242                         int copy;
7243
7244                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7245                         if (copy < 0)
7246                                 return copy;
7247
7248                         fprintf(f, " netns-socket-1=%i", copy);
7249                 }
7250
7251                 if (rt->ipcns_storage_socket[0] >= 0) {
7252                         int copy;
7253
7254                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7255                         if (copy < 0)
7256                                 return copy;
7257
7258                         fprintf(f, " ipcns-socket-0=%i", copy);
7259                 }
7260
7261                 if (rt->ipcns_storage_socket[1] >= 0) {
7262                         int copy;
7263
7264                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7265                         if (copy < 0)
7266                                 return copy;
7267
7268                         fprintf(f, " ipcns-socket-1=%i", copy);
7269                 }
7270
7271                 fputc('\n', f);
7272         }
7273
7274         return 0;
7275 }
7276
7277 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7278         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7279         ExecSharedRuntime *rt;
7280         int r;
7281
7282         /* This is for the migration from old (v237 or earlier) deserialization text.
7283          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7284          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7285          * so or not from the serialized text, then we always creates a new object owned by this. */
7286
7287         assert(u);
7288         assert(key);
7289         assert(value);
7290
7291         /* Manager manages ExecSharedRuntime objects by the unit id.
7292          * So, we omit the serialized text when the unit does not have id (yet?)... */
7293         if (isempty(u->id)) {
7294                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7295                 return 0;
7296         }
7297
7298         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7299                 return log_oom();
7300
7301         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7302         if (!rt) {
7303                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7304                         return log_oom();
7305
7306                 rt = rt_create;
7307         }
7308
7309         if (streq(key, "tmp-dir")) {
7310                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7311                         return -ENOMEM;
7312
7313         } else if (streq(key, "var-tmp-dir")) {
7314                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7315                         return -ENOMEM;
7316
7317         } else if (streq(key, "netns-socket-0")) {
7318                 int fd;
7319
7320                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7321                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7322                         return 0;
7323                 }
7324
7325                 safe_close(rt->netns_storage_socket[0]);
7326                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7327
7328         } else if (streq(key, "netns-socket-1")) {
7329                 int fd;
7330
7331                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7332                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7333                         return 0;
7334                 }
7335
7336                 safe_close(rt->netns_storage_socket[1]);
7337                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7338
7339         } else
7340                 return 0;
7341
7342         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7343         if (rt_create) {
7344                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7345                 if (r < 0) {
7346                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7347                         return 0;
7348                 }
7349
7350                 rt_create->manager = u->manager;
7351
7352                 /* Avoid cleanup */
7353                 TAKE_PTR(rt_create);
7354         }
7355
7356         return 1;
7357 }
7358
7359 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7360         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7361         char *id = NULL;
7362         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7363         const char *p, *v = ASSERT_PTR(value);
7364         size_t n;
7365
7366         assert(m);
7367         assert(fds);
7368
7369         n = strcspn(v, " ");
7370         id = strndupa_safe(v, n);
7371         if (v[n] != ' ')
7372                 goto finalize;
7373         p = v + n + 1;
7374
7375         v = startswith(p, "tmp-dir=");
7376         if (v) {
7377                 n = strcspn(v, " ");
7378                 tmp_dir = strndup(v, n);
7379                 if (!tmp_dir)
7380                         return log_oom();
7381                 if (v[n] != ' ')
7382                         goto finalize;
7383                 p = v + n + 1;
7384         }
7385
7386         v = startswith(p, "var-tmp-dir=");
7387         if (v) {
7388                 n = strcspn(v, " ");
7389                 var_tmp_dir = strndup(v, n);
7390                 if (!var_tmp_dir)
7391                         return log_oom();
7392                 if (v[n] != ' ')
7393                         goto finalize;
7394                 p = v + n + 1;
7395         }
7396
7397         v = startswith(p, "netns-socket-0=");
7398         if (v) {
7399                 char *buf;
7400
7401                 n = strcspn(v, " ");
7402                 buf = strndupa_safe(v, n);
7403
7404                 r = safe_atoi(buf, &netns_fdpair[0]);
7405                 if (r < 0)
7406                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7407                 if (!fdset_contains(fds, netns_fdpair[0]))
7408                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7409                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7410                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7411                 if (v[n] != ' ')
7412                         goto finalize;
7413                 p = v + n + 1;
7414         }
7415
7416         v = startswith(p, "netns-socket-1=");
7417         if (v) {
7418                 char *buf;
7419
7420                 n = strcspn(v, " ");
7421                 buf = strndupa_safe(v, n);
7422
7423                 r = safe_atoi(buf, &netns_fdpair[1]);
7424                 if (r < 0)
7425                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7426                 if (!fdset_contains(fds, netns_fdpair[1]))
7427                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7428                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7429                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7430                 if (v[n] != ' ')
7431                         goto finalize;
7432                 p = v + n + 1;
7433         }
7434
7435         v = startswith(p, "ipcns-socket-0=");
7436         if (v) {
7437                 char *buf;
7438
7439                 n = strcspn(v, " ");
7440                 buf = strndupa_safe(v, n);
7441
7442                 r = safe_atoi(buf, &ipcns_fdpair[0]);
7443                 if (r < 0)
7444                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7445                 if (!fdset_contains(fds, ipcns_fdpair[0]))
7446                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7447                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7448                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7449                 if (v[n] != ' ')
7450                         goto finalize;
7451                 p = v + n + 1;
7452         }
7453
7454         v = startswith(p, "ipcns-socket-1=");
7455         if (v) {
7456                 char *buf;
7457
7458                 n = strcspn(v, " ");
7459                 buf = strndupa_safe(v, n);
7460
7461                 r = safe_atoi(buf, &ipcns_fdpair[1]);
7462                 if (r < 0)
7463                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7464                 if (!fdset_contains(fds, ipcns_fdpair[1]))
7465                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7466                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7467                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7468         }
7469
7470 finalize:
7471         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7472         if (r < 0)
7473                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7474         return 0;
7475 }
7476
7477 void exec_shared_runtime_vacuum(Manager *m) {
7478         ExecSharedRuntime *rt;
7479
7480         assert(m);
7481
7482         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7483
7484         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7485                 if (rt->n_ref > 0)
7486                         continue;
7487
7488                 (void) exec_shared_runtime_free(rt);
7489         }
7490 }
7491
7492 int exec_runtime_make(ExecSharedRuntime *shared, DynamicCreds *creds, ExecRuntime **ret) {
7493         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7494
7495         assert(ret);
7496
7497         if (!shared && !creds) {
7498                 *ret = NULL;
7499                 return 0;
7500         }
7501
7502         rt = new(ExecRuntime, 1);
7503         if (!rt)
7504                 return -ENOMEM;
7505
7506         *rt = (ExecRuntime) {
7507                 .shared = shared,
7508                 .dynamic_creds = creds,
7509         };
7510
7511         *ret = TAKE_PTR(rt);
7512         return 1;
7513 }
7514
7515 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7516         if (!rt)
7517                 return NULL;
7518
7519         exec_shared_runtime_unref(rt->shared);
7520         dynamic_creds_unref(rt->dynamic_creds);
7521         return mfree(rt);
7522 }
7523
7524 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7525         if (!rt)
7526                 return NULL;
7527
7528         rt->shared = exec_shared_runtime_destroy(rt->shared);
7529         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7530         return exec_runtime_free(rt);
7531 }
7532
7533 void exec_params_clear(ExecParameters *p) {
7534         if (!p)
7535                 return;
7536
7537         p->environment = strv_free(p->environment);
7538         p->fd_names = strv_free(p->fd_names);
7539         p->fds = mfree(p->fds);
7540         p->exec_fd = safe_close(p->exec_fd);
7541 }
7542
7543 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7544         if (!sc)
7545                 return NULL;
7546
7547         free(sc->id);
7548         free(sc->data);
7549         return mfree(sc);
7550 }
7551
7552 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7553         if (!lc)
7554                 return NULL;
7555
7556         free(lc->id);
7557         free(lc->path);
7558         return mfree(lc);
7559 }
7560
7561 void exec_directory_done(ExecDirectory *d) {
7562         if (!d)
7563                 return;
7564
7565         for (size_t i = 0; i < d->n_items; i++) {
7566                 free(d->items[i].path);
7567                 strv_free(d->items[i].symlinks);
7568         }
7569
7570         d->items = mfree(d->items);
7571         d->n_items = 0;
7572         d->mode = 0755;
7573 }
7574
7575 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7576         assert(d);
7577         assert(path);
7578
7579         for (size_t i = 0; i < d->n_items; i++)
7580                 if (path_equal(d->items[i].path, path))
7581                         return &d->items[i];
7582
7583         return NULL;
7584 }
7585
7586 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7587         _cleanup_strv_free_ char **s = NULL;
7588         _cleanup_free_ char *p = NULL;
7589         ExecDirectoryItem *existing;
7590         int r;
7591
7592         assert(d);
7593         assert(path);
7594
7595         existing = exec_directory_find(d, path);
7596         if (existing) {
7597                 r = strv_extend(&existing->symlinks, symlink);
7598                 if (r < 0)
7599                         return r;
7600
7601                 return 0; /* existing item is updated */
7602         }
7603
7604         p = strdup(path);
7605         if (!p)
7606                 return -ENOMEM;
7607
7608         if (symlink) {
7609                 s = strv_new(symlink);
7610                 if (!s)
7611                         return -ENOMEM;
7612         }
7613
7614         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7615                 return -ENOMEM;
7616
7617         d->items[d->n_items++] = (ExecDirectoryItem) {
7618                 .path = TAKE_PTR(p),
7619                 .symlinks = TAKE_PTR(s),
7620         };
7621
7622         return 1; /* new item is added */
7623 }
7624
7625 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7626         assert(a);
7627         assert(b);
7628
7629         return path_compare(a->path, b->path);
7630 }
7631
7632 void exec_directory_sort(ExecDirectory *d) {
7633         assert(d);
7634
7635         /* Sort the exec directories to make always parent directories processed at first in
7636          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7637          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7638          * list. See also comments in setup_exec_directory() and issue #24783. */
7639
7640         if (d->n_items <= 1)
7641                 return;
7642
7643         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7644
7645         for (size_t i = 1; i < d->n_items; i++)
7646                 for (size_t j = 0; j < i; j++)
7647                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7648                                 d->items[i].only_create = true;
7649                                 break;
7650                         }
7651 }
7652
7653 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7654         ExecDirectoryType t;
7655
7656         assert(s);
7657
7658         if (streq(s, "all"))
7659                 return EXEC_CLEAN_ALL;
7660         if (streq(s, "fdstore"))
7661                 return EXEC_CLEAN_FDSTORE;
7662
7663         t = exec_resource_type_from_string(s);
7664         if (t < 0)
7665                 return (ExecCleanMask) t;
7666
7667         return 1U << t;
7668 }
7669
7670 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7671 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7672
7673 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7674         [EXEC_INPUT_NULL] = "null",
7675         [EXEC_INPUT_TTY] = "tty",
7676         [EXEC_INPUT_TTY_FORCE] = "tty-force",
7677         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7678         [EXEC_INPUT_SOCKET] = "socket",
7679         [EXEC_INPUT_NAMED_FD] = "fd",
7680         [EXEC_INPUT_DATA] = "data",
7681         [EXEC_INPUT_FILE] = "file",
7682 };
7683
7684 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7685
7686 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7687         [EXEC_OUTPUT_INHERIT] = "inherit",
7688         [EXEC_OUTPUT_NULL] = "null",
7689         [EXEC_OUTPUT_TTY] = "tty",
7690         [EXEC_OUTPUT_KMSG] = "kmsg",
7691         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7692         [EXEC_OUTPUT_JOURNAL] = "journal",
7693         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7694         [EXEC_OUTPUT_SOCKET] = "socket",
7695         [EXEC_OUTPUT_NAMED_FD] = "fd",
7696         [EXEC_OUTPUT_FILE] = "file",
7697         [EXEC_OUTPUT_FILE_APPEND] = "append",
7698         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7699 };
7700
7701 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7702
7703 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7704         [EXEC_UTMP_INIT] = "init",
7705         [EXEC_UTMP_LOGIN] = "login",
7706         [EXEC_UTMP_USER] = "user",
7707 };
7708
7709 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7710
7711 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7712         [EXEC_PRESERVE_NO] = "no",
7713         [EXEC_PRESERVE_YES] = "yes",
7714         [EXEC_PRESERVE_RESTART] = "restart",
7715 };
7716
7717 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7718
7719 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7720 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7721         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7722         [EXEC_DIRECTORY_STATE] = "StateDirectory",
7723         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7724         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7725         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7726 };
7727
7728 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7729
/* This table maps each ExecDirectoryType to the name of the unit setting its symlinks are configured
 * with. The entries parallel those of exec_directory_type_table, with a "Symlink" suffix appended. */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7740
7741 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7742  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7743  * directories, specifically .timer units with their timestamp touch file. */
7744 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7745         [EXEC_DIRECTORY_RUNTIME] = "runtime",
7746         [EXEC_DIRECTORY_STATE] = "state",
7747         [EXEC_DIRECTORY_CACHE] = "cache",
7748         [EXEC_DIRECTORY_LOGS] = "logs",
7749         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7750 };
7751
7752 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7753
7754 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7755  * the service payload in. */
7756 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7757         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7758         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7759         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7760         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7761         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7762 };
7763
7764 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7765
7766 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7767         [EXEC_KEYRING_INHERIT] = "inherit",
7768         [EXEC_KEYRING_PRIVATE] = "private",
7769         [EXEC_KEYRING_SHARED] = "shared",
7770 };
7771
7772 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);