src/core/exec-invoke.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <grp.h>
   4 #include <linux/ioprio.h>
   5 #include <linux/prctl.h>
   6 #include <linux/sched.h>
   7 #include <linux/securebits.h>
   8 #include <poll.h>
   9 #include <sys/eventfd.h>
  10 #include <sys/ioctl.h>
  11 #include <sys/mount.h>
  12 #include <sys/prctl.h>
  13 #include <unistd.h>
  14
  15 #if HAVE_PAM
  16 #include <security/pam_appl.h>
  17 #endif
  18
  19 #include "sd-messages.h"
  20
  21 #include "apparmor-util.h"
  22 #include "argv-util.h"
  23 #include "ask-password-api.h"
  24 #include "barrier.h"
  25 #include "bitfield.h"
  26 #include "bpf-dlopen.h"
  27 #include "bpf-restrict-fs.h"
  28 #include "btrfs-util.h"
  29 #include "capability-util.h"
  30 #include "cgroup-setup.h"
  31 #include "cgroup.h"
  32 #include "chase.h"
  33 #include "chown-recursive.h"
  34 #include "constants.h"
  35 #include "copy.h"
  36 #include "coredump-util.h"
  37 #include "dissect-image.h"
  38 #include "dynamic-user.h"
  39 #include "env-util.h"
  40 #include "escape.h"
  41 #include "exec-credential.h"
  42 #include "exec-invoke.h"
  43 #include "execute.h"
  44 #include "exit-status.h"
  45 #include "fd-util.h"
  46 #include "fs-util.h"
  47 #include "hexdecoct.h"
  48 #include "hostname-setup.h"
  49 #include "image-policy.h"
  50 #include "io-util.h"
  51 #include "iovec-util.h"
  52 #include "journal-send.h"
  53 #include "manager.h"
  54 #include "memfd-util.h"
  55 #include "missing_sched.h"
  56 #include "missing_syscall.h"
  57 #include "mkdir-label.h"
  58 #include "mount-util.h"
  59 #include "namespace-util.h"
  60 #include "nsflags.h"
  61 #include "open-file.h"
  62 #include "osc-context.h"
  63 #include "path-util.h"
  64 #include "pidref.h"
  65 #include "proc-cmdline.h"
  66 #include "process-util.h"
  67 #include "psi-util.h"
  68 #include "rlimit-util.h"
  69 #include "seccomp-util.h"
  70 #include "selinux-util.h"
  71 #include "set.h"
  72 #include "signal-util.h"
  73 #include "smack-util.h"
  74 #include "socket-util.h"
  75 #include "stat-util.h"
  76 #include "string-table.h"
  77 #include "strv.h"
  78 #include "terminal-util.h"
  79 #include "user-util.h"
  80 #include "utmp-wtmp.h"
  81 #include "vpick.h"
  82
/* Two timeouts, five times and one time USEC_PER_SEC respectively.
 * NOTE(review): the users of these constants are not visible in this part of the file — confirm
 * what exactly they bound before relying on this comment. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket, see
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
  87
  88 static int flag_fds(
  89                 const int fds[],
  90                 size_t n_socket_fds,
  91                 size_t n_fds,
  92                 bool nonblock) {
  93
  94         int r;
  95
  96         assert(fds || n_fds == 0);
  97
  98         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
  99          * O_NONBLOCK only applies to socket activation though. */
 100
 101         for (size_t i = 0; i < n_fds; i++) {
 102
 103                 if (i < n_socket_fds) {
 104                         r = fd_nonblock(fds[i], nonblock);
 105                         if (r < 0)
 106                                 return r;
 107                 }
 108
 109                 /* We unconditionally drop FD_CLOEXEC from the fds,
 110                  * since after all we want to pass these fds to our
 111                  * children */
 112
 113                 r = fd_cloexec(fds[i], false);
 114                 if (r < 0)
 115                         return r;
 116         }
 117
 118         return 0;
 119 }
 120
 121 static bool is_terminal_input(ExecInput i) {
 122         return IN_SET(i,
 123                       EXEC_INPUT_TTY,
 124                       EXEC_INPUT_TTY_FORCE,
 125                       EXEC_INPUT_TTY_FAIL);
 126 }
 127
 128 static bool is_terminal_output(ExecOutput o) {
 129         return IN_SET(o,
 130                       EXEC_OUTPUT_TTY,
 131                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 132                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 133 }
 134
 135 static bool is_kmsg_output(ExecOutput o) {
 136         return IN_SET(o,
 137                       EXEC_OUTPUT_KMSG,
 138                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 139 }
 140
 141 static int open_null_as(int flags, int nfd) {
 142         int fd;
 143
 144         assert(nfd >= 0);
 145
 146         fd = open("/dev/null", flags|O_NOCTTY);
 147         if (fd < 0)
 148                 return -errno;
 149
 150         return move_fd(fd, nfd, false);
 151 }
 152
 153 static int connect_journal_socket(
 154                 int fd,
 155                 const char *log_namespace,
 156                 uid_t uid,
 157                 gid_t gid) {
 158
 159         uid_t olduid = UID_INVALID;
 160         gid_t oldgid = GID_INVALID;
 161         const char *j;
 162         int r;
 163
 164         assert(fd >= 0);
 165
 166         j = journal_stream_path(log_namespace);
 167         if (!j)
 168                 return -EINVAL;
 169
 170         if (gid_is_valid(gid)) {
 171                 oldgid = getgid();
 172
 173                 if (setegid(gid) < 0)
 174                         return -errno;
 175         }
 176
 177         if (uid_is_valid(uid)) {
 178                 olduid = getuid();
 179
 180                 if (seteuid(uid) < 0) {
 181                         r = -errno;
 182                         goto restore_gid;
 183                 }
 184         }
 185
 186         r = connect_unix_path(fd, AT_FDCWD, j);
 187
 188         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 189            an LSM interferes. */
 190
 191         if (uid_is_valid(uid))
 192                 (void) seteuid(olduid);
 193
 194  restore_gid:
 195         if (gid_is_valid(gid))
 196                 (void) setegid(oldgid);
 197
 198         return r;
 199 }
 200
 201 static int connect_logger_as(
 202                 const ExecContext *context,
 203                 const ExecParameters *params,
 204                 ExecOutput output,
 205                 const char *ident,
 206                 int nfd,
 207                 uid_t uid,
 208                 gid_t gid) {
 209
 210         _cleanup_close_ int fd = -EBADF;
 211         int r;
 212
 213         assert(context);
 214         assert(params);
 215         assert(output < _EXEC_OUTPUT_MAX);
 216         assert(ident);
 217         assert(nfd >= 0);
 218
 219         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 220         if (fd < 0)
 221                 return -errno;
 222
 223         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 224         if (r < 0)
 225                 return r;
 226
 227         if (shutdown(fd, SHUT_RD) < 0)
 228                 return -errno;
 229
 230         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 231
 232         if (dprintf(fd,
 233                 "%s\n"
 234                 "%s\n"
 235                 "%i\n"
 236                 "%i\n"
 237                 "%i\n"
 238                 "%i\n"
 239                 "%i\n",
 240                 context->syslog_identifier ?: ident,
 241                 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
 242                 context->syslog_priority,
 243                 !!context->syslog_level_prefix,
 244                 false,
 245                 is_kmsg_output(output),
 246                 is_terminal_output(output)) < 0)
 247                 return -errno;
 248
 249         return move_fd(TAKE_FD(fd), nfd, false);
 250 }
 251
 252 static int open_terminal_as(const char *path, int flags, int nfd) {
 253         int fd;
 254
 255         assert(path);
 256         assert(nfd >= 0);
 257
 258         fd = open_terminal(path, flags | O_NOCTTY);
 259         if (fd < 0)
 260                 return fd;
 261
 262         return move_fd(fd, nfd, false);
 263 }
 264
 265 static int acquire_path(const char *path, int flags, mode_t mode) {
 266         _cleanup_close_ int fd = -EBADF;
 267         int r;
 268
 269         assert(path);
 270
 271         if (IN_SET(flags & O_ACCMODE_STRICT, O_WRONLY, O_RDWR))
 272                 flags |= O_CREAT;
 273
 274         fd = open(path, flags|O_NOCTTY, mode);
 275         if (fd >= 0)
 276                 return TAKE_FD(fd);
 277
 278         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 279                 return -errno;
 280
 281         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 282
 283         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 284         if (fd < 0)
 285                 return -errno;
 286
 287         r = connect_unix_path(fd, AT_FDCWD, path);
 288         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 289                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 290                  * wasn't an AF_UNIX socket after all */
 291                 return -ENXIO;
 292         if (r < 0)
 293                 return r;
 294
 295         if ((flags & O_ACCMODE_STRICT) == O_RDONLY)
 296                 r = shutdown(fd, SHUT_WR);
 297         else if ((flags & O_ACCMODE_STRICT) == O_WRONLY)
 298                 r = shutdown(fd, SHUT_RD);
 299         else
 300                 r = 0;
 301         if (r < 0)
 302                 return -errno;
 303
 304         return TAKE_FD(fd);
 305 }
 306
 307 static int fixup_input(
 308                 const ExecContext *context,
 309                 int socket_fd,
 310                 bool apply_tty_stdin) {
 311
 312         ExecInput std_input;
 313
 314         assert(context);
 315
 316         std_input = context->std_input;
 317
 318         if (is_terminal_input(std_input) && !apply_tty_stdin)
 319                 return EXEC_INPUT_NULL;
 320
 321         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 322                 return EXEC_INPUT_NULL;
 323
 324         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 325                 return EXEC_INPUT_NULL;
 326
 327         return std_input;
 328 }
 329
 330 static int fixup_output(ExecOutput output, int socket_fd) {
 331
 332         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 333                 return EXEC_OUTPUT_INHERIT;
 334
 335         return output;
 336 }
 337
 338 static int setup_input(
 339                 const ExecContext *context,
 340                 const ExecParameters *params,
 341                 int socket_fd,
 342                 const int named_iofds[static 3]) {
 343
 344         ExecInput i;
 345         int r;
 346
 347         assert(context);
 348         assert(params);
 349         assert(named_iofds);
 350
 351         if (params->stdin_fd >= 0) {
 352                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 353                         return -errno;
 354
 355                 /* Try to make this our controlling tty, if it is a tty */
 356                 if (isatty_safe(STDIN_FILENO) && ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE) < 0)
 357                         log_debug_errno(errno, "Failed to make standard input TTY our controlling terminal: %m");
 358
 359                 return STDIN_FILENO;
 360         }
 361
 362         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 363
 364         switch (i) {
 365
 366         case EXEC_INPUT_NULL:
 367                 return open_null_as(O_RDONLY, STDIN_FILENO);
 368
 369         case EXEC_INPUT_TTY:
 370         case EXEC_INPUT_TTY_FORCE:
 371         case EXEC_INPUT_TTY_FAIL: {
 372                 _cleanup_close_ int tty_fd = -EBADF;
 373                 _cleanup_free_ char *resolved = NULL;
 374                 const char *tty_path;
 375
 376                 tty_path = ASSERT_PTR(exec_context_tty_path(context));
 377
 378                 if (tty_is_console(tty_path)) {
 379                         r = resolve_dev_console(&resolved);
 380                         if (r < 0)
 381                                 log_debug_errno(r, "Failed to resolve /dev/console, ignoring: %m");
 382                         else {
 383                                 log_debug("Resolved /dev/console to %s", resolved);
 384                                 tty_path = resolved;
 385                         }
 386                 }
 387
 388                 tty_fd = acquire_terminal(tty_path,
 389                                           i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 390                                           i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 391                                                                       ACQUIRE_TERMINAL_WAIT,
 392                                           USEC_INFINITY);
 393                 if (tty_fd < 0)
 394                         return tty_fd;
 395
 396                 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
 397                 if (r < 0)
 398                         return r;
 399
 400                 TAKE_FD(tty_fd);
 401                 return r;
 402         }
 403
 404         case EXEC_INPUT_SOCKET:
 405                 assert(socket_fd >= 0);
 406
 407                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 408
 409         case EXEC_INPUT_NAMED_FD:
 410                 assert(named_iofds[STDIN_FILENO] >= 0);
 411
 412                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 413                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 414
 415         case EXEC_INPUT_DATA: {
 416                 int fd;
 417
 418                 fd = memfd_new_and_seal("exec-input", context->stdin_data, context->stdin_data_size);
 419                 if (fd < 0)
 420                         return fd;
 421
 422                 return move_fd(fd, STDIN_FILENO, false);
 423         }
 424
 425         case EXEC_INPUT_FILE: {
 426                 bool rw;
 427                 int fd;
 428
 429                 assert(context->stdio_file[STDIN_FILENO]);
 430
 431                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 432                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 433
 434                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 435                 if (fd < 0)
 436                         return fd;
 437
 438                 return move_fd(fd, STDIN_FILENO, false);
 439         }
 440
 441         default:
 442                 assert_not_reached();
 443         }
 444 }
 445
 446 static bool can_inherit_stderr_from_stdout(
 447                 const ExecContext *context,
 448                 ExecOutput o,
 449                 ExecOutput e) {
 450
 451         assert(context);
 452
 453         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 454          * stderr fd */
 455
 456         if (e == EXEC_OUTPUT_INHERIT)
 457                 return true;
 458         if (e != o)
 459                 return false;
 460
 461         if (e == EXEC_OUTPUT_NAMED_FD)
 462                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 463
 464         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 465                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 466
 467         return true;
 468 }
 469
 470 static int setup_output(
 471                 const ExecContext *context,
 472                 const ExecParameters *params,
 473                 int fileno,
 474                 int socket_fd,
 475                 const int named_iofds[static 3],
 476                 const char *ident,
 477                 uid_t uid,
 478                 gid_t gid,
 479                 dev_t *journal_stream_dev,
 480                 ino_t *journal_stream_ino) {
 481
 482         ExecOutput o;
 483         ExecInput i;
 484         int r;
 485
 486         assert(context);
 487         assert(params);
 488         assert(ident);
 489         assert(journal_stream_dev);
 490         assert(journal_stream_ino);
 491
 492         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 493
 494                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 495                         return -errno;
 496
 497                 return STDOUT_FILENO;
 498         }
 499
 500         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 501                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 502                         return -errno;
 503
 504                 return STDERR_FILENO;
 505         }
 506
 507         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 508         o = fixup_output(context->std_output, socket_fd);
 509
 510         // FIXME: we probably should spend some time here to verify that if we inherit an fd from stdin
 511         // (possibly indirect via inheritance from stdout) it is actually opened for write!
 512
 513         if (fileno == STDERR_FILENO) {
 514                 ExecOutput e;
 515                 e = fixup_output(context->std_error, socket_fd);
 516
 517                 /* This expects the input and output are already set up */
 518
 519                 /* Don't change the stderr file descriptor if we inherit all
 520                  * the way and are not on a tty */
 521                 if (e == EXEC_OUTPUT_INHERIT &&
 522                     o == EXEC_OUTPUT_INHERIT &&
 523                     i == EXEC_INPUT_NULL &&
 524                     !is_terminal_input(context->std_input) &&
 525                     getppid() != 1)
 526                         return fileno;
 527
 528                 /* Duplicate from stdout if possible */
 529                 if (can_inherit_stderr_from_stdout(context, o, e))
 530                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 531
 532                 o = e;
 533
 534         } else if (o == EXEC_OUTPUT_INHERIT) {
 535                 /* If input got downgraded, inherit the original value */
 536                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 537                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 538
 539                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 540                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 541                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 542
 543                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 544                 if (getppid() != 1)
 545                         return fileno;
 546
 547                 /* We need to open /dev/null here anew, to get the right access mode. */
 548                 return open_null_as(O_WRONLY, fileno);
 549         }
 550
 551         switch (o) {
 552
 553         case EXEC_OUTPUT_NULL:
 554                 return open_null_as(O_WRONLY, fileno);
 555
 556         case EXEC_OUTPUT_TTY:
 557                 if (is_terminal_input(i))
 558                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 559
 560                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 561
 562         case EXEC_OUTPUT_KMSG:
 563         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 564         case EXEC_OUTPUT_JOURNAL:
 565         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 566                 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
 567                 if (r < 0) {
 568                         log_warning_errno(r, "Failed to connect %s to the journal socket, ignoring: %m",
 569                                           fileno == STDOUT_FILENO ? "stdout" : "stderr");
 570                         r = open_null_as(O_WRONLY, fileno);
 571                 } else {
 572                         struct stat st;
 573
 574                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 575                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 576                          * services to detect whether they are connected to the journal or not.
 577                          *
 578                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 579                          * about STDERR as that's usually the best way to do logging. */
 580
 581                         if (fstat(fileno, &st) >= 0 &&
 582                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 583                                 *journal_stream_dev = st.st_dev;
 584                                 *journal_stream_ino = st.st_ino;
 585                         }
 586                 }
 587                 return r;
 588
 589         case EXEC_OUTPUT_SOCKET:
 590                 assert(socket_fd >= 0);
 591
 592                 return RET_NERRNO(dup2(socket_fd, fileno));
 593
 594         case EXEC_OUTPUT_NAMED_FD:
 595                 assert(named_iofds[fileno] >= 0);
 596
 597                 (void) fd_nonblock(named_iofds[fileno], false);
 598                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 599
 600         case EXEC_OUTPUT_FILE:
 601         case EXEC_OUTPUT_FILE_APPEND:
 602         case EXEC_OUTPUT_FILE_TRUNCATE: {
 603                 bool rw;
 604                 int fd, flags;
 605
 606                 assert(context->stdio_file[fileno]);
 607
 608                 rw = context->std_input == EXEC_INPUT_FILE &&
 609                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 610
 611                 if (rw)
 612                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 613
 614                 flags = O_WRONLY;
 615                 if (o == EXEC_OUTPUT_FILE_APPEND)
 616                         flags |= O_APPEND;
 617                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 618                         flags |= O_TRUNC;
 619
 620                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 621                 if (fd < 0)
 622                         return fd;
 623
 624                 return move_fd(fd, fileno, 0);
 625         }
 626
 627         default:
 628                 assert_not_reached();
 629         }
 630 }
 631
 632 static int chown_terminal(int fd, uid_t uid) {
 633         int r;
 634
 635         assert(fd >= 0);
 636
 637         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 638         if (!isatty_safe(fd))
 639                 return 0;
 640
 641         /* This might fail. What matters are the results. */
 642         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 643         if (r < 0)
 644                 return r;
 645
 646         return 1;
 647 }
 648
 649 static int setup_confirm_stdio(
 650                 const ExecContext *context,
 651                 const char *vc,
 652                 int *ret_saved_stdin,
 653                 int *ret_saved_stdout) {
 654
 655         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 656         int r;
 657
 658         assert(context);
 659         assert(ret_saved_stdin);
 660         assert(ret_saved_stdout);
 661
 662         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD_CLOEXEC, 3);
 663         if (saved_stdin < 0)
 664                 return -errno;
 665
 666         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD_CLOEXEC, 3);
 667         if (saved_stdout < 0)
 668                 return -errno;
 669
 670         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 671         if (fd < 0)
 672                 return fd;
 673
 674         _cleanup_close_ int lock_fd = lock_dev_console();
 675         if (lock_fd < 0)
 676                 log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
 677
 678         r = chown_terminal(fd, getuid());
 679         if (r < 0)
 680                 return r;
 681
 682         r = terminal_reset_defensive(fd, TERMINAL_RESET_SWITCH_TO_TEXT);
 683         if (r < 0)
 684                 return r;
 685
 686         r = exec_context_apply_tty_size(context, fd, fd, vc);
 687         if (r < 0)
 688                 return r;
 689
 690         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 691         TAKE_FD(fd);
 692         if (r < 0)
 693                 return r;
 694
 695         *ret_saved_stdin = TAKE_FD(saved_stdin);
 696         *ret_saved_stdout = TAKE_FD(saved_stdout);
 697         return 0;
 698 }
 699
 700 static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
 701         assert(err != 0);
 702         assert(fd >= 0);
 703         assert(unit_id);
 704
 705         errno = abs(err);
 706
 707         if (errno == ETIMEDOUT)
 708                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
 709         else
 710                 dprintf(fd, "Couldn't ask confirmation for %s, assuming positive response: %m\n", unit_id);
 711 }
 712
 713 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
 714         _cleanup_close_ int fd = -EBADF;
 715
 716         assert(vc);
 717
 718         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 719         if (fd < 0)
 720                 return;
 721
 722         write_confirm_error_fd(err, fd, unit_id);
 723 }
 724
 725 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 726         int r = 0;
 727
 728         assert(saved_stdin);
 729         assert(saved_stdout);
 730
 731         release_terminal();
 732
 733         if (*saved_stdin >= 0)
 734                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 735                         r = -errno;
 736
 737         if (*saved_stdout >= 0)
 738                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 739                         r = -errno;
 740
 741         *saved_stdin = safe_close(*saved_stdin);
 742         *saved_stdout = safe_close(*saved_stdout);
 743
 744         return r;
 745 }
 746
/* Possible outcomes of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also used when asking isn't possible */
};
 752
 753 static bool confirm_spawn_disabled(void) {
 754         return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
 755 }
 756
 757 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
 758         int saved_stdout = -EBADF, saved_stdin = -EBADF, r;
 759         _cleanup_free_ char *e = NULL;
 760         char c;
 761
 762         assert(context);
 763         assert(params);
 764
 765         /* For any internal errors, assume a positive response. */
 766         r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
 767         if (r < 0) {
 768                 write_confirm_error(r, params->confirm_spawn, params->unit_id);
 769                 return CONFIRM_EXECUTE;
 770         }
 771
 772         /* confirm_spawn might have been disabled while we were sleeping. */
 773         if (!params->confirm_spawn || confirm_spawn_disabled()) {
 774                 r = 1;
 775                 goto restore_stdio;
 776         }
 777
 778         e = ellipsize(cmdline, 60, 100);
 779         if (!e) {
 780                 log_oom();
 781                 r = CONFIRM_EXECUTE;
 782                 goto restore_stdio;
 783         }
 784
 785         for (;;) {
 786                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 787                 if (r < 0) {
 788                         write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
 789                         r = CONFIRM_EXECUTE;
 790                         goto restore_stdio;
 791                 }
 792
 793                 switch (c) {
 794                 case 'c':
 795                         printf("Resuming normal execution.\n");
 796                         manager_disable_confirm_spawn();
 797                         r = 1;
 798                         break;
 799                 case 'D':
 800                         printf("  Unit: %s\n",
 801                                params->unit_id);
 802                         exec_context_dump(context, stdout, "  ");
 803                         exec_params_dump(params, stdout, "  ");
 804                         continue; /* ask again */
 805                 case 'f':
 806                         printf("Failing execution.\n");
 807                         r = CONFIRM_PRETEND_FAILURE;
 808                         break;
 809                 case 'h':
 810                         printf("  c - continue, proceed without asking anymore\n"
 811                                "  D - dump, show the state of the unit\n"
 812                                "  f - fail, don't execute the command and pretend it failed\n"
 813                                "  h - help\n"
 814                                "  i - info, show a short summary of the unit\n"
 815                                "  j - jobs, show jobs that are in progress\n"
 816                                "  s - skip, don't execute the command and pretend it succeeded\n"
 817                                "  y - yes, execute the command\n");
 818                         continue; /* ask again */
 819                 case 'i':
 820                         printf("  Unit:        %s\n"
 821                                "  Command:     %s\n",
 822                                params->unit_id, cmdline);
 823                         continue; /* ask again */
 824                 case 'j':
 825                         if (sigqueue(getppid(),
 826                                      SIGRTMIN+18,
 827                                      (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
 828                                 return -errno;
 829
 830                         continue; /* ask again */
 831                 case 'n':
 832                         /* 'n' was removed in favor of 'f'. */
 833                         printf("Didn't understand 'n', did you mean 'f'?\n");
 834                         continue; /* ask again */
 835                 case 's':
 836                         printf("Skipping execution.\n");
 837                         r = CONFIRM_PRETEND_SUCCESS;
 838                         break;
 839                 case 'y':
 840                         r = CONFIRM_EXECUTE;
 841                         break;
 842                 default:
 843                         assert_not_reached();
 844                 }
 845                 break;
 846         }
 847
 848 restore_stdio:
 849         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 850         return r;
 851 }
 852
 853 static int get_fixed_user(
 854                 const char *user_or_uid,
 855                 bool prefer_nss,
 856                 const char **ret_username,
 857                 uid_t *ret_uid,
 858                 gid_t *ret_gid,
 859                 const char **ret_home,
 860                 const char **ret_shell) {
 861
 862         int r;
 863
 864         assert(user_or_uid);
 865         assert(ret_username);
 866
 867         r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell,
 868                            USER_CREDS_CLEAN|(prefer_nss ? USER_CREDS_PREFER_NSS : 0));
 869         if (r < 0)
 870                 return r;
 871
 872         /* user_or_uid is normalized by get_user_creds to username */
 873         *ret_username = user_or_uid;
 874
 875         return 0;
 876 }
 877
 878 static int get_fixed_group(
 879                 const char *group_or_gid,
 880                 const char **ret_groupname,
 881                 gid_t *ret_gid) {
 882
 883         int r;
 884
 885         assert(group_or_gid);
 886         assert(ret_groupname);
 887
 888         r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
 889         if (r < 0)
 890                 return r;
 891
 892         /* group_or_gid is normalized by get_group_creds to groupname */
 893         *ret_groupname = group_or_gid;
 894
 895         return 0;
 896 }
 897
 898 static int get_supplementary_groups(
 899                 const ExecContext *c,
 900                 const char *user,
 901                 gid_t gid,
 902                 gid_t **ret_gids) {
 903
 904         int r;
 905
 906         assert(c);
 907         assert(ret_gids);
 908
 909         /*
 910          * If user is given, then lookup GID and supplementary groups list.
 911          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 912          * here and as early as possible so we keep the list of supplementary
 913          * groups of the caller.
 914          */
 915         bool keep_groups = false;
 916         if (user && gid_is_valid(gid) && gid != 0) {
 917                 /* First step, initialize groups from /etc/groups */
 918                 if (initgroups(user, gid) < 0)
 919                         return -errno;
 920
 921                 keep_groups = true;
 922         }
 923
 924         if (strv_isempty(c->supplementary_groups)) {
 925                 *ret_gids = NULL;
 926                 return 0;
 927         }
 928
 929         /*
 930          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 931          * be positive, otherwise fail.
 932          */
 933         errno = 0;
 934         int ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 935         if (ngroups_max <= 0)
 936                 return errno_or_else(EOPNOTSUPP);
 937
 938         _cleanup_free_ gid_t *l_gids = new(gid_t, ngroups_max);
 939         if (!l_gids)
 940                 return -ENOMEM;
 941
 942         int k = 0;
 943         if (keep_groups) {
 944                 /*
 945                  * Lookup the list of groups that the user belongs to, we
 946                  * avoid NSS lookups here too for gid=0.
 947                  */
 948                 k = ngroups_max;
 949                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 950                         return -EINVAL;
 951         }
 952
 953         STRV_FOREACH(i, c->supplementary_groups) {
 954                 if (k >= ngroups_max)
 955                         return -E2BIG;
 956
 957                 const char *g = *i;
 958                 r = get_group_creds(&g, l_gids + k, /* flags = */ 0);
 959                 if (r < 0)
 960                         return r;
 961
 962                 k++;
 963         }
 964
 965         if (k == 0) {
 966                 *ret_gids = NULL;
 967                 return 0;
 968         }
 969
 970         /* Otherwise get the final list of supplementary groups */
 971         gid_t *groups = newdup(gid_t, l_gids, k);
 972         if (!groups)
 973                 return -ENOMEM;
 974
 975         *ret_gids = groups;
 976         return k;
 977 }
 978
 979 static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
 980         int r;
 981
 982         /* Handle SupplementaryGroups= if it is not empty */
 983         if (ngids > 0) {
 984                 r = maybe_setgroups(ngids, supplementary_gids);
 985                 if (r < 0)
 986                         return r;
 987         }
 988
 989         if (gid_is_valid(gid)) {
 990                 /* Then set our gids */
 991                 if (setresgid(gid, gid, gid) < 0)
 992                         return -errno;
 993         }
 994
 995         return 0;
 996 }
 997
 998 static int set_securebits(unsigned bits, unsigned mask) {
 999         unsigned applied;
1000         int current;
1001
1002         current = prctl(PR_GET_SECUREBITS);
1003         if (current < 0)
1004                 return -errno;
1005
1006         /* Clear all securebits defined in mask and set bits */
1007         applied = ((unsigned) current & ~mask) | bits;
1008         if ((unsigned) current == applied)
1009                 return 0;
1010
1011         if (prctl(PR_SET_SECUREBITS, applied) < 0)
1012                 return -errno;
1013
1014         return 1;
1015 }
1016
1017 static int enforce_user(
1018                 const ExecContext *context,
1019                 uid_t uid,
1020                 uint64_t capability_ambient_set) {
1021
1022         int r;
1023
1024         assert(context);
1025
1026         if (!uid_is_valid(uid))
1027                 return 0;
1028
1029         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1030          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1031          * case. */
1032
1033         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1034
1035                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1036                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1037                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1038                 if (r < 0)
1039                         return r;
1040         }
1041
1042         /* Second step: actually set the uids */
1043         if (setresuid(uid, uid, uid) < 0)
1044                 return -errno;
1045
1046         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1047          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1048          * outside of this call. */
1049         return 0;
1050 }
1051
1052 #if HAVE_PAM
1053
1054 static void pam_response_free_array(struct pam_response *responses, size_t n_responses) {
1055         assert(responses || n_responses == 0);
1056
1057         FOREACH_ARRAY(resp, responses, n_responses)
1058                 erase_and_free(resp->resp);
1059
1060         free(responses);
1061 }
1062
/* Userdata handed to ask_password_conv() via the appdata pointer of the PAM conversation. */
typedef struct AskPasswordConvData {
        const ExecContext *context;   /* read for pam_name and for the credentials directory lookup */
        const ExecParameters *params; /* read for unit_id and for the credentials directory lookup */
} AskPasswordConvData;
1067
1068 static int ask_password_conv(
1069                 int num_msg,
1070                 const struct pam_message *msg[],
1071                 struct pam_response **ret,
1072                 void *userdata) {
1073
1074         AskPasswordConvData *data = ASSERT_PTR(userdata);
1075         bool set_credential_env_var = false;
1076         int r;
1077
1078         assert(num_msg >= 0);
1079         assert(msg);
1080         assert(data->context);
1081         assert(data->params);
1082
1083         size_t n = num_msg;
1084         struct pam_response *responses = new0(struct pam_response, n);
1085         if (!responses)
1086                 return PAM_BUF_ERR;
1087         CLEANUP_ARRAY(responses, n, pam_response_free_array);
1088
1089         for (size_t i = 0; i < n; i++) {
1090                 const struct pam_message *mi = *msg + i;
1091
1092                 switch (mi->msg_style) {
1093
1094                 case PAM_PROMPT_ECHO_ON:
1095                 case PAM_PROMPT_ECHO_OFF: {
1096
1097                         /* Locally set the $CREDENTIALS_DIRECTORY to the credentials directory we just populated */
1098                         if (!set_credential_env_var) {
1099                                 _cleanup_free_ char *creds_dir = NULL;
1100                                 r = exec_context_get_credential_directory(data->context, data->params, data->params->unit_id, &creds_dir);
1101                                 if (r < 0)
1102                                         return log_error_errno(r, "Failed to determine credentials directory: %m");
1103
1104                                 if (creds_dir) {
1105                                         if (setenv("CREDENTIALS_DIRECTORY", creds_dir, /* overwrite= */ true) < 0)
1106                                                 return log_error_errno(r, "Failed to set $CREDENTIALS_DIRECTORY: %m");
1107                                 } else
1108                                         (void) unsetenv("CREDENTIALS_DIRECTORY");
1109
1110                                 set_credential_env_var = true;
1111                         }
1112
1113                         _cleanup_free_ char *credential_name = strjoin("pam.authtok.", data->context->pam_name);
1114                         if (!credential_name)
1115                                 return log_oom();
1116
1117                         AskPasswordRequest req = {
1118                                 .message = mi->msg,
1119                                 .credential = credential_name,
1120                                 .tty_fd = -EBADF,
1121                                 .hup_fd = -EBADF,
1122                                 .until = usec_add(now(CLOCK_MONOTONIC), 15 * USEC_PER_SEC),
1123                         };
1124
1125                         _cleanup_strv_free_erase_ char **acquired = NULL;
1126                         r = ask_password_auto(
1127                                         &req,
1128                                         ASK_PASSWORD_ACCEPT_CACHED|
1129                                         ASK_PASSWORD_NO_TTY|
1130                                         (mi->msg_style == PAM_PROMPT_ECHO_ON ? ASK_PASSWORD_ECHO : 0),
1131                                         &acquired);
1132                         if (r < 0) {
1133                                 log_error_errno(r, "Failed to query for password: %m");
1134                                 return PAM_CONV_ERR;
1135                         }
1136
1137                         responses[i].resp = strdup(ASSERT_PTR(acquired[0]));
1138                         if (!responses[i].resp) {
1139                                 log_oom();
1140                                 return PAM_BUF_ERR;
1141                         }
1142                         break;
1143                 }
1144
1145                 case PAM_ERROR_MSG:
1146                         log_error("PAM: %s", mi->msg);
1147                         break;
1148
1149                 case PAM_TEXT_INFO:
1150                         log_info("PAM: %s", mi->msg);
1151                         break;
1152
1153                 default:
1154                         return PAM_CONV_ERR;
1155                 }
1156         }
1157
1158         *ret = TAKE_PTR(responses);
1159         n = 0;
1160
1161         return PAM_SUCCESS;
1162 }
1163
1164 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1165         int r, s;
1166
1167         assert(handle);
1168
1169         r = pam_close_session(handle, flags);
1170         if (r != PAM_SUCCESS)
1171                 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1172
1173         s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1174         if (s != PAM_SUCCESS)
1175                 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1176
1177         return r != PAM_SUCCESS ? r : s;
1178 }
1179 #endif
1180
1181 static int attach_to_subcgroup(
1182                 const ExecContext *context,
1183                 const CGroupContext *cgroup_context,
1184                 const ExecParameters *params,
1185                 const char *prefix) {
1186
1187         _cleanup_free_ char *subgroup = NULL;
1188         int r;
1189
1190         assert(context);
1191         assert(cgroup_context);
1192         assert(params);
1193
1194         /* If we're a control process that needs a subgroup, we've already been spawned into it as otherwise
1195          * we'd violate the "no inner processes" rule, so no need to do anything. */
1196         if (exec_params_needs_control_subcgroup(params))
1197                 return 0;
1198
1199         r = exec_params_get_cgroup_path(params, cgroup_context, prefix, &subgroup);
1200         if (r < 0)
1201                 return log_error_errno(r, "Failed to acquire cgroup path: %m");
1202         /* No subgroup required? Then there's nothing to do. */
1203         if (r == 0)
1204                 return 0;
1205
1206         r = cg_attach(subgroup, 0);
1207         if (r == -EUCLEAN)
1208                 return log_error_errno(r,
1209                                 "Failed to attach process " PID_FMT " to cgroup '%s', "
1210                                 "because the cgroup or one of its parents or "
1211                                 "siblings is in the threaded mode.",
1212                                 getpid_cached(), subgroup);
1213         if (r < 0)
1214                 return log_error_errno(r,
1215                                 "Failed to attach process " PID_FMT " to cgroup %s: %m",
1216                                 getpid_cached(), subgroup);
1217
1218         return 0;
1219 }
1220
1221 static int setup_pam(
1222                 const ExecContext *context,
1223                 const CGroupContext *cgroup_context,
1224                 ExecParameters *params,
1225                 const char *user,
1226                 uid_t uid,
1227                 gid_t gid,
1228                 char ***env, /* updated on success */
1229                 const int fds[], size_t n_fds,
1230                 bool needs_sandboxing,
1231                 int exec_fd) {
1232
1233 #if HAVE_PAM
1234         AskPasswordConvData conv_data = {
1235                 .context = context,
1236                 .params = params,
1237         };
1238
1239         const struct pam_conv conv = {
1240                 .conv = ask_password_conv,
1241                 .appdata_ptr = &conv_data,
1242         };
1243
1244         _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1245         _cleanup_strv_free_ char **e = NULL;
1246         _cleanup_free_ char *tty = NULL;
1247         pam_handle_t *handle = NULL;
1248         sigset_t old_ss;
1249         int pam_code = PAM_SUCCESS, r;
1250         bool close_session = false;
1251         pid_t parent_pid;
1252         int flags = 0;
1253
1254         assert(context);
1255         assert(params);
1256         assert(user);
1257         assert(uid_is_valid(uid));
1258         assert(gid_is_valid(gid));
1259         assert(fds || n_fds == 0);
1260         assert(env);
1261
1262         /* We set up PAM in the parent process, then fork. The child
1263          * will then stay around until killed via PR_GET_PDEATHSIG or
1264          * systemd via the cgroup logic. It will then remove the PAM
1265          * session again. The parent process will exec() the actual
1266          * daemon. We do things this way to ensure that the main PID
1267          * of the daemon is the one we initially fork()ed. */
1268
1269         r = barrier_create(&barrier);
1270         if (r < 0)
1271                 goto fail;
1272
1273         if (log_get_max_level() < LOG_DEBUG)
1274                 flags |= PAM_SILENT;
1275
1276         pam_code = pam_start(context->pam_name, user, &conv, &handle);
1277         if (pam_code != PAM_SUCCESS) {
1278                 handle = NULL;
1279                 goto fail;
1280         }
1281
1282         if (getttyname_malloc(STDIN_FILENO, &tty) >= 0) {
1283                 _cleanup_free_ char *q = path_join("/dev", tty);
1284                 if (!q) {
1285                         r = -ENOMEM;
1286                         goto fail;
1287                 }
1288
1289                 free_and_replace(tty, q);
1290         }
1291
1292         if (tty) {
1293                 pam_code = pam_set_item(handle, PAM_TTY, tty);
1294                 if (pam_code != PAM_SUCCESS)
1295                         goto fail;
1296         }
1297
1298         STRV_FOREACH(nv, *env) {
1299                 pam_code = pam_putenv(handle, *nv);
1300                 if (pam_code != PAM_SUCCESS)
1301                         goto fail;
1302         }
1303
1304         pam_code = pam_acct_mgmt(handle, flags);
1305         if (pam_code != PAM_SUCCESS)
1306                 goto fail;
1307
1308         pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1309         if (pam_code != PAM_SUCCESS)
1310                 log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));
1311
1312         pam_code = pam_open_session(handle, flags);
1313         if (pam_code != PAM_SUCCESS)
1314                 goto fail;
1315
1316         close_session = true;
1317
1318         e = pam_getenvlist(handle);
1319         if (!e) {
1320                 pam_code = PAM_BUF_ERR;
1321                 goto fail;
1322         }
1323
1324         /* Block SIGTERM, so that we know that it won't get lost in the child */
1325
1326         assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);
1327
1328         parent_pid = getpid_cached();
1329
1330         r = safe_fork("(sd-pam)", 0, NULL);
1331         if (r < 0)
1332                 goto fail;
1333         if (r == 0) {
1334                 int ret = EXIT_PAM;
1335
1336                 if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
1337                         /* Move PAM process into subgroup immediately if the main process hasn't been moved
1338                          * into the subgroup yet (when cgroup namespacing is enabled) and a subgroup is
1339                          * configured. */
1340                         r = attach_to_subcgroup(context, cgroup_context, params, params->cgroup_path);
1341                         if (r < 0)
1342                                 return r;
1343                 }
1344
1345                 /* The child's job is to reset the PAM session on termination */
1346                 barrier_set_role(&barrier, BARRIER_CHILD);
1347
1348                 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1349                  * those fds are open here that have been opened by PAM. */
1350                 (void) close_many(fds, n_fds);
1351
1352                 /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
1353                  * by the execve() to wait for completion, and if we'd keep the fd open here in the child
1354                  * we'd never signal completion. */
1355                 exec_fd = safe_close(exec_fd);
1356
1357                 /* Drop privileges - we don't need any to pam_close_session and this will make
1358                  * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
1359                  * threads to fail to exit normally */
1360
1361                 r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
1362                 if (r < 0)
1363                         log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");
1364
1365                 (void) ignore_signals(SIGPIPE);
1366
1367                 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1368                  * otherwise the kernel will not allow unprivileged parents kill their privileged children
1369                  * this way. We rely on the control groups kill logic to do the rest for us. */
1370                 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1371                         goto child_finish;
1372
1373                 /* Tell the parent that our setup is done. This is especially important regarding dropping
1374                  * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1375                  *
1376                  * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1377                 (void) barrier_place(&barrier);
1378
1379                 /* Check if our parent process might already have died? */
1380                 if (getppid() == parent_pid) {
1381                         sigset_t ss;
1382                         int sig;
1383
1384                         assert_se(sigemptyset(&ss) >= 0);
1385                         assert_se(sigaddset(&ss, SIGTERM) >= 0);
1386
1387                         assert_se(sigwait(&ss, &sig) == 0);
1388                         assert(sig == SIGTERM);
1389                 }
1390
1391                 /* If our parent died we'll end the session */
1392                 if (getppid() != parent_pid) {
1393                         pam_code = pam_close_session_and_delete_credentials(handle, flags);
1394                         if (pam_code != PAM_SUCCESS)
1395                                 goto child_finish;
1396                 }
1397
1398                 ret = 0;
1399
1400         child_finish:
1401                 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1402                  * know about this. See pam_end(3) */
1403                 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1404                 _exit(ret);
1405         }
1406
1407         barrier_set_role(&barrier, BARRIER_PARENT);
1408
1409         /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1410          * here. */
1411         handle = NULL;
1412
1413         /* Unblock SIGTERM again in the parent */
1414         assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1415
1416         /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1417          * this fd around. */
1418         closelog();
1419
1420         /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1421          * recover. However, warn loudly if it happens. */
1422         if (!barrier_place_and_sync(&barrier))
1423                 log_error("PAM initialization failed");
1424
1425         return strv_free_and_replace(*env, e);
1426
1427 fail:
1428         if (pam_code != PAM_SUCCESS) {
1429                 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1430                 r = -EPERM;  /* PAM errors do not map to errno */
1431         } else
1432                 log_error_errno(r, "PAM failed: %m");
1433
1434         if (handle) {
1435                 if (close_session)
1436                         pam_code = pam_close_session_and_delete_credentials(handle, flags);
1437
1438                 (void) pam_end(handle, pam_code | flags);
1439         }
1440
1441         closelog();
1442         return r;
1443 #else
1444         return 0;
1445 #endif
1446 }
1447
1448 static void rename_process_from_path(const char *path) {
1449         _cleanup_free_ char *buf = NULL;
1450         const char *p;
1451
1452         assert(path);
1453
1454         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1455          * /bin/ps */
1456
1457         if (path_extract_filename(path, &buf) < 0) {
1458                 rename_process("(...)");
1459                 return;
1460         }
1461
1462         size_t l = strlen(buf);
1463         if (l > 8) {
1464                 /* The end of the process name is usually more interesting, since the first bit might just be
1465                  * "systemd-" */
1466                 p = buf + l - 8;
1467                 l = 8;
1468         } else
1469                 p = buf;
1470
1471         char process_name[11];
1472         process_name[0] = '(';
1473         memcpy(process_name+1, p, l);
1474         process_name[1+l] = ')';
1475         process_name[1+l+1] = 0;
1476
1477         (void) rename_process(process_name);
1478 }
1479
1480 static bool context_has_address_families(const ExecContext *c) {
1481         assert(c);
1482
1483         return c->address_families_allow_list ||
1484                 !set_isempty(c->address_families);
1485 }
1486
1487 static bool context_has_syscall_filters(const ExecContext *c) {
1488         assert(c);
1489
1490         return c->syscall_allow_list ||
1491                 !hashmap_isempty(c->syscall_filter);
1492 }
1493
1494 static bool context_has_syscall_logs(const ExecContext *c) {
1495         assert(c);
1496
1497         return c->syscall_log_allow_list ||
1498                 !hashmap_isempty(c->syscall_log);
1499 }
1500
1501 static bool context_has_seccomp(const ExecContext *c) {
1502         assert(c);
1503
1504         /* We need NNP if we have any form of seccomp and are unprivileged */
1505         return c->lock_personality ||
1506                 c->memory_deny_write_execute ||
1507                 c->private_devices ||
1508                 c->protect_clock ||
1509                 c->protect_hostname == PROTECT_HOSTNAME_YES ||
1510                 c->protect_kernel_tunables ||
1511                 c->protect_kernel_modules ||
1512                 c->protect_kernel_logs ||
1513                 context_has_address_families(c) ||
1514                 exec_context_restrict_namespaces_set(c) ||
1515                 c->restrict_realtime ||
1516                 c->restrict_suid_sgid ||
1517                 !set_isempty(c->syscall_archs) ||
1518                 context_has_syscall_filters(c) ||
1519                 context_has_syscall_logs(c);
1520 }
1521
1522 static bool context_has_no_new_privileges(const ExecContext *c) {
1523         assert(c);
1524
1525         if (c->no_new_privileges)
1526                 return true;
1527
1528         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1529                 return false;
1530
1531         return context_has_seccomp(c);
1532 }
1533
1534 #if HAVE_SECCOMP
1535
1536 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1537         void *id, *val;
1538         bool have_capget = false, have_capset = false, have_prctl = false;
1539
1540         assert(c);
1541
1542         /* No syscall filter, we are allowed to drop privileges */
1543         if (hashmap_isempty(c->syscall_filter))
1544                 return true;
1545
1546         HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1547                 _cleanup_free_ char *name = NULL;
1548
1549                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1550
1551                 if (streq(name, "capget"))
1552                         have_capget = true;
1553                 else if (streq(name, "capset"))
1554                         have_capset = true;
1555                 else if (streq(name, "prctl"))
1556                         have_prctl = true;
1557         }
1558
1559         if (c->syscall_allow_list)
1560                 return have_capget && have_capset && have_prctl;
1561         else
1562                 return !(have_capget || have_capset || have_prctl);
1563 }
1564
/* Returns true (after logging at debug level which setting, named by 'msg', is being skipped) if
 * is_seccomp_available() reports that seccomp cannot be used; returns false otherwise. */
static bool skip_seccomp_unavailable(const char *msg) {
        assert(msg);

        bool unavailable = !is_seccomp_available();
        if (unavailable)
                log_debug("SECCOMP features not detected in the kernel, skipping %s", msg);

        return unavailable;
}
1574
1575 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p) {
1576         uint32_t negative_action, default_action, action;
1577         int r;
1578
1579         assert(c);
1580         assert(p);
1581
1582         if (!context_has_syscall_filters(c))
1583                 return 0;
1584
1585         if (skip_seccomp_unavailable("SystemCallFilter="))
1586                 return 0;
1587
1588         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1589
1590         if (c->syscall_allow_list) {
1591                 default_action = negative_action;
1592                 action = SCMP_ACT_ALLOW;
1593         } else {
1594                 default_action = SCMP_ACT_ALLOW;
1595                 action = negative_action;
1596         }
1597
1598         /* Sending over exec_fd or handoff_timestamp_fd requires write() syscall. */
1599         if (p->exec_fd >= 0 || p->handoff_timestamp_fd >= 0) {
1600                 r = seccomp_filter_set_add_by_name(c->syscall_filter, c->syscall_allow_list, "write");
1601                 if (r < 0)
1602                         return r;
1603         }
1604
1605         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1606 }
1607
1608 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1609 #ifdef SCMP_ACT_LOG
1610         uint32_t default_action, action;
1611 #endif
1612
1613         assert(c);
1614         assert(p);
1615
1616         if (!context_has_syscall_logs(c))
1617                 return 0;
1618
1619 #ifdef SCMP_ACT_LOG
1620         if (skip_seccomp_unavailable("SystemCallLog="))
1621                 return 0;
1622
1623         if (c->syscall_log_allow_list) {
1624                 /* Log nothing but the ones listed */
1625                 default_action = SCMP_ACT_ALLOW;
1626                 action = SCMP_ACT_LOG;
1627         } else {
1628                 /* Log everything but the ones listed */
1629                 default_action = SCMP_ACT_LOG;
1630                 action = SCMP_ACT_ALLOW;
1631         }
1632
1633         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1634 #else
1635         /* old libseccomp */
1636         log_debug( "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1637         return 0;
1638 #endif
1639 }
1640
1641 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1642         assert(c);
1643         assert(p);
1644
1645         if (set_isempty(c->syscall_archs))
1646                 return 0;
1647
1648         if (skip_seccomp_unavailable("SystemCallArchitectures="))
1649                 return 0;
1650
1651         return seccomp_restrict_archs(c->syscall_archs);
1652 }
1653
1654 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1655         assert(c);
1656         assert(p);
1657
1658         if (!context_has_address_families(c))
1659                 return 0;
1660
1661         if (skip_seccomp_unavailable("RestrictAddressFamilies="))
1662                 return 0;
1663
1664         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1665 }
1666
1667 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1668         int r;
1669
1670         assert(c);
1671         assert(p);
1672
1673         if (!c->memory_deny_write_execute)
1674                 return 0;
1675
1676         /* use prctl() if kernel supports it (6.3) */
1677         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1678         if (r == 0) {
1679                 log_debug("Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1680                 return 0;
1681         }
1682         if (r < 0 && errno != EINVAL)
1683                 return log_debug_errno(errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1684         /* else use seccomp */
1685         log_debug("Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1686
1687         if (skip_seccomp_unavailable("MemoryDenyWriteExecute="))
1688                 return 0;
1689
1690         return seccomp_memory_deny_write_execute();
1691 }
1692
1693 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1694         assert(c);
1695         assert(p);
1696
1697         if (!c->restrict_realtime)
1698                 return 0;
1699
1700         if (skip_seccomp_unavailable("RestrictRealtime="))
1701                 return 0;
1702
1703         return seccomp_restrict_realtime();
1704 }
1705
1706 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1707         assert(c);
1708         assert(p);
1709
1710         if (!c->restrict_suid_sgid)
1711                 return 0;
1712
1713         if (skip_seccomp_unavailable("RestrictSUIDSGID="))
1714                 return 0;
1715
1716         return seccomp_restrict_suid_sgid();
1717 }
1718
1719 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1720         assert(c);
1721         assert(p);
1722
1723         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1724          * let's protect even those systems where this is left on in the kernel. */
1725
1726         if (!c->protect_kernel_tunables)
1727                 return 0;
1728
1729         if (skip_seccomp_unavailable("ProtectKernelTunables="))
1730                 return 0;
1731
1732         return seccomp_protect_sysctl();
1733 }
1734
1735 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1736         assert(c);
1737         assert(p);
1738
1739         /* Turn off module syscalls on ProtectKernelModules=yes */
1740
1741         if (!c->protect_kernel_modules)
1742                 return 0;
1743
1744         if (skip_seccomp_unavailable("ProtectKernelModules="))
1745                 return 0;
1746
1747         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1748 }
1749
1750 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1751         assert(c);
1752         assert(p);
1753
1754         if (!c->protect_kernel_logs)
1755                 return 0;
1756
1757         if (skip_seccomp_unavailable("ProtectKernelLogs="))
1758                 return 0;
1759
1760         return seccomp_protect_syslog();
1761 }
1762
1763 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1764         assert(c);
1765         assert(p);
1766
1767         if (!c->protect_clock)
1768                 return 0;
1769
1770         if (skip_seccomp_unavailable("ProtectClock="))
1771                 return 0;
1772
1773         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1774 }
1775
1776 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1777         assert(c);
1778         assert(p);
1779
1780         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1781
1782         if (!c->private_devices)
1783                 return 0;
1784
1785         if (skip_seccomp_unavailable("PrivateDevices="))
1786                 return 0;
1787
1788         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1789 }
1790
1791 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1792         assert(c);
1793         assert(p);
1794
1795         if (!exec_context_restrict_namespaces_set(c))
1796                 return 0;
1797
1798         if (skip_seccomp_unavailable("RestrictNamespaces="))
1799                 return 0;
1800
1801         return seccomp_restrict_namespaces(c->restrict_namespaces);
1802 }
1803
1804 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1805         unsigned long personality;
1806         int r;
1807
1808         assert(c);
1809         assert(p);
1810
1811         if (!c->lock_personality)
1812                 return 0;
1813
1814         if (skip_seccomp_unavailable("LockPersonality="))
1815                 return 0;
1816
1817         personality = c->personality;
1818
1819         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1820         if (personality == PERSONALITY_INVALID) {
1821
1822                 r = opinionated_personality(&personality);
1823                 if (r < 0)
1824                         return r;
1825         }
1826
1827         return seccomp_lock_personality(personality);
1828 }
1829
1830 #endif
1831
1832 #if HAVE_LIBBPF
1833 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1834         int r;
1835
1836         assert(c);
1837         assert(p);
1838
1839         if (!exec_context_restrict_filesystems_set(c))
1840                 return 0;
1841
1842         if (p->bpf_restrict_fs_map_fd < 0) {
1843                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1844                 log_debug("LSM BPF not supported, skipping RestrictFileSystems=");
1845                 return 0;
1846         }
1847
1848         /* We are in a new binary, so dl-open again */
1849         r = dlopen_bpf();
1850         if (r < 0)
1851                 return r;
1852
1853         return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1854 }
1855 #endif
1856
1857 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1858         int r;
1859
1860         assert(c);
1861         assert(p);
1862         assert(ret_exit_status);
1863
1864         if (c->protect_hostname == PROTECT_HOSTNAME_NO)
1865                 return 0;
1866
1867         if (namespace_type_supported(NAMESPACE_UTS)) {
1868                 if (unshare(CLONE_NEWUTS) < 0) {
1869                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1870                                 *ret_exit_status = EXIT_NAMESPACE;
1871                                 return log_error_errno(errno, "Failed to set up UTS namespacing: %m");
1872                         }
1873
1874                         log_warning("ProtectHostname=%s is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.",
1875                                     protect_hostname_to_string(c->protect_hostname));
1876
1877                 } else if (c->private_hostname) {
1878                         r = sethostname_idempotent(c->private_hostname);
1879                         if (r < 0) {
1880                                 *ret_exit_status = EXIT_NAMESPACE;
1881                                 return log_error_errno(r, "Failed to set private hostname '%s': %m", c->private_hostname);
1882                         }
1883                 }
1884         } else
1885                 log_warning("ProtectHostname=%s is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.",
1886                             protect_hostname_to_string(c->protect_hostname));
1887
1888 #if HAVE_SECCOMP
1889         if (c->protect_hostname == PROTECT_HOSTNAME_YES) {
1890                 if (skip_seccomp_unavailable("ProtectHostname="))
1891                         return 0;
1892
1893                 r = seccomp_protect_hostname();
1894                 if (r < 0) {
1895                         *ret_exit_status = EXIT_SECCOMP;
1896                         return log_error_errno(r, "Failed to apply hostname restrictions: %m");
1897                 }
1898         }
1899 #endif
1900
1901         return 1;
1902 }
1903
1904 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1905         assert(idle_pipe);
1906
1907         idle_pipe[1] = safe_close(idle_pipe[1]);
1908         idle_pipe[2] = safe_close(idle_pipe[2]);
1909
1910         if (idle_pipe[0] >= 0) {
1911                 int r;
1912
1913                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1914
1915                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1916                         ssize_t n;
1917
1918                         /* Signal systemd that we are bored and want to continue. */
1919                         n = write(idle_pipe[3], "x", 1);
1920                         if (n > 0)
1921                                 /* Wait for systemd to react to the signal above. */
1922                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1923                 }
1924
1925                 idle_pipe[0] = safe_close(idle_pipe[0]);
1926
1927         }
1928
1929         idle_pipe[3] = safe_close(idle_pipe[3]);
1930 }
1931
1932 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1933
1934 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
1935  * the service payload in. */
1936 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1937         [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
1938         [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
1939         [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
1940         [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
1941         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
1942 };
1943
1944 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1945
1946 static int build_environment(
1947                 const ExecContext *c,
1948                 const ExecParameters *p,
1949                 const CGroupContext *cgroup_context,
1950                 size_t n_fds,
1951                 const char *home,
1952                 const char *username,
1953                 const char *shell,
1954                 dev_t journal_stream_dev,
1955                 ino_t journal_stream_ino,
1956                 const char *memory_pressure_path,
1957                 bool needs_sandboxing,
1958                 char ***ret) {
1959
1960         _cleanup_strv_free_ char **our_env = NULL;
1961         size_t n_env = 0;
1962         char *x;
1963         int r;
1964
1965         assert(c);
1966         assert(p);
1967         assert(cgroup_context);
1968         assert(ret);
1969
1970 #define N_ENV_VARS 19
1971         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX + 1);
1972         if (!our_env)
1973                 return -ENOMEM;
1974
1975         if (n_fds > 0) {
1976                 _cleanup_free_ char *joined = NULL;
1977
1978                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1979                         return -ENOMEM;
1980                 our_env[n_env++] = x;
1981
1982                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1983                         return -ENOMEM;
1984                 our_env[n_env++] = x;
1985
1986                 joined = strv_join(p->fd_names, ":");
1987                 if (!joined)
1988                         return -ENOMEM;
1989
1990                 x = strjoin("LISTEN_FDNAMES=", joined);
1991                 if (!x)
1992                         return -ENOMEM;
1993                 our_env[n_env++] = x;
1994         }
1995
1996         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1997                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1998                         return -ENOMEM;
1999                 our_env[n_env++] = x;
2000
2001                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
2002                         return -ENOMEM;
2003                 our_env[n_env++] = x;
2004         }
2005
2006         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
2007          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
2008          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
2009         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
2010                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
2011                 if (!x)
2012                         return -ENOMEM;
2013                 our_env[n_env++] = x;
2014         }
2015
2016         /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
2017          * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
2018          * really make much sense since we're not logged in. Hence we conditionalize the three based on
2019          * SetLoginEnvironment= switch. */
2020         if (!username && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2021                 assert(!c->user);
2022
2023                 r = get_fixed_user("root", /* prefer_nss = */ false, &username, NULL, NULL, &home, &shell);
2024                 if (r < 0)
2025                         return log_debug_errno(r, "Failed to determine user credentials for root: %m");
2026         }
2027
2028         bool set_user_login_env = exec_context_get_set_login_environment(c);
2029
2030         if (username) {
2031                 x = strjoin("USER=", username);
2032                 if (!x)
2033                         return -ENOMEM;
2034                 our_env[n_env++] = x;
2035
2036                 if (set_user_login_env) {
2037                         x = strjoin("LOGNAME=", username);
2038                         if (!x)
2039                                 return -ENOMEM;
2040                         our_env[n_env++] = x;
2041                 }
2042         }
2043
2044         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
2045          * (i.e. are "/" or "/bin/nologin"). */
2046
2047         if (home && set_user_login_env && !empty_or_root(home)) {
2048                 x = strjoin("HOME=", home);
2049                 if (!x)
2050                         return -ENOMEM;
2051
2052                 path_simplify(x + 5);
2053                 our_env[n_env++] = x;
2054         }
2055
2056         if (shell && set_user_login_env && !shell_is_placeholder(shell)) {
2057                 x = strjoin("SHELL=", shell);
2058                 if (!x)
2059                         return -ENOMEM;
2060
2061                 path_simplify(x + 6);
2062                 our_env[n_env++] = x;
2063         }
2064
2065         if (!sd_id128_is_null(p->invocation_id)) {
2066                 assert(p->invocation_id_string);
2067
2068                 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
2069                 if (!x)
2070                         return -ENOMEM;
2071
2072                 our_env[n_env++] = x;
2073         }
2074
2075         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2076                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2077                         return -ENOMEM;
2078
2079                 our_env[n_env++] = x;
2080         }
2081
2082         if (c->log_namespace) {
2083                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2084                 if (!x)
2085                         return -ENOMEM;
2086
2087                 our_env[n_env++] = x;
2088         }
2089
2090         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2091                 _cleanup_free_ char *joined = NULL;
2092                 const char *n;
2093
2094                 if (!p->prefix[t])
2095                         continue;
2096
2097                 if (c->directories[t].n_items == 0)
2098                         continue;
2099
2100                 n = exec_directory_env_name_to_string(t);
2101                 if (!n)
2102                         continue;
2103
2104                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2105                         _cleanup_free_ char *prefixed = NULL;
2106
2107                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2108                         if (!prefixed)
2109                                 return -ENOMEM;
2110
2111                         if (!strextend_with_separator(&joined, ":", prefixed))
2112                                 return -ENOMEM;
2113                 }
2114
2115                 x = strjoin(n, "=", joined);
2116                 if (!x)
2117                         return -ENOMEM;
2118
2119                 our_env[n_env++] = x;
2120         }
2121
2122         _cleanup_free_ char *creds_dir = NULL;
2123         r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2124         if (r < 0)
2125                 return r;
2126         if (r > 0) {
2127                 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2128                 if (!x)
2129                         return -ENOMEM;
2130
2131                 our_env[n_env++] = x;
2132         }
2133
2134         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2135                 return -ENOMEM;
2136
2137         our_env[n_env++] = x;
2138
2139         if (memory_pressure_path) {
2140                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2141                 if (!x)
2142                         return -ENOMEM;
2143
2144                 our_env[n_env++] = x;
2145
2146                 if (!path_equal(memory_pressure_path, "/dev/null")) {
2147                         _cleanup_free_ char *b = NULL, *e = NULL;
2148
2149                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2150                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2151                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2152                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2153                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2154                                 return -ENOMEM;
2155
2156                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2157                                 return -ENOMEM;
2158
2159                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2160                         if (!x)
2161                                 return -ENOMEM;
2162
2163                         our_env[n_env++] = x;
2164                 }
2165         }
2166
2167         if (p->notify_socket) {
2168                 x = strjoin("NOTIFY_SOCKET=", exec_get_private_notify_socket_path(c, p, needs_sandboxing) ?: p->notify_socket);
2169                 if (!x)
2170                         return -ENOMEM;
2171
2172                 our_env[n_env++] = x;
2173         }
2174
2175         assert(c->private_var_tmp >= 0 && c->private_var_tmp < _PRIVATE_TMP_MAX);
2176         if (needs_sandboxing && c->private_tmp != c->private_var_tmp) {
2177                 assert(c->private_tmp == PRIVATE_TMP_DISCONNECTED);
2178                 assert(c->private_var_tmp == PRIVATE_TMP_NO);
2179
2180                 /* When private tmpfs is enabled only on /tmp/, then explicitly set $TMPDIR to suggest the
2181                  * service to use /tmp/. */
2182
2183                 x = strdup("TMPDIR=/tmp");
2184                 if (!x)
2185                         return -ENOMEM;
2186
2187                 our_env[n_env++] = x;
2188         }
2189
2190         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2191 #undef N_ENV_VARS
2192
2193         *ret = TAKE_PTR(our_env);
2194
2195         return 0;
2196 }
2197
2198 static int build_pass_environment(const ExecContext *c, char ***ret) {
2199         _cleanup_strv_free_ char **pass_env = NULL;
2200         size_t n_env = 0;
2201
2202         assert(c);
2203         assert(ret);
2204
2205         STRV_FOREACH(i, c->pass_environment) {
2206                 _cleanup_free_ char *x = NULL;
2207                 char *v;
2208
2209                 v = getenv(*i);
2210                 if (!v)
2211                         continue;
2212                 x = strjoin(*i, "=", v);
2213                 if (!x)
2214                         return -ENOMEM;
2215
2216                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2217                         return -ENOMEM;
2218
2219                 pass_env[n_env++] = TAKE_PTR(x);
2220                 pass_env[n_env] = NULL;
2221         }
2222
2223         *ret = TAKE_PTR(pass_env);
2224         return 0;
2225 }
2226
2227 static int setup_private_users(PrivateUsers private_users, uid_t ouid, gid_t ogid, uid_t uid, gid_t gid, bool allow_setgroups) {
2228         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2229         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2230         _cleanup_close_ int unshare_ready_fd = -EBADF;
2231         _cleanup_(sigkill_waitp) pid_t pid = 0;
2232         uint64_t c = 1;
2233         ssize_t n;
2234         int r;
2235
2236         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2237          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2238          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2239          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2240          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2241          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2242          * continues execution normally.
2243          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2244          * does not need CAP_SETUID to write the single line mapping to itself. */
2245
2246         if (private_users == PRIVATE_USERS_NO)
2247                 return 0;
2248
2249         if (private_users == PRIVATE_USERS_IDENTITY) {
2250                 uid_map = strdup("0 0 65536\n");
2251                 if (!uid_map)
2252                         return -ENOMEM;
2253         } else if (private_users == PRIVATE_USERS_FULL) {
2254                 /* Map all UID/GID from original to new user namespace. We can't use `0 0 UINT32_MAX` because
2255                  * this is the same UID/GID map as the init user namespace and systemd's running_in_userns()
2256                  * checks whether its in a user namespace by comparing uid_map/gid_map to `0 0 UINT32_MAX`.
2257                  * Thus, we still map all UIDs/GIDs but do it using two extents to differentiate the new user
2258                  * namespace from the init namespace:
2259                  *   0 0 1
2260                  *   1 1 UINT32_MAX - 1
2261                  *
2262                  * systemd will remove the heuristic in running_in_userns() and use namespace inodes in version 258
2263                  * (PR #35382). But some users may be running a container image with older systemd < 258 so we keep
2264                  * this uid_map/gid_map hack until version 259 for version N-1 compatibility.
2265                  *
2266                  * TODO: Switch to `0 0 UINT32_MAX` in systemd v259.
2267                  *
2268                  * Note the kernel defines the UID range between 0 and UINT32_MAX so we map all UIDs even though
2269                  * the UID range beyond INT32_MAX (e.g. i.e. the range above the signed 32-bit range) is
2270                  * icky. For example, setfsuid() returns the old UID as signed integer. But units can decide to
2271                  * use these UIDs/GIDs so we need to map them. */
2272                 r = asprintf(&uid_map, "0 0 1\n"
2273                                        "1 1 " UID_FMT "\n", (uid_t) (UINT32_MAX - 1));
2274                 if (r < 0)
2275                         return -ENOMEM;
2276         /* Can only set up multiple mappings with CAP_SETUID. */
2277         } else if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid)) {
2278                 r = asprintf(&uid_map,
2279                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2280                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2281                              ouid, ouid, uid, uid);
2282                 if (r < 0)
2283                         return -ENOMEM;
2284         } else {
2285                 r = asprintf(&uid_map,
2286                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2287                              ouid, ouid);
2288                 if (r < 0)
2289                         return -ENOMEM;
2290         }
2291
2292         if (private_users == PRIVATE_USERS_IDENTITY) {
2293                 gid_map = strdup("0 0 65536\n");
2294                 if (!gid_map)
2295                         return -ENOMEM;
2296         } else if (private_users == PRIVATE_USERS_FULL) {
2297                 r = asprintf(&gid_map, "0 0 1\n"
2298                                        "1 1 " GID_FMT "\n", (gid_t) (UINT32_MAX - 1));
2299                 if (r < 0)
2300                         return -ENOMEM;
2301         /* Can only set up multiple mappings with CAP_SETGID. */
2302         } else if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid)) {
2303                 r = asprintf(&gid_map,
2304                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2305                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2306                              ogid, ogid, gid, gid);
2307                 if (r < 0)
2308                         return -ENOMEM;
2309         } else {
2310                 r = asprintf(&gid_map,
2311                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2312                              ogid, ogid);
2313                 if (r < 0)
2314                         return -ENOMEM;
2315         }
2316
2317         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2318          * namespace. */
2319         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2320         if (unshare_ready_fd < 0)
2321                 return -errno;
2322
2323         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2324          * failed. */
2325         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2326                 return -errno;
2327
2328         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2329         if (r < 0)
2330                 return r;
2331         if (r == 0) {
2332                 _cleanup_close_ int fd = -EBADF;
2333                 const char *a;
2334                 pid_t ppid;
2335
2336                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2337                  * here, after the parent opened its own user namespace. */
2338
2339                 ppid = getppid();
2340                 errno_pipe[0] = safe_close(errno_pipe[0]);
2341
2342                 /* Wait until the parent unshared the user namespace */
2343                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0)
2344                         report_errno_and_exit(errno_pipe[1], -errno);
2345
2346                 /* Disable the setgroups() system call in the child user namespace, for good, unless PrivateUsers=full
2347                  * and using the system service manager. */
2348                 a = procfs_file_alloca(ppid, "setgroups");
2349                 fd = open(a, O_WRONLY|O_CLOEXEC);
2350                 if (fd < 0) {
2351                         if (errno != ENOENT) {
2352                                 r = log_debug_errno(errno, "Failed to open %s: %m", a);
2353                                 report_errno_and_exit(errno_pipe[1], r);
2354                         }
2355
2356                         /* If the file is missing the kernel is too old, let's continue anyway. */
2357                 } else {
2358                         const char *setgroups = allow_setgroups ? "allow\n" : "deny\n";
2359                         if (write(fd, setgroups, strlen(setgroups)) < 0) {
2360                                 r = log_debug_errno(errno, "Failed to write '%s' to %s: %m", setgroups, a);
2361                                 report_errno_and_exit(errno_pipe[1], r);
2362                         }
2363
2364                         fd = safe_close(fd);
2365                 }
2366
2367                 /* First write the GID map */
2368                 a = procfs_file_alloca(ppid, "gid_map");
2369                 fd = open(a, O_WRONLY|O_CLOEXEC);
2370                 if (fd < 0) {
2371                         r = log_debug_errno(errno, "Failed to open %s: %m", a);
2372                         report_errno_and_exit(errno_pipe[1], r);
2373                 }
2374
2375                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2376                         r = log_debug_errno(errno, "Failed to write GID map to %s: %m", a);
2377                         report_errno_and_exit(errno_pipe[1], r);
2378                 }
2379
2380                 fd = safe_close(fd);
2381
2382                 /* The write the UID map */
2383                 a = procfs_file_alloca(ppid, "uid_map");
2384                 fd = open(a, O_WRONLY|O_CLOEXEC);
2385                 if (fd < 0) {
2386                         r = log_debug_errno(errno, "Failed to open %s: %m", a);
2387                         report_errno_and_exit(errno_pipe[1], r);
2388                 }
2389
2390                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2391                         r = log_debug_errno(errno, "Failed to write UID map to %s: %m", a);
2392                         report_errno_and_exit(errno_pipe[1], r);
2393                 }
2394
2395                 _exit(EXIT_SUCCESS);
2396         }
2397
2398         errno_pipe[1] = safe_close(errno_pipe[1]);
2399
2400         if (unshare(CLONE_NEWUSER) < 0)
2401                 return log_debug_errno(errno, "Failed to unshare user namespace: %m");
2402
2403         /* Let the child know that the namespace is ready now */
2404         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2405                 return -errno;
2406
2407         /* Try to read an error code from the child */
2408         n = read(errno_pipe[0], &r, sizeof(r));
2409         if (n < 0)
2410                 return -errno;
2411         if (n == sizeof(r)) { /* an error code was sent to us */
2412                 if (r < 0)
2413                         return r;
2414                 return -EIO;
2415         }
2416         if (n != 0) /* on success we should have read 0 bytes */
2417                 return -EIO;
2418
2419         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2420         if (r < 0)
2421                 return r;
2422         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2423                 return -EIO;
2424
2425         return 1;
2426 }
2427
2428 static int can_mount_proc(void) {
2429         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2430         _cleanup_(sigkill_waitp) pid_t pid = 0;
2431         ssize_t n;
2432         int r;
2433
2434         /* If running via unprivileged user manager and /proc/ is masked (e.g. /proc/kmsg is over-mounted with tmpfs
2435          * like systemd-nspawn does), then mounting /proc/ will fail with EPERM. This is due to a kernel restriction
2436          * where unprivileged user namespaces cannot mount a less restrictive instance of /proc. */
2437
2438         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2439          * failed. */
2440         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2441                 return log_debug_errno(errno, "Failed to create pipe for communicating with child process (sd-proc-check): %m");
2442
2443         /* Fork a child process into its own mount and PID namespace. Note safe_fork() already remounts / as SLAVE
2444          * with FORK_MOUNTNS_SLAVE. */
2445         r = safe_fork("(sd-proc-check)",
2446                       FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_NEW_MOUNTNS|FORK_MOUNTNS_SLAVE|FORK_NEW_PIDNS, &pid);
2447         if (r < 0)
2448                 return log_debug_errno(r, "Failed to fork child process (sd-proc-check): %m");
2449         if (r == 0) {
2450                 errno_pipe[0] = safe_close(errno_pipe[0]);
2451
2452                 /* Try mounting /proc on /dev/shm/. No need to clean up the mount since the mount
2453                  * namespace will be cleaned up once the process exits. */
2454                 r = mount_follow_verbose(LOG_DEBUG, "proc", "/dev/shm/", "proc", MS_NOSUID|MS_NOEXEC|MS_NODEV, NULL);
2455                 if (r < 0) {
2456                         (void) write(errno_pipe[1], &r, sizeof(r));
2457                         _exit(EXIT_FAILURE);
2458                 }
2459
2460                 _exit(EXIT_SUCCESS);
2461         }
2462
2463         errno_pipe[1] = safe_close(errno_pipe[1]);
2464
2465         /* Try to read an error code from the child */
2466         n = read(errno_pipe[0], &r, sizeof(r));
2467         if (n < 0)
2468                 return log_debug_errno(errno, "Failed to read errno from pipe with child process (sd-proc-check): %m");
2469         if (n == sizeof(r)) { /* an error code was sent to us */
2470                 /* This is the expected case where proc cannot be mounted due to permissions. */
2471                 if (ERRNO_IS_NEG_PRIVILEGE(r))
2472                         return 0;
2473                 if (r < 0)
2474                         return r;
2475
2476                 return -EIO;
2477         }
2478         if (n != 0) /* on success we should have read 0 bytes */
2479                 return -EIO;
2480
2481         r = wait_for_terminate_and_check("(sd-proc-check)", TAKE_PID(pid), 0 /* flags= */);
2482         if (r < 0)
2483                 return log_debug_errno(r, "Failed to wait for (sd-proc-check) child process to terminate: %m");
2484         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2485                 return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Child process (sd-proc-check) exited with unexpected exit status '%d'.", r);
2486
2487         return 1;
2488 }
2489
2490 static int setup_private_pids(const ExecContext *c, ExecParameters *p) {
2491         _cleanup_(pidref_done) PidRef pidref = PIDREF_NULL;
2492         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2493         ssize_t n;
2494         int r, q;
2495
2496         assert(c);
2497         assert(p);
2498         assert(p->pidref_transport_fd >= 0);
2499
2500         /* The first process created after unsharing a pid namespace becomes PID 1 in the pid namespace, so
2501          * we have to fork after unsharing the pid namespace to become PID 1. The parent sends the child
2502          * pidref to the manager and exits while the child process continues with the rest of exec_invoke()
2503          * and finally executes the actual payload. */
2504
2505         /* Create a communication channel so that the parent can tell the child a proper error code in case it
2506          * failed to send child pidref to the manager. */
2507         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2508                 return log_debug_errno(errno, "Failed to create pipe for communicating with parent process: %m");
2509
2510         /* Set FORK_DETACH to immediately re-parent the child process to the invoking manager process. */
2511         r = pidref_safe_fork("(sd-pidns-child)", FORK_NEW_PIDNS|FORK_DETACH, &pidref);
2512         if (r < 0)
2513                 return log_debug_errno(r, "Failed to fork child into new pid namespace: %m");
2514         if (r > 0) {
2515                 errno_pipe[0] = safe_close(errno_pipe[0]);
2516
2517                 /* In the parent process, we send the child pidref to the manager and exit.
2518                  * If PIDFD is not supported, only the child PID is sent. The server then
2519                  * uses the child PID to set the new exec main process. */
2520                 q = send_one_fd_iov(
2521                                 p->pidref_transport_fd,
2522                                 pidref.fd,
2523                                 &IOVEC_MAKE(&pidref.pid, sizeof(pidref.pid)),
2524                                 /*iovlen=*/ 1,
2525                                 /*flags=*/ 0);
2526                 /* Send error code to child process. */
2527                 (void) write(errno_pipe[1], &q, sizeof(q));
2528                 /* Exit here so we only go through the destructors in exec_invoke only once - in the child - as
2529                  * some destructors have external effects. The main codepaths continue in the child process. */
2530                 _exit(q < 0 ? EXIT_FAILURE : EXIT_SUCCESS);
2531         }
2532
2533         errno_pipe[1] = safe_close(errno_pipe[1]);
2534         p->pidref_transport_fd = safe_close(p->pidref_transport_fd);
2535
2536         /* Try to read an error code from the parent. Note a child process cannot wait for the parent so we always
2537          * receive an errno even on success. */
2538         n = read(errno_pipe[0], &r, sizeof(r));
2539         if (n < 0)
2540                 return log_debug_errno(errno, "Failed to read errno from pipe with parent process: %m");
2541         if (n != sizeof(r))
2542                 return log_debug_errno(SYNTHETIC_ERRNO(EIO), "Failed to read enough bytes from pipe with parent process");
2543         if (r < 0)
2544                 return log_debug_errno(r, "Failed to send child pidref to manager: %m");
2545
2546         /* NOTE! This function returns in the child process only. */
2547         return r;
2548 }
2549
2550 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2551         _cleanup_free_ char *src_abs = NULL;
2552         int r;
2553
2554         assert(source);
2555
2556         src_abs = path_join(root, source);
2557         if (!src_abs)
2558                 return -ENOMEM;
2559
2560         STRV_FOREACH(dst, symlinks) {
2561                 _cleanup_free_ char *dst_abs = NULL;
2562
2563                 dst_abs = path_join(root, *dst);
2564                 if (!dst_abs)
2565                         return -ENOMEM;
2566
2567                 r = mkdir_parents_label(dst_abs, 0755);
2568                 if (r < 0)
2569                         return r;
2570
2571                 r = symlink_idempotent(src_abs, dst_abs, true);
2572                 if (r < 0)
2573                         return r;
2574         }
2575
2576         return 0;
2577 }
2578
/* Creates and prepares the directories configured for one exec directory type, i.e. the items of
 * context->directories[type], below params->prefix[type].
 *
 * If no prefix is set for the type this is a NOP and 0 is returned. Otherwise, for each item the directory
 * is created (taking the compatibility-symlink, "private/" and migration cases described in the comments
 * below into account), the configured access mode is applied, and - in system scope only - ownership is
 * adjusted. If needs_mount_namespace is false, the symlinks configured for the items are created at the
 * end, too.
 *
 * Returns 0 on success. On failure a negative errno-style value is returned and the exit status that
 * corresponds to the directory type is stored in *exit_status; *exit_status is not written on success. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Maps each directory type to the exit status reported via *exit_status on failure. */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME]       = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE]         = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE]         = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS]          = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix configured for this directory type: nothing to set up. */
        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories, fall back to root for an invalid UID/GID. */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
                /* p is the public path of the directory, pp (only set in the private case below) is its
                 * location below "private/". */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], i->path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {

                        /* If we are in user mode, and a configuration directory exists but a state directory
                         * doesn't exist, then we likely are upgrading from an older systemd version that
                         * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
                         * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
                         * separated. If a service has both dirs configured but only the configuration dir
                         * exists and the state dir does not, we assume we are looking at an update
                         * situation. Hence, create a compatibility symlink, so that all expectations are
                         * met.
                         *
                         * (We also do something similar with the log directory, which still doesn't exist in
                         * the xdg basedir spec. We'll make it a subdir of the state dir.) */

                        /* this assumes the state dir is always created before the configuration dir */
                        assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
                        assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);

                        r = access_nofollow(p, F_OK);
                        if (r == -ENOENT) {
                                _cleanup_free_ char *q = NULL;

                                /* OK, we know that the state dir does not exist. Let's see if the dir exists
                                 * under the configuration hierarchy. */

                                if (type == EXEC_DIRECTORY_STATE)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
                                else if (type == EXEC_DIRECTORY_LOGS)
                                        q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
                                else
                                        assert_not_reached();
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                r = access_nofollow(q, F_OK);
                                if (r >= 0) {
                                        /* It does exist! This hence looks like an update. Symlink the
                                         * configuration directory into the state directory. */

                                        r = symlink_idempotent(q, p, /* make_relative= */ true);
                                        if (r < 0)
                                                goto fail;

                                        log_notice("Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
                                        /* The compatibility symlink is all we set up for this item, skip the
                                         * directory creation and the chmod/chown logic below. */
                                        continue;
                                } else if (r != -ENOENT)
                                        log_warning_errno(r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);

                        } else if (r < 0)
                                log_warning_errno(r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
                }

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the item's directory below the private root. */
                        if (!path_extend(&pp, i->path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (access_nofollow(pp, F_OK) == -ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                r = RET_NERRNO(rename(p, pp));
                                if (r < 0)
                                        goto fail;
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        if (!FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE)) {
                                /* And link it up from the original place.
                                 * Notes
                                 * 1) If a mount namespace is going to be used, then this symlink remains on
                                 *    the host, and a new one for the child namespace will be created later.
                                 * 2) It is not necessary to create this symlink when one of its parent
                                 *    directories is specified and already created. E.g.
                                 *        StateDirectory=foo foo/bar
                                 *    In that case, the inode points to pp and p for "foo/bar" are the same:
                                 *        pp = "/var/lib/private/foo/bar"
                                 *        p = "/var/lib/foo/bar"
                                 *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
                                 *    we do not need to create the symlink, but we cannot create the symlink.
                                 *    See issue #24783. */
                                r = symlink_idempotent(pp, p, true);
                                if (r < 0)
                                        goto fail;
                        }

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type) &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                q = path_join(params->prefix[type], "private", i->path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, so that the directory can take its place. */
                                        r = RET_NERRNO(unlink(p));
                                        if (r < 0)
                                                goto fail;

                                        r = RET_NERRNO(rename(q, p));
                                        if (r < 0)
                                                goto fail;
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(type)) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        r = RET_NERRNO(stat(p, &st));
                                        if (r < 0)
                                                goto fail;

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), i->path,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                const char *target_dir = pp ?: p; /* the private location if there is one, the public one otherwise */
                r = chmod_and_chown(target_dir, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
                 * available to user code anyway */
                if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
                        continue;

                int idmapping_supported = is_idmapping_supported(target_dir);
                if (idmapping_supported < 0) {
                        r = log_debug_errno(idmapping_supported, "Unable to determine if ID mapping is supported on mount '%s': %m", target_dir);
                        goto fail;
                }

                log_debug("ID-mapping is%ssupported for exec directory %s", idmapping_supported ? " " : " not ", target_dir);

                /* Change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                uid_t chown_uid = uid;
                gid_t chown_gid = gid;
                bool do_chown = false;

                /* Decide whether to chown, and to whom. The outcome is also recorded in i->idmapped. */
                if (uid == 0 || gid == 0 || !idmapping_supported) {
                        do_chown = true;
                        i->idmapped = false;
                } else {
                        /* Use 'nobody' uid/gid for exec directories if ID-mapping is supported. For backward compatibility,
                         * continue doing chmod/chown if the directory was chmod/chowned before (if uid/gid is not 'nobody') */
                        struct stat st;
                        r = RET_NERRNO(stat(target_dir, &st));
                        if (r < 0)
                                goto fail;

                        if (st.st_uid == UID_NOBODY && st.st_gid == GID_NOBODY) {
                                /* Already owned by 'nobody': nothing to chown. */
                                do_chown = false;
                                i->idmapped = true;
                       } else if (exec_directory_is_private(context, type) && st.st_uid == 0 && st.st_gid == 0) {
                                /* Private directory still owned by root:root: hand it over to 'nobody'. */
                                chown_uid = UID_NOBODY;
                                chown_gid = GID_NOBODY;
                                do_chown = true;
                                i->idmapped = true;
                        } else {
                                /* Owned by someone else already: keep chowning to the requested UID/GID. */
                                do_chown = true;
                                i->idmapped = false;
                        }
                }

                if (do_chown) {
                        r = path_chown_recursive(target_dir, chown_uid, chown_gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
                        if (r < 0)
                                goto fail;
                }
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
                        r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Common error exit: tell the caller which exit status matches the failed directory type. */
        *exit_status = exit_status_table[type];
        return r;
}
2919
#if ENABLE_SMACK
/* Applies a SMACK process label via mac_smack_apply_pid() with PID 0 (presumably "the calling process" -
 * confirm against mac_smack_apply_pid()).
 *
 * The label configured in context->smack_process_label takes precedence. If none is configured but
 * params->fallback_smack_process_label is set, the SMACK_ATTR_EXEC label read from executable_fd is used
 * if there is one, and the fallback label otherwise. If neither is set, nothing is done.
 *
 * Returns 0 on success (or if there was nothing to do), a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecParameters *params,
                int executable_fd) {

        int r;

        assert(context);
        assert(params);
        assert(executable_fd >= 0);

        /* An explicitly configured label always wins. */
        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        /* No explicit label and no fallback: leave the label alone. */
        if (!params->fallback_smack_process_label)
                return 0;

        _cleanup_free_ char *exec_label = NULL;

        /* A missing xattr on the executable is fine, we use the fallback label then. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
        return r < 0 ? r : 0;
}
#endif
2950
2951 static int compile_bind_mounts(
2952                 const ExecContext *context,
2953                 const ExecParameters *params,
2954                 uid_t exec_directory_uid, /* only used for id-mapped mounts Exec directories */
2955                 gid_t exec_directory_gid, /* only used for id-mapped mounts Exec directories */
2956                 BindMount **ret_bind_mounts,
2957                 size_t *ret_n_bind_mounts,
2958                 char ***ret_empty_directories) {
2959
2960         _cleanup_strv_free_ char **empty_directories = NULL;
2961         BindMount *bind_mounts = NULL;
2962         size_t n, h = 0;
2963         int r;
2964
2965         assert(context);
2966         assert(params);
2967         assert(ret_bind_mounts);
2968         assert(ret_n_bind_mounts);
2969         assert(ret_empty_directories);
2970
2971         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2972
2973         n = context->n_bind_mounts;
2974         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2975                 if (!params->prefix[t])
2976                         continue;
2977
2978                 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
2979                         n += !FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) || FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY);
2980         }
2981
2982         if (n <= 0) {
2983                 *ret_bind_mounts = NULL;
2984                 *ret_n_bind_mounts = 0;
2985                 *ret_empty_directories = NULL;
2986                 return 0;
2987         }
2988
2989         bind_mounts = new(BindMount, n);
2990         if (!bind_mounts)
2991                 return -ENOMEM;
2992
2993         FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
2994                 r = bind_mount_add(&bind_mounts, &h, item);
2995                 if (r < 0)
2996                         return r;
2997         }
2998
2999         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3000                 if (!params->prefix[t])
3001                         continue;
3002
3003                 if (context->directories[t].n_items == 0)
3004                         continue;
3005
3006                 if (exec_directory_is_private(context, t) &&
3007                     !exec_context_with_rootfs(context)) {
3008                         char *private_root;
3009
3010                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3011                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3012                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3013
3014                         private_root = path_join(params->prefix[t], "private");
3015                         if (!private_root)
3016                                 return -ENOMEM;
3017
3018                         r = strv_consume(&empty_directories, private_root);
3019                         if (r < 0)
3020                                 return r;
3021                 }
3022
3023                 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
3024                         _cleanup_free_ char *s = NULL, *d = NULL;
3025
3026                         /* When one of the parent directories is in the list, we cannot create the symlink
3027                          * for the child directory. See also the comments in setup_exec_directory().
3028                          * But if it needs to be read only, then we have to create a bind mount anyway to
3029                          * make it so. */
3030                         if (FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE) && !FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY))
3031                                 continue;
3032
3033                         if (exec_directory_is_private(context, t))
3034                                 s = path_join(params->prefix[t], "private", i->path);
3035                         else
3036                                 s = path_join(params->prefix[t], i->path);
3037                         if (!s)
3038                                 return -ENOMEM;
3039
3040                         if (exec_directory_is_private(context, t) &&
3041                             exec_context_with_rootfs(context))
3042                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3043                                  * directory is not created on the root directory. So, let's bind-mount the directory
3044                                  * on the 'non-private' place. */
3045                                 d = path_join(params->prefix[t], i->path);
3046                         else
3047                                 d = strdup(s);
3048                         if (!d)
3049                                 return -ENOMEM;
3050
3051                         bind_mounts[h++] = (BindMount) {
3052                                 .source = TAKE_PTR(s),
3053                                 .destination = TAKE_PTR(d),
3054                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3055                                 .recursive = true,
3056                                 .read_only = FLAGS_SET(i->flags, EXEC_DIRECTORY_READ_ONLY),
3057                                 .idmapped = i->idmapped,
3058                                 .uid = exec_directory_uid,
3059                                 .gid = exec_directory_gid,
3060                         };
3061                 }
3062         }
3063
3064         assert(h == n);
3065
3066         *ret_bind_mounts = TAKE_PTR(bind_mounts);
3067         *ret_n_bind_mounts = n;
3068         *ret_empty_directories = TAKE_PTR(empty_directories);
3069
3070         return (int) n;
3071 }
3072
3073 /* ret_symlinks will contain a list of pairs src:dest that describes
3074  * the symlinks to create later on. For example, the symlinks needed
3075  * to safely give private directories to DynamicUser=1 users. */
3076 static int compile_symlinks(
3077                 const ExecContext *context,
3078                 const ExecParameters *params,
3079                 bool setup_os_release_symlink,
3080                 char ***ret_symlinks) {
3081
3082         _cleanup_strv_free_ char **symlinks = NULL;
3083         int r;
3084
3085         assert(context);
3086         assert(params);
3087         assert(ret_symlinks);
3088
3089         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
3090                 FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
3091                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3092
3093                         STRV_FOREACH(symlink, i->symlinks) {
3094                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3095
3096                                 src_abs = path_join(params->prefix[dt], i->path);
3097                                 dst_abs = path_join(params->prefix[dt], *symlink);
3098                                 if (!src_abs || !dst_abs)
3099                                         return -ENOMEM;
3100
3101                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3102                                 if (r < 0)
3103                                         return r;
3104                         }
3105
3106                         if (!exec_directory_is_private(context, dt) ||
3107                             exec_context_with_rootfs(context) ||
3108                             FLAGS_SET(i->flags, EXEC_DIRECTORY_ONLY_CREATE))
3109                                 continue;
3110
3111                         private_path = path_join(params->prefix[dt], "private", i->path);
3112                         if (!private_path)
3113                                 return -ENOMEM;
3114
3115                         path = path_join(params->prefix[dt], i->path);
3116                         if (!path)
3117                                 return -ENOMEM;
3118
3119                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3120                         if (r < 0)
3121                                 return r;
3122                 }
3123
3124         /* We make the host's os-release available via a symlink, so that we can copy it atomically
3125          * and readers will never get a half-written version. Note that, while the paths specified here are
3126          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
3127          * 'os-release -> .os-release-stage/os-release' is what will be created. */
3128         if (setup_os_release_symlink) {
3129                 r = strv_extend_many(
3130                                 &symlinks,
3131                                 "/run/host/.os-release-stage/os-release",
3132                                 "/run/host/os-release");
3133                 if (r < 0)
3134                         return r;
3135         }
3136
3137         *ret_symlinks = TAKE_PTR(symlinks);
3138
3139         return 0;
3140 }
3141
3142 static bool insist_on_sandboxing(
3143                 const ExecContext *context,
3144                 const char *root_dir,
3145                 const char *root_image,
3146                 const BindMount *bind_mounts,
3147                 size_t n_bind_mounts) {
3148
3149         assert(context);
3150         assert(n_bind_mounts == 0 || bind_mounts);
3151
3152         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3153          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3154          * rearrange stuff in a way we cannot ignore gracefully. */
3155
3156         if (context->n_temporary_filesystems > 0)
3157                 return true;
3158
3159         if (root_dir || root_image)
3160                 return true;
3161
3162         if (context->n_mount_images > 0)
3163                 return true;
3164
3165         if (context->dynamic_user)
3166                 return true;
3167
3168         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3169                 return true;
3170
3171         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3172          * essential. */
3173         FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
3174                 if (!path_equal(i->source, i->destination))
3175                         return true;
3176
3177         if (context->log_namespace)
3178                 return true;
3179
3180         return false;
3181 }
3182
3183 static int setup_ephemeral(
3184                 const ExecContext *context,
3185                 ExecRuntime *runtime,
3186                 char **root_image,            /* both input and output! modified if ephemeral logic enabled */
3187                 char **root_directory,        /* ditto */
3188                 char **reterr_path) {
3189
3190         _cleanup_close_ int fd = -EBADF;
3191         _cleanup_free_ char *new_root = NULL;
3192         int r;
3193
3194         assert(context);
3195         assert(runtime);
3196         assert(root_image);
3197         assert(root_directory);
3198
3199         if (!*root_image && !*root_directory)
3200                 return 0;
3201
3202         if (!runtime->ephemeral_copy)
3203                 return 0;
3204
3205         assert(runtime->ephemeral_storage_socket[0] >= 0);
3206         assert(runtime->ephemeral_storage_socket[1] >= 0);
3207
3208         new_root = strdup(runtime->ephemeral_copy);
3209         if (!new_root)
3210                 return log_oom_debug();
3211
3212         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3213         if (r < 0)
3214                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3215
3216         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3217
3218         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3219         if (fd >= 0)
3220                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3221                 return 0;
3222         if (fd != -EAGAIN)
3223                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3224
3225         if (*root_image) {
3226                 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
3227
3228                 fd = copy_file(*root_image, new_root, O_EXCL, 0600,
3229                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME|COPY_NOCOW_AFTER);
3230                 if (fd < 0) {
3231                         *reterr_path = strdup(*root_image);
3232                         return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
3233                                                *root_image, new_root);
3234                 }
3235         } else {
3236                 assert(*root_directory);
3237
3238                 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
3239
3240                 fd = btrfs_subvol_snapshot_at(
3241                                 AT_FDCWD, *root_directory,
3242                                 AT_FDCWD, new_root,
3243                                 BTRFS_SNAPSHOT_FALLBACK_COPY |
3244                                 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3245                                 BTRFS_SNAPSHOT_RECURSIVE |
3246                                 BTRFS_SNAPSHOT_LOCK_BSD);
3247                 if (fd < 0) {
3248                         *reterr_path = strdup(*root_directory);
3249                         return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
3250                                                *root_directory, new_root);
3251                 }
3252         }
3253
3254         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3255         if (r < 0)
3256                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3257
3258         if (*root_image)
3259                 free_and_replace(*root_image, new_root);
3260         else {
3261                 assert(*root_directory);
3262                 free_and_replace(*root_directory, new_root);
3263         }
3264
3265         return 1;
3266 }
3267
3268 static int verity_settings_prepare(
3269                 VeritySettings *verity,
3270                 const char *root_image,
3271                 const void *root_hash,
3272                 size_t root_hash_size,
3273                 const char *root_hash_path,
3274                 const void *root_hash_sig,
3275                 size_t root_hash_sig_size,
3276                 const char *root_hash_sig_path,
3277                 const char *verity_data_path) {
3278
3279         int r;
3280
3281         assert(verity);
3282
3283         if (root_hash) {
3284                 void *d;
3285
3286                 d = memdup(root_hash, root_hash_size);
3287                 if (!d)
3288                         return -ENOMEM;
3289
3290                 free_and_replace(verity->root_hash, d);
3291                 verity->root_hash_size = root_hash_size;
3292                 verity->designator = PARTITION_ROOT;
3293         }
3294
3295         if (root_hash_sig) {
3296                 void *d;
3297
3298                 d = memdup(root_hash_sig, root_hash_sig_size);
3299                 if (!d)
3300                         return -ENOMEM;
3301
3302                 free_and_replace(verity->root_hash_sig, d);
3303                 verity->root_hash_sig_size = root_hash_sig_size;
3304                 verity->designator = PARTITION_ROOT;
3305         }
3306
3307         if (verity_data_path) {
3308                 r = free_and_strdup(&verity->data_path, verity_data_path);
3309                 if (r < 0)
3310                         return r;
3311         }
3312
3313         r = verity_settings_load(
3314                         verity,
3315                         root_image,
3316                         root_hash_path,
3317                         root_hash_sig_path);
3318         if (r < 0)
3319                 return log_debug_errno(r, "Failed to load root hash: %m");
3320
3321         return 0;
3322 }
3323
3324 static int pick_versions(
3325                 const ExecContext *context,
3326                 const ExecParameters *params,
3327                 char **ret_root_image,
3328                 char **ret_root_directory,
3329                 char **reterr_path) {
3330
3331         int r;
3332
3333         assert(context);
3334         assert(params);
3335         assert(ret_root_image);
3336         assert(ret_root_directory);
3337
3338         if (context->root_image) {
3339                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3340
3341                 r = path_pick(/* toplevel_path= */ NULL,
3342                               /* toplevel_fd= */ AT_FDCWD,
3343                               context->root_image,
3344                               &pick_filter_image_raw,
3345                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3346                               &result);
3347                 if (r < 0) {
3348                         *reterr_path = strdup(context->root_image);
3349                         return r;
3350                 }
3351
3352                 if (!result.path) {
3353                         *reterr_path = strdup(context->root_image);
3354                         return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3355                 }
3356
3357                 *ret_root_image = TAKE_PTR(result.path);
3358                 *ret_root_directory = NULL;
3359                 return r;
3360         }
3361
3362         if (context->root_directory) {
3363                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3364
3365                 r = path_pick(/* toplevel_path= */ NULL,
3366                               /* toplevel_fd= */ AT_FDCWD,
3367                               context->root_directory,
3368                               &pick_filter_image_dir,
3369                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3370                               &result);
3371                 if (r < 0) {
3372                         *reterr_path = strdup(context->root_directory);
3373                         return r;
3374                 }
3375
3376                 if (!result.path) {
3377                         *reterr_path = strdup(context->root_directory);
3378                         return log_debug_errno(SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3379                 }
3380
3381                 *ret_root_image = NULL;
3382                 *ret_root_directory = TAKE_PTR(result.path);
3383                 return r;
3384         }
3385
3386         *ret_root_image = *ret_root_directory = NULL;
3387         return 0;
3388 }
3389
3390 static int apply_mount_namespace(
3391                 ExecCommandFlags command_flags,
3392                 const ExecContext *context,
3393                 const ExecParameters *params,
3394                 ExecRuntime *runtime,
3395                 const char *memory_pressure_path,
3396                 bool needs_sandboxing,
3397                 char **reterr_path,
3398                 uid_t exec_directory_uid,
3399                 gid_t exec_directory_gid) {
3400
3401         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3402         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3403                         **read_write_paths_cleanup = NULL;
3404         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3405                 *private_namespace_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3406         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3407         char **read_write_paths;
3408         bool setup_os_release_symlink;
3409         BindMount *bind_mounts = NULL;
3410         size_t n_bind_mounts = 0;
3411         int r;
3412
3413         assert(context);
3414         assert(params);
3415         assert(runtime);
3416
3417         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3418
3419         if (params->flags & EXEC_APPLY_CHROOT) {
3420                 r = pick_versions(
3421                                 context,
3422                                 params,
3423                                 &root_image,
3424                                 &root_dir,
3425                                 reterr_path);
3426                 if (r < 0)
3427                         return r;
3428
3429                 r = setup_ephemeral(
3430                                 context,
3431                                 runtime,
3432                                 &root_image,
3433                                 &root_dir,
3434                                 reterr_path);
3435                 if (r < 0)
3436                         return r;
3437         }
3438
3439         r = compile_bind_mounts(context, params, exec_directory_uid, exec_directory_gid, &bind_mounts, &n_bind_mounts, &empty_directories);
3440         if (r < 0)
3441                 return r;
3442
3443         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3444          * service will need to write to it in order to start the notifications. */
3445         if (exec_is_cgroup_mount_read_only(context) && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3446                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3447                 if (!read_write_paths_cleanup)
3448                         return -ENOMEM;
3449
3450                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3451                 if (r < 0)
3452                         return r;
3453
3454                 read_write_paths = read_write_paths_cleanup;
3455         } else
3456                 read_write_paths = context->read_write_paths;
3457
3458         if (needs_sandboxing) {
3459                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3460                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3461                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3462
3463                 if (context->private_tmp == PRIVATE_TMP_CONNECTED && runtime->shared) {
3464                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3465                                 tmp_dir = runtime->shared->tmp_dir;
3466                         else if (runtime->shared->tmp_dir)
3467                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3468
3469                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3470                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3471                         else if (runtime->shared->var_tmp_dir)
3472                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3473                 }
3474         }
3475
3476         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3477         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3478         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3479         if (r < 0)
3480                 return r;
3481
3482         if (context->mount_propagation_flag == MS_SHARED)
3483                 log_debug("shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3484
3485         r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3486         if (r < 0)
3487                 return r;
3488
3489         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3490                 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3491                 if (!propagate_dir)
3492                         return -ENOMEM;
3493
3494                 incoming_dir = strdup("/run/systemd/incoming");
3495                 if (!incoming_dir)
3496                         return -ENOMEM;
3497
3498                 private_namespace_dir = strdup("/run/systemd");
3499                 if (!private_namespace_dir)
3500                         return -ENOMEM;
3501
3502                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3503                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3504                 if (setup_os_release_symlink) {
3505                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3506                         if (!host_os_release_stage)
3507                                 return -ENOMEM;
3508                 }
3509         } else {
3510                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3511
3512                 if (asprintf(&private_namespace_dir, "/run/user/" UID_FMT "/systemd", geteuid()) < 0)
3513                         return -ENOMEM;
3514
3515                 if (setup_os_release_symlink) {
3516                         if (asprintf(&host_os_release_stage,
3517                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3518                                      geteuid()) < 0)
3519                                 return -ENOMEM;
3520                 }
3521         }
3522
3523         if (root_image) {
3524                 r = verity_settings_prepare(
3525                         &verity,
3526                         root_image,
3527                         context->root_hash, context->root_hash_size, context->root_hash_path,
3528                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3529                         context->root_verity);
3530                 if (r < 0)
3531                         return r;
3532         }
3533
3534         NamespaceParameters parameters = {
3535                 .runtime_scope = params->runtime_scope,
3536
3537                 .root_directory = root_dir,
3538                 .root_image = root_image,
3539                 .root_image_options = context->root_image_options,
3540                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3541
3542                 .read_write_paths = read_write_paths,
3543                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3544                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3545
3546                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3547                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3548
3549                 .empty_directories = empty_directories,
3550                 .symlinks = symlinks,
3551
3552                 .bind_mounts = bind_mounts,
3553                 .n_bind_mounts = n_bind_mounts,
3554
3555                 .temporary_filesystems = context->temporary_filesystems,
3556                 .n_temporary_filesystems = context->n_temporary_filesystems,
3557
3558                 .mount_images = context->mount_images,
3559                 .n_mount_images = context->n_mount_images,
3560                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3561
3562                 .tmp_dir = tmp_dir,
3563                 .var_tmp_dir = var_tmp_dir,
3564
3565                 .creds_path = creds_path,
3566                 .log_namespace = context->log_namespace,
3567                 .mount_propagation_flag = context->mount_propagation_flag,
3568
3569                 .verity = &verity,
3570
3571                 .extension_images = context->extension_images,
3572                 .n_extension_images = context->n_extension_images,
3573                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3574                 .extension_directories = context->extension_directories,
3575
3576                 .propagate_dir = propagate_dir,
3577                 .incoming_dir = incoming_dir,
3578                 .private_namespace_dir = private_namespace_dir,
3579                 .host_notify_socket = params->notify_socket,
3580                 .notify_socket_path = exec_get_private_notify_socket_path(context, params, needs_sandboxing),
3581                 .host_os_release_stage = host_os_release_stage,
3582
3583                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3584                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3585                  * sandbox inside the mount namespace. */
3586                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3587
3588                 .protect_control_groups = needs_sandboxing ? exec_get_protect_control_groups(context) : PROTECT_CONTROL_GROUPS_NO,
3589                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3590                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3591                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3592
3593                 .private_dev = needs_sandboxing && context->private_devices,
3594                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3595                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3596                 .private_pids = needs_sandboxing && exec_needs_pid_namespace(context, params) ? context->private_pids : PRIVATE_PIDS_NO,
3597                 .private_tmp = needs_sandboxing ? context->private_tmp : PRIVATE_TMP_NO,
3598                 .private_var_tmp = needs_sandboxing ? context->private_var_tmp : PRIVATE_TMP_NO,
3599
3600                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3601                 .bind_log_sockets = needs_sandboxing && exec_context_get_effective_bind_log_sockets(context),
3602
3603                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3604                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3605
3606                 .protect_home = needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
3607                 .protect_hostname = needs_sandboxing ? context->protect_hostname : PROTECT_HOSTNAME_NO,
3608                 .protect_system = needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
3609                 .protect_proc = needs_sandboxing ? context->protect_proc : PROTECT_PROC_DEFAULT,
3610                 .proc_subset = needs_sandboxing ? context->proc_subset : PROC_SUBSET_ALL,
3611         };
3612
3613         r = setup_namespace(&parameters, reterr_path);
3614         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3615          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3616          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3617          * completely different execution environment. */
3618         if (r == -ENOANO) {
3619                 if (insist_on_sandboxing(
3620                                     context,
3621                                     root_dir, root_image,
3622                                     bind_mounts,
3623                                     n_bind_mounts))
3624                         return log_debug_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
3625                                                "Failed to set up namespace, and refusing to continue since "
3626                                                "the selected namespacing options alter mount environment non-trivially.\n"
3627                                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3628                                                n_bind_mounts,
3629                                                context->n_temporary_filesystems,
3630                                                yes_no(root_dir),
3631                                                yes_no(root_image),
3632                                                yes_no(context->dynamic_user));
3633
3634                 log_debug("Failed to set up namespace, assuming containerized execution and ignoring.");
3635                 return 0;
3636         }
3637
3638         return r;
3639 }
3640
3641 static int apply_working_directory(
3642                 const ExecContext *context,
3643                 const ExecParameters *params,
3644                 ExecRuntime *runtime,
3645                 const char *pwent_home,
3646                 char * const *env) {
3647
3648         const char *wd;
3649         int r;
3650
3651         assert(context);
3652         assert(params);
3653         assert(runtime);
3654
3655         if (context->working_directory_home) {
3656                 /* Preferably use the data from $HOME, in case it was updated by a PAM module */
3657                 wd = strv_env_get(env, "HOME");
3658                 if (!wd) {
3659                         /* If that's not available, use the data from the struct passwd entry: */
3660                         if (!pwent_home)
3661                                 return -ENXIO;
3662
3663                         wd = pwent_home;
3664                 }
3665         } else
3666                 wd = empty_to_root(context->working_directory);
3667
3668         if (params->flags & EXEC_APPLY_CHROOT)
3669                 r = RET_NERRNO(chdir(wd));
3670         else {
3671                 _cleanup_close_ int dfd = -EBADF;
3672
3673                 r = chase(wd,
3674                           runtime->ephemeral_copy ?: context->root_directory,
3675                           CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3676                           /* ret_path= */ NULL,
3677                           &dfd);
3678                 if (r >= 0)
3679                         r = RET_NERRNO(fchdir(dfd));
3680         }
3681         return context->working_directory_missing_ok ? 0 : r;
3682 }
3683
3684 static int apply_root_directory(
3685                 const ExecContext *context,
3686                 const ExecParameters *params,
3687                 ExecRuntime *runtime,
3688                 const bool needs_mount_ns,
3689                 int *exit_status) {
3690
3691         assert(context);
3692         assert(params);
3693         assert(runtime);
3694         assert(exit_status);
3695
3696         if (params->flags & EXEC_APPLY_CHROOT)
3697                 if (!needs_mount_ns && context->root_directory)
3698                         if (chroot(runtime->ephemeral_copy ?: context->root_directory) < 0) {
3699                                 *exit_status = EXIT_CHROOT;
3700                                 return -errno;
3701                         }
3702
3703         return 0;
3704 }
3705
/* Sets up the kernel session keyring for the process being spawned.
 *
 * Nothing is done if context->keyring_mode is EXEC_KEYRING_INHERIT. Otherwise the real UID/GID are
 * temporarily changed to 'uid'/'gid' (where those are valid and differ from the current ones), a new
 * session keyring is joined and, for EXEC_KEYRING_SHARED, the user keyring is linked into it. After
 * switching the identity back, the invocation ID from 'p' (if not null) is added to the session keyring
 * as a "user" key named "invocation_id", and its permissions are set to view/read/search for possessor
 * and user.
 *
 * ENOSYS, privilege errors and EDQUOT when joining the session keyring are logged at debug level and
 * ignored (0 is returned), as is a failure to add the invocation ID key. All other failures are logged
 * and the value of the respective log_error_errno() call is returned. */
static int setup_keyring(
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid,
                gid_t gid) {

        key_serial_t keyring;
        int r = 0;
        uid_t saved_uid;
        gid_t saved_gid;

        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
         * & group is just as nasty as acquiring a reference to the user keyring. */

        saved_uid = getuid();
        saved_gid = getgid();

        /* Nothing has been changed yet at this point, hence a plain return is fine here. */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(gid, -1) < 0)
                        return log_error_errno(errno, "Failed to change GID for user keyring: %m");
        }

        /* From here on the identity may have been altered, hence leave via "out" so that it is reverted. */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(uid, -1) < 0) {
                        r = log_error_errno(errno, "Failed to change UID for user keyring: %m");
                        goto out;
                }
        }

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                /* The first three cases leave r at 0, i.e. they are not treated as failures. */
                if (errno == ENOSYS)
                        log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
                else if (ERRNO_IS_PRIVILEGE(errno))
                        log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
                else
                        r = log_error_errno(errno, "Setting up kernel keyring failed: %m");

                goto out;
        }

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
                        r = log_error_errno(errno, "Failed to link user keyring into session keyring: %m");
                        goto out;
                }
        }

        /* Restore uid/gid back */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(saved_uid, -1) < 0) {
                        r = log_error_errno(errno, "Failed to change UID back for user keyring: %m");
                        goto out;
                }
        }

        /* Note that this failure path returns directly instead of going through "out". */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(saved_gid, -1) < 0)
                        return log_error_errno(errno, "Failed to change GID back for user keyring: %m");
        }

        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
        if (!sd_id128_is_null(p->invocation_id)) {
                key_serial_t key;

                key = add_key("user",
                              "invocation_id",
                              &p->invocation_id,
                              sizeof(p->invocation_id),
                              KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Only view/read/search for possessor and user are granted on the key. */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                r = log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
                }
        }

out:
        /* Revert back uid & gid for the last time, and exit */
        /* no extra logging, as only the first already reported error matters */
        if (getuid() != saved_uid)
                (void) setreuid(saved_uid, -1);

        if (getgid() != saved_gid)
                (void) setregid(saved_gid, -1);

        return r;
}
3818
/* Appends the valid (i.e. non-negative) file descriptors of 'pair' to 'array', first pair[0] and then
 * pair[1], and advances the element counter *n by the number of entries added. The caller has to
 * make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3829
3830 static int close_remaining_fds(
3831                 const ExecParameters *params,
3832                 const ExecRuntime *runtime,
3833                 int socket_fd,
3834                 const int *fds,
3835                 size_t n_fds) {
3836
3837         size_t n_dont_close = 0;
3838         int dont_close[n_fds + 17];
3839
3840         assert(params);
3841         assert(runtime);
3842
3843         if (params->stdin_fd >= 0)
3844                 dont_close[n_dont_close++] = params->stdin_fd;
3845         if (params->stdout_fd >= 0)
3846                 dont_close[n_dont_close++] = params->stdout_fd;
3847         if (params->stderr_fd >= 0)
3848                 dont_close[n_dont_close++] = params->stderr_fd;
3849
3850         if (socket_fd >= 0)
3851                 dont_close[n_dont_close++] = socket_fd;
3852         if (n_fds > 0) {
3853                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3854                 n_dont_close += n_fds;
3855         }
3856
3857         append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3858
3859         if (runtime->shared) {
3860                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3861                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3862         }
3863
3864         if (runtime->dynamic_creds) {
3865                 if (runtime->dynamic_creds->user)
3866                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3867                 if (runtime->dynamic_creds->group)
3868                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3869         }
3870
3871         if (params->user_lookup_fd >= 0)
3872                 dont_close[n_dont_close++] = params->user_lookup_fd;
3873
3874         if (params->handoff_timestamp_fd >= 0)
3875                 dont_close[n_dont_close++] = params->handoff_timestamp_fd;
3876
3877         if (params->pidref_transport_fd >= 0)
3878                 dont_close[n_dont_close++] = params->pidref_transport_fd;
3879
3880         assert(n_dont_close <= ELEMENTSOF(dont_close));
3881
3882         return close_all_fds(dont_close, n_dont_close);
3883 }
3884
/* Reports the resolved UID/GID for 'unit_id' by writing a single message to 'user_lookup_fd', made up
 * of the raw uid, the raw gid and the unit name, in that order.
 *
 * Nothing is sent (and 0 is returned) if 'user_lookup_fd' is negative or if neither 'uid' nor 'gid'
 * is valid. Returns 0 on success and -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        /* No fd to report to, or no user/group was resolved: suppress the message. */
        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        const struct iovec payload[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, payload, 3) < 0)
                return -errno;

        return 0;
}
3912
3913 static int acquire_home(const ExecContext *c, const char **home, char **ret_buf) {
3914         int r;
3915
3916         assert(c);
3917         assert(home);
3918         assert(ret_buf);
3919
3920         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3921
3922         if (*home) /* Already acquired from get_fixed_user()? */
3923                 return 0;
3924
3925         if (!c->working_directory_home)
3926                 return 0;
3927
3928         if (c->dynamic_user || (c->user && is_this_me(c->user) <= 0))
3929                 return -EADDRNOTAVAIL;
3930
3931         r = get_home_dir(ret_buf);
3932         if (r < 0)
3933                 return r;
3934
3935         *home = *ret_buf;
3936         return 1;
3937 }
3938
3939 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3940         _cleanup_strv_free_ char ** list = NULL;
3941         int r;
3942
3943         assert(c);
3944         assert(p);
3945         assert(ret);
3946
3947         assert(c->dynamic_user);
3948
3949         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3950          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3951          * directories. */
3952
3953         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3954
3955                 if (!EXEC_DIRECTORY_TYPE_SHALL_CHOWN(t))
3956                         continue;
3957
3958                 if (!p->prefix[t])
3959                         continue;
3960
3961                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3962                         char *e;
3963
3964                         if (exec_directory_is_private(c, t))
3965                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3966                         else
3967                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3968                         if (!e)
3969                                 return -ENOMEM;
3970
3971                         r = strv_consume(&list, e);
3972                         if (r < 0)
3973                                 return r;
3974                 }
3975         }
3976
3977         *ret = TAKE_PTR(list);
3978
3979         return 0;
3980 }
3981
3982 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3983         int r;
3984
3985         assert(c);
3986         assert(ret);
3987
3988         if (!c->numa_policy.nodes.set) {
3989                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3990                 *ret = (CPUSet) {};
3991                 return 0;
3992         }
3993
3994         _cleanup_(cpu_set_done) CPUSet s = {};
3995         r = numa_to_cpu_set(&c->numa_policy, &s);
3996         if (r < 0)
3997                 return r;
3998
3999         *ret = TAKE_STRUCT(s);
4000         return 0;
4001 }
4002
4003 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
4004         int r;
4005
4006         assert(fds);
4007         assert(n_fds);
4008         assert(*n_fds < fds_size);
4009         assert(fd);
4010
4011         if (*fd < 0)
4012                return 0;
4013
4014         if (*fd < 3 + (int) *n_fds) {
4015                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4016                  * the fds we pass to the process (or which are closed only during execve). */
4017
4018                 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4019                 if (r < 0)
4020                         return -errno;
4021
4022                 close_and_replace(*fd, r);
4023         }
4024
4025         fds[(*n_fds)++] = *fd;
4026         return 1;
4027 }
4028
4029 static int connect_unix_harder(const OpenFile *of, int ofd) {
4030         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4031
4032         union sockaddr_union addr = {
4033                 .un.sun_family = AF_UNIX,
4034         };
4035         socklen_t sa_len;
4036         int r;
4037
4038         assert(of);
4039         assert(ofd >= 0);
4040
4041         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4042         if (r < 0)
4043                 return log_debug_errno(r, "Failed to set sockaddr for '%s': %m", of->path);
4044         sa_len = r;
4045
4046         FOREACH_ELEMENT(i, socket_types) {
4047                 _cleanup_close_ int fd = -EBADF;
4048
4049                 fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
4050                 if (fd < 0)
4051                         return log_debug_errno(errno, "Failed to create socket for '%s': %m", of->path);
4052
4053                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4054                 if (r >= 0)
4055                         return TAKE_FD(fd);
4056                 if (r != -EPROTOTYPE)
4057                         return log_debug_errno(r, "Failed to connect to socket for '%s': %m", of->path);
4058         }
4059
4060         return log_debug_errno(SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.", of->path);
4061 }
4062
4063 static int get_open_file_fd(const OpenFile *of) {
4064         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4065         struct stat st;
4066
4067         assert(of);
4068
4069         ofd = open(of->path, O_PATH | O_CLOEXEC);
4070         if (ofd < 0)
4071                 return log_debug_errno(errno, "Failed to open '%s' as O_PATH: %m", of->path);
4072
4073         if (fstat(ofd, &st) < 0)
4074                 return log_debug_errno( errno, "Failed to stat '%s': %m", of->path);
4075
4076         if (S_ISSOCK(st.st_mode)) {
4077                 fd = connect_unix_harder(of, ofd);
4078                 if (fd < 0)
4079                         return fd;
4080
4081                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4082                         return log_debug_errno(errno, "Failed to shutdown send for socket '%s': %m", of->path);
4083
4084                 log_debug("Opened socket '%s' as fd %d.", of->path, fd);
4085         } else {
4086                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4087                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4088                         flags |= O_APPEND;
4089                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4090                         flags |= O_TRUNC;
4091
4092                 fd = fd_reopen(ofd, flags|O_NOCTTY|O_CLOEXEC);
4093                 if (fd < 0)
4094                         return log_debug_errno(fd, "Failed to reopen file '%s': %m", of->path);
4095
4096                 log_debug("Opened file '%s' as fd %d.", of->path, fd);
4097         }
4098
4099         return TAKE_FD(fd);
4100 }
4101
4102 static int collect_open_file_fds(ExecParameters *p, size_t *n_fds) {
4103         assert(p);
4104         assert(n_fds);
4105
4106         LIST_FOREACH(open_files, of, p->open_files) {
4107                 _cleanup_close_ int fd = -EBADF;
4108
4109                 fd = get_open_file_fd(of);
4110                 if (fd < 0) {
4111                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4112                                 log_full_errno(fd == -ENOENT || ERRNO_IS_NEG_PRIVILEGE(fd) ? LOG_DEBUG : LOG_WARNING,
4113                                                fd,
4114                                                "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
4115                                                of->path);
4116                                 continue;
4117                         }
4118
4119                         return log_error_errno(fd, "Failed to get OpenFile= file descriptor for '%s': %m", of->path);
4120                 }
4121
4122                 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
4123                         return log_oom();
4124
4125                 if (strv_extend(&p->fd_names, of->fdname) < 0)
4126                         return log_oom();
4127
4128                 p->fds[(*n_fds)++] = TAKE_FD(fd);
4129         }
4130
4131         return 0;
4132 }
4133
4134 static void log_command_line(
4135                 const ExecContext *context,
4136                 const ExecParameters *params,
4137                 const char *msg,
4138                 const char *executable,
4139                 char **argv) {
4140
4141         assert(context);
4142         assert(params);
4143         assert(msg);
4144         assert(executable);
4145
4146         if (!DEBUG_LOGGING)
4147                 return;
4148
4149         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4150
4151         log_struct(LOG_DEBUG,
4152                    LOG_ITEM("EXECUTABLE=%s", executable),
4153                    LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
4154                    LOG_EXEC_INVOCATION_ID(params));
4155 }
4156
4157 static bool exec_needs_cap_sys_admin(const ExecContext *context, const ExecParameters *params) {
4158         assert(context);
4159
4160         return context->private_users != PRIVATE_USERS_NO ||
4161                context->private_tmp != PRIVATE_TMP_NO ||
4162                context->private_devices ||
4163                context->private_network ||
4164                context->network_namespace_path ||
4165                context->private_ipc ||
4166                context->ipc_namespace_path ||
4167                context->private_mounts > 0 ||
4168                context->mount_apivfs > 0 ||
4169                context->bind_log_sockets > 0 ||
4170                context->n_bind_mounts > 0 ||
4171                context->n_temporary_filesystems > 0 ||
4172                context->root_directory ||
4173                !strv_isempty(context->extension_directories) ||
4174                context->protect_system != PROTECT_SYSTEM_NO ||
4175                context->protect_home != PROTECT_HOME_NO ||
4176                exec_needs_pid_namespace(context, params) ||
4177                context->protect_kernel_tunables ||
4178                context->protect_kernel_modules ||
4179                context->protect_kernel_logs ||
4180                exec_needs_cgroup_mount(context) ||
4181                context->protect_clock ||
4182                context->protect_hostname != PROTECT_HOSTNAME_NO ||
4183                !strv_isempty(context->read_write_paths) ||
4184                !strv_isempty(context->read_only_paths) ||
4185                !strv_isempty(context->inaccessible_paths) ||
4186                !strv_isempty(context->exec_paths) ||
4187                !strv_isempty(context->no_exec_paths) ||
4188                context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL;
4189 }
4190
4191 static PrivateUsers exec_context_get_effective_private_users(
4192                 const ExecContext *context,
4193                 const ExecParameters *params) {
4194
4195         assert(context);
4196         assert(params);
4197
4198         if (context->private_users != PRIVATE_USERS_NO)
4199                 return context->private_users;
4200
4201         /* If any namespace is delegated with DelegateNamespaces=, always set up a user namespace. */
4202         if (context->delegate_namespaces != NAMESPACE_FLAGS_INITIAL)
4203                 return PRIVATE_USERS_SELF;
4204
4205         return PRIVATE_USERS_NO;
4206 }
4207
4208 static bool exec_namespace_is_delegated(
4209                 const ExecContext *context,
4210                 const ExecParameters *params,
4211                 bool have_cap_sys_admin,
4212                 unsigned long namespace) {
4213
4214         assert(context);
4215         assert(params);
4216         assert(namespace != CLONE_NEWUSER);
4217
4218         /* If we need unprivileged private users, we've already unshared a user namespace by the time we call
4219          * setup_delegated_namespaces() for the first time so let's make sure we do all other namespace
4220          * unsharing in the first call to setup_delegated_namespaces() by returning false here. */
4221         if (!have_cap_sys_admin && exec_needs_cap_sys_admin(context, params))
4222                 return false;
4223
4224         if (context->delegate_namespaces == NAMESPACE_FLAGS_INITIAL)
4225                 return params->runtime_scope == RUNTIME_SCOPE_USER;
4226
4227         if (FLAGS_SET(context->delegate_namespaces, namespace))
4228                 return true;
4229
4230         /* Various namespaces imply mountns for private procfs/sysfs/cgroupfs instances, which means when
4231          * those are delegated mountns must be deferred too.
4232          *
4233          * The list should stay in sync with exec_needs_mount_namespace(). */
4234         if (namespace == CLONE_NEWNS)
4235                 return context->delegate_namespaces & (CLONE_NEWPID|CLONE_NEWCGROUP|CLONE_NEWNET);
4236
4237         return false;
4238 }
4239
4240 static int setup_delegated_namespaces(
4241                 const ExecContext *context,
4242                 ExecParameters *params,
4243                 ExecRuntime *runtime,
4244                 bool delegate,
4245                 const char *memory_pressure_path,
4246                 uid_t uid,
4247                 uid_t gid,
4248                 const ExecCommand *command,
4249                 bool needs_sandboxing,
4250                 bool have_cap_sys_admin,
4251                 int *reterr_exit_status) {
4252
4253         int r;
4254
4255         /* This function is called twice, once before unsharing the user namespace, and once after unsharing
4256          * the user namespace. When called before unsharing the user namespace, "delegate" is set to "false".
4257          * When called after unsharing the user namespace, "delegate" is set to "true". The net effect is
4258          * that all namespaces that should not be delegated are unshared when this function is called the
4259          * first time and all namespaces that should be delegated are unshared when this function is called
4260          * the second time. */
4261
4262         assert(context);
4263         assert(params);
4264         assert(runtime);
4265         assert(reterr_exit_status);
4266
4267         if (exec_needs_network_namespace(context) &&
4268             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNET) == delegate &&
4269             runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4270
4271                 /* Try to enable network namespacing if network namespacing is available and we have
4272                  * CAP_NET_ADMIN in the current user namespace (either the system manager one or the unit's
4273                  * own user namespace). We need CAP_NET_ADMIN to be able to configure the loopback device in
4274                  * the new network namespace. And if we don't have that, then we could only create a network
4275                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4276                 if (namespace_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4277                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4278                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4279                                 log_notice_errno(r, "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4280                         else if (r < 0) {
4281                                 *reterr_exit_status = EXIT_NETWORK;
4282                                 return log_error_errno(r, "Failed to set up network namespacing: %m");
4283                         } else
4284                                 log_debug("Set up %snetwork namespace", delegate ? "delegated " : "");
4285                 } else if (context->network_namespace_path) {
4286                         *reterr_exit_status = EXIT_NETWORK;
4287                         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
4288                 } else
4289                         log_notice("PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4290         }
4291
4292         if (exec_needs_ipc_namespace(context) &&
4293             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWIPC) == delegate &&
4294             runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4295
4296                 if (namespace_type_supported(NAMESPACE_IPC)) {
4297                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4298                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4299                                 log_warning_errno(r, "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4300                         else if (r < 0) {
4301                                 *reterr_exit_status = EXIT_NAMESPACE;
4302                                 return log_error_errno(r, "Failed to set up IPC namespacing: %m");
4303                         } else
4304                                 log_debug("Set up %sIPC namespace", delegate ? "delegated " : "");
4305                 } else if (context->ipc_namespace_path) {
4306                         *reterr_exit_status = EXIT_NAMESPACE;
4307                         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP), "IPCNamespacePath= is not supported, refusing.");
4308                 } else
4309                         log_warning("PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4310         }
4311
4312         if (needs_sandboxing && exec_needs_cgroup_namespace(context) &&
4313             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWCGROUP) == delegate) {
4314                 if (unshare(CLONE_NEWCGROUP) < 0) {
4315                         *reterr_exit_status = EXIT_NAMESPACE;
4316                         return log_error_errno(errno, "Failed to set up cgroup namespacing: %m");
4317                 }
4318
4319                 log_debug("Set up %scgroup namespace", delegate ? "delegated " : "");
4320         }
4321
4322         /* Unshare a new PID namespace before setting up mounts to ensure /proc/ is mounted with only processes in PID namespace visible.
4323          * Note PrivatePIDs=yes implies MountAPIVFS=yes so we'll always ensure procfs is remounted. */
4324         if (needs_sandboxing && exec_needs_pid_namespace(context, params) &&
4325             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWPID) == delegate) {
4326                 if (params->pidref_transport_fd < 0) {
4327                         *reterr_exit_status = EXIT_NAMESPACE;
4328                         return log_error_errno(SYNTHETIC_ERRNO(ENOTCONN), "PidRef socket is not set up: %m");
4329                 }
4330
4331                 /* If we had CAP_SYS_ADMIN prior to joining the user namespace, then we are privileged and don't need
4332                  * to check if we can mount /proc/.
4333                  *
4334                  * We need to check prior to entering the user namespace because if we're running unprivileged or in a
4335                  * system without CAP_SYS_ADMIN, then we can have CAP_SYS_ADMIN in the current user namespace but not
4336                  * once we unshare a mount namespace. */
4337                 if (!have_cap_sys_admin || delegate) {
4338                         r = can_mount_proc();
4339                         if (r < 0) {
4340                                 *reterr_exit_status = EXIT_NAMESPACE;
4341                                 return log_error_errno(r, "Failed to detect if /proc/ can be remounted: %m");
4342                         }
4343                         if (r == 0) {
4344                                 *reterr_exit_status = EXIT_NAMESPACE;
4345                                 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
4346                                                        "PrivatePIDs=yes is configured, but /proc/ cannot be re-mounted due to lack of privileges, refusing.");
4347                         }
4348                 }
4349
4350                 r = setup_private_pids(context, params);
4351                 if (r < 0) {
4352                         *reterr_exit_status = EXIT_NAMESPACE;
4353                         return log_error_errno(r, "Failed to set up pid namespace: %m");
4354                 }
4355
4356                 log_debug("Set up %spid namespace", delegate ? "delegated " : "");
4357         }
4358
4359         /* If PrivatePIDs= yes is configured, we're now running as pid 1 in a pid namespace! */
4360
4361         if (exec_needs_mount_namespace(context, params, runtime) &&
4362             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWNS) == delegate) {
4363                 _cleanup_free_ char *error_path = NULL;
4364
4365                 r = apply_mount_namespace(command->flags,
4366                                           context,
4367                                           params,
4368                                           runtime,
4369                                           memory_pressure_path,
4370                                           needs_sandboxing,
4371                                           &error_path,
4372                                           uid,
4373                                           gid);
4374                 if (r < 0) {
4375                         *reterr_exit_status = EXIT_NAMESPACE;
4376                         return log_error_errno(r, "Failed to set up mount namespacing%s%s: %m",
4377                                                error_path ? ": " : "", strempty(error_path));
4378                 }
4379
4380                 log_debug("Set up %smount namespace", delegate ? "delegated " : "");
4381         }
4382
4383         if (needs_sandboxing &&
4384             exec_namespace_is_delegated(context, params, have_cap_sys_admin, CLONE_NEWUTS) == delegate) {
4385                 r = apply_protect_hostname(context, params, reterr_exit_status);
4386                 if (r < 0)
4387                         return r;
4388                 if (r > 0)
4389                         log_debug("Set up %sUTS namespace", delegate ? "delegated " : "");
4390         }
4391
4392         return 0;
4393 }
4394
4395 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
4396         assert(context);
4397
4398         if (confirm_spawn_disabled())
4399                 return false;
4400
4401         /* For some reasons units remaining in the same process group
4402          * as PID 1 fail to acquire the console even if it's not used
4403          * by any process. So skip the confirmation question for them. */
4404         return !context->same_pgrp;
4405 }
4406
4407 static int exec_context_named_iofds(
4408                 const ExecContext *c,
4409                 const ExecParameters *p,
4410                 int named_iofds[static 3]) {
4411
4412         size_t targets;
4413         const char* stdio_fdname[3];
4414         size_t n_fds;
4415
4416         assert(c);
4417         assert(p);
4418         assert(named_iofds);
4419
4420         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4421                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4422                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4423
4424         for (size_t i = 0; i < 3; i++)
4425                 stdio_fdname[i] = exec_context_fdname(c, i);
4426
4427         n_fds = p->n_storage_fds + p->n_socket_fds + p->n_extra_fds;
4428
4429         for (size_t i = 0; i < n_fds  && targets > 0; i++)
4430                 if (named_iofds[STDIN_FILENO] < 0 &&
4431                     c->std_input == EXEC_INPUT_NAMED_FD &&
4432                     stdio_fdname[STDIN_FILENO] &&
4433                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4434
4435                         named_iofds[STDIN_FILENO] = p->fds[i];
4436                         targets--;
4437
4438                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4439                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4440                            stdio_fdname[STDOUT_FILENO] &&
4441                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4442
4443                         named_iofds[STDOUT_FILENO] = p->fds[i];
4444                         targets--;
4445
4446                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4447                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4448                            stdio_fdname[STDERR_FILENO] &&
4449                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4450
4451                         named_iofds[STDERR_FILENO] = p->fds[i];
4452                         targets--;
4453                 }
4454
4455         return targets == 0 ? 0 : -ENOENT;
4456 }
4457
4458 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
4459         if (!shared)
4460                 return;
4461
4462         safe_close_pair(shared->netns_storage_socket);
4463         safe_close_pair(shared->ipcns_storage_socket);
4464 }
4465
4466 static void exec_runtime_close(ExecRuntime *rt) {
4467         if (!rt)
4468                 return;
4469
4470         safe_close_pair(rt->ephemeral_storage_socket);
4471
4472         exec_shared_runtime_close(rt->shared);
4473         dynamic_creds_close(rt->dynamic_creds);
4474 }
4475
4476 static void exec_params_close(ExecParameters *p) {
4477         if (!p)
4478                 return;
4479
4480         p->stdin_fd = safe_close(p->stdin_fd);
4481         p->stdout_fd = safe_close(p->stdout_fd);
4482         p->stderr_fd = safe_close(p->stderr_fd);
4483 }
4484
4485 static int exec_fd_mark_hot(
4486                 const ExecContext *c,
4487                 ExecParameters *p,
4488                 bool hot,
4489                 int *reterr_exit_status) {
4490
4491         assert(c);
4492         assert(p);
4493
4494         if (p->exec_fd < 0)
4495                 return 0;
4496
4497         uint8_t x = hot;
4498
4499         if (write(p->exec_fd, &x, sizeof(x)) < 0) {
4500                 if (reterr_exit_status)
4501                         *reterr_exit_status = EXIT_EXEC;
4502                 return log_error_errno(errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
4503         }
4504
4505         return 1;
4506 }
4507
4508 static int send_handoff_timestamp(
4509                 const ExecContext *c,
4510                 ExecParameters *p,
4511                 int *reterr_exit_status) {
4512
4513         assert(c);
4514         assert(p);
4515
4516         if (p->handoff_timestamp_fd < 0)
4517                 return 0;
4518
4519         dual_timestamp dt;
4520         dual_timestamp_now(&dt);
4521
4522         if (write(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2) < 0) {
4523                 if (reterr_exit_status)
4524                         *reterr_exit_status = EXIT_EXEC;
4525                 return log_error_errno(errno, "Failed to send handoff timestamp: %m");
4526         }
4527
4528         return 1;
4529 }
4530
4531 static void prepare_terminal(
4532                 const ExecContext *context,
4533                 ExecParameters *p) {
4534
4535         _cleanup_close_ int lock_fd = -EBADF;
4536
4537         /* This is the "constructive" reset, i.e. is about preparing things for our invocation rather than
4538          * cleaning up things from older invocations. */
4539
4540         assert(context);
4541         assert(p);
4542
4543         /* We only try to reset things if we there's the chance our stdout points to a TTY */
4544         if (!(is_terminal_output(context->std_output) ||
4545               (context->std_output == EXEC_OUTPUT_INHERIT && is_terminal_input(context->std_input)) ||
4546               context->std_output == EXEC_OUTPUT_NAMED_FD ||
4547               p->stdout_fd >= 0))
4548                 return;
4549
4550         /* Let's explicitly determine whether to reset via ANSI sequences or not, taking our ExecContext
4551          * information into account */
4552         bool use_ansi = exec_context_shall_ansi_seq_reset(context);
4553
4554         if (context->tty_reset) {
4555                 /* When we are resetting the TTY, then let's create a lock first, to synchronize access. This
4556                  * in particular matters as concurrent resets and the TTY size ANSI DSR logic done by the
4557                  * exec_context_apply_tty_size() below might interfere */
4558                 lock_fd = lock_dev_console();
4559                 if (lock_fd < 0)
4560                         log_debug_errno(lock_fd, "Failed to lock /dev/console, ignoring: %m");
4561
4562                 /* We explicitly control whether to send ansi sequences or not here, since we want to consult
4563                  * the env vars explicitly configured in the ExecContext, rather than our own environment
4564                  * block. */
4565                 (void) terminal_reset_defensive(STDOUT_FILENO, use_ansi ? TERMINAL_RESET_FORCE_ANSI_SEQ : TERMINAL_RESET_AVOID_ANSI_SEQ);
4566         }
4567
4568         (void) exec_context_apply_tty_size(context, STDIN_FILENO, STDOUT_FILENO, /* tty_path= */ NULL);
4569
4570         if (use_ansi)
4571                 (void) osc_context_open_service(p->unit_id, p->invocation_id, /* ret_seq= */ NULL);
4572 }
4573
4574 static int setup_term_environment(const ExecContext *context, char ***env) {
4575         int r;
4576
4577         assert(context);
4578         assert(env);
4579
4580         /* Already specified by user? */
4581         if (strv_env_get(*env, "TERM"))
4582                 return 0;
4583
4584         /* Do we need $TERM at all? */
4585         if (!is_terminal_input(context->std_input) &&
4586             !is_terminal_output(context->std_output) &&
4587             !is_terminal_output(context->std_error) &&
4588             !context->tty_path)
4589                 return 0;
4590
4591         const char *tty_path = exec_context_tty_path(context);
4592         if (tty_path) {
4593                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
4594                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
4595                  * container manager passes to PID 1 ends up all the way in the console login shown.
4596                  *
4597                  * Note that if this doesn't work out we won't bother with querying systemd.tty.term.console
4598                  * kernel cmdline option or DCS anymore either, because pid1 also imports $TERM based on those
4599                  * and it should have showed up as our $TERM if there were anything. */
4600                 if (tty_is_console(tty_path) && getppid() == 1) {
4601                         const char *term = strv_find_prefix(environ, "TERM=");
4602                         if (term) {
4603                                 r = strv_env_replace_strdup(env, term);
4604                                 if (r < 0)
4605                                         return r;
4606
4607                                 FOREACH_STRING(i, "COLORTERM=", "NO_COLOR=") {
4608                                         const char *s = strv_find_prefix(environ, i);
4609                                         if (!s)
4610                                                 continue;
4611
4612                                         r = strv_env_replace_strdup(env, s);
4613                                         if (r < 0)
4614                                                 return r;
4615                                 }
4616
4617                                 return 1;
4618                         }
4619
4620                 } else {
4621                         if (in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
4622                                 _cleanup_free_ char *key = NULL, *cmdline = NULL;
4623
4624                                 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
4625                                 if (!key)
4626                                         return -ENOMEM;
4627
4628                                 r = proc_cmdline_get_key(key, /* flags = */ 0, &cmdline);
4629                                 if (r > 0)
4630                                         return strv_env_assign(env, "TERM", cmdline);
4631                                 if (r < 0)
4632                                         log_debug_errno(r, "Failed to read '%s' from kernel cmdline, ignoring: %m", key);
4633                         }
4634
4635                         /* This handles real virtual terminals (returning "linux") and
4636                          * any terminals which support the DCS +q query sequence. */
4637                         _cleanup_free_ char *dcs_term = NULL;
4638                         r = query_term_for_tty(tty_path, &dcs_term);
4639                         if (r >= 0)
4640                                 return strv_env_assign(env, "TERM", dcs_term);
4641                 }
4642         }
4643
4644         /* If $TERM is not known and we pick a fallback default, then let's also set
4645          * $COLORTERM=truecolor. That's because our fallback default is vt220, which is
4646          * generally a safe bet (as it supports PageUp/PageDown unlike vt100, and is quite
4647          * universally available in terminfo/termcap), except for the fact that real DEC
4648          * vt220 gear never actually supported color. Most tools these days generate color on
4649          * vt220 anyway, ignoring the physical capabilities of the real hardware, but some
4650          * tools actually believe in the historical truth. Which is unfortunate since *we*
4651          * *don't* care about the historical truth, we just want sane defaults if nothing
4652          * better is explicitly configured. It's 2025 after all, at the time of writing,
4653          * pretty much all terminal emulators actually *do* support color, hence if we don't
4654          * know any better let's explicitly claim color support via $COLORTERM. Or in other
4655          * words: we now explicitly claim to be connected to a franken-vt220 with true color
4656          * support. */
4657         r = strv_env_replace_strdup(env, "COLORTERM=truecolor");
4658         if (r < 0)
4659                 return r;
4660
4661         return strv_env_replace_strdup(env, "TERM=" FALLBACK_TERM);
4662 }
4663
4664 int exec_invoke(
4665                 const ExecCommand *command,
4666                 const ExecContext *context,
4667                 ExecParameters *params,
4668                 ExecRuntime *runtime,
4669                 const CGroupContext *cgroup_context,
4670                 int *exit_status) {
4671
4672         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL;
4673         int r;
4674         const char *username = NULL, *groupname = NULL;
4675         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL, *own_user = NULL;
4676         const char *pwent_home = NULL, *shell = NULL;
4677         dev_t journal_stream_dev = 0;
4678         ino_t journal_stream_ino = 0;
4679         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4680                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4681                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4682                 have_cap_sys_admin,
4683                 userns_set_up = false,
4684                 keep_seccomp_privileges = false;
4685 #if HAVE_SELINUX
4686         _cleanup_free_ char *mac_selinux_context_net = NULL;
4687         bool use_selinux = false;
4688 #endif
4689 #if ENABLE_SMACK
4690         bool use_smack = false;
4691 #endif
4692 #if HAVE_APPARMOR
4693         bool use_apparmor = false;
4694 #endif
4695 #if HAVE_SECCOMP
4696         uint64_t saved_bset = 0;
4697 #endif
4698         uid_t saved_uid = getuid();
4699         gid_t saved_gid = getgid();
4700         uid_t uid = UID_INVALID;
4701         gid_t gid = GID_INVALID;
4702         size_t n_fds, /* fds to pass to the child */
4703                n_keep_fds; /* total number of fds not to close */
4704         int secure_bits;
4705         _cleanup_free_ gid_t *gids = NULL, *gids_after_pam = NULL;
4706         int ngids = 0, ngids_after_pam = 0;
4707         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4708         size_t n_storage_fds, n_socket_fds, n_extra_fds;
4709
4710         assert(command);
4711         assert(context);
4712         assert(params);
4713         assert(runtime);
4714         assert(cgroup_context);
4715         assert(exit_status);
4716
4717         LOG_CONTEXT_PUSH_EXEC(context, params);
4718
4719         /* Explicitly test for CVE-2021-4034 inspired invocations */
4720         if (!command->path || strv_isempty(command->argv)) {
4721                 *exit_status = EXIT_EXEC;
4722                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Invalid command line arguments.");
4723         }
4724
4725         if (context->std_input == EXEC_INPUT_SOCKET ||
4726             context->std_output == EXEC_OUTPUT_SOCKET ||
4727             context->std_error == EXEC_OUTPUT_SOCKET) {
4728
4729                 if (params->n_socket_fds > 1)
4730                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4731
4732                 if (params->n_socket_fds == 0)
4733                         return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4734
4735                 socket_fd = params->fds[0];
4736                 n_storage_fds = n_socket_fds = n_extra_fds = 0;
4737         } else {
4738                 n_socket_fds = params->n_socket_fds;
4739                 n_storage_fds = params->n_storage_fds;
4740                 n_extra_fds = params->n_extra_fds;
4741         }
4742         n_fds = n_socket_fds + n_storage_fds + n_extra_fds;
4743
4744         r = exec_context_named_iofds(context, params, named_iofds);
4745         if (r < 0)
4746                 return log_error_errno(r, "Failed to load a named file descriptor: %m");
4747
4748         rename_process_from_path(command->path);
4749
4750         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4751          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4752          * both of which will be demoted to SIG_DFL. */
4753         (void) default_signals(SIGNALS_CRASH_HANDLER,
4754                                SIGNALS_IGNORE);
4755
4756         if (context->ignore_sigpipe)
4757                 (void) ignore_signals(SIGPIPE);
4758
4759         r = reset_signal_mask();
4760         if (r < 0) {
4761                 *exit_status = EXIT_SIGNAL_MASK;
4762                 return log_error_errno(r, "Failed to set process signal mask: %m");
4763         }
4764
4765         if (params->idle_pipe)
4766                 do_idle_pipe_dance(params->idle_pipe);
4767
4768         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4769          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4770          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4771          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4772
4773         log_forget_fds();
4774         log_set_open_when_needed(true);
4775         log_settle_target();
4776
4777         /* In case anything used libc syslog(), close this here, too */
4778         closelog();
4779
4780         r = collect_open_file_fds(params, &n_fds);
4781         if (r < 0) {
4782                 *exit_status = EXIT_FDS;
4783                 return log_error_errno(r, "Failed to get OpenFile= file descriptors: %m");
4784         }
4785
4786         int keep_fds[n_fds + 4];
4787         memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4788         n_keep_fds = n_fds;
4789
4790         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4791         if (r < 0) {
4792                 *exit_status = EXIT_FDS;
4793                 return log_error_errno(r, "Failed to collect shifted fd: %m");
4794         }
4795
4796         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
4797         if (r < 0) {
4798                 *exit_status = EXIT_FDS;
4799                 return log_error_errno(r, "Failed to collect shifted fd: %m");
4800         }
4801
4802 #if HAVE_LIBBPF
4803         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4804         if (r < 0) {
4805                 *exit_status = EXIT_FDS;
4806                 return log_error_errno(r, "Failed to collect shifted fd: %m");
4807         }
4808 #endif
4809
4810         r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4811         if (r < 0) {
4812                 *exit_status = EXIT_FDS;
4813                 return log_error_errno(r, "Failed to close unwanted file descriptors: %m");
4814         }
4815
4816         if (!context->same_pgrp &&
4817             setsid() < 0) {
4818                 *exit_status = EXIT_SETSID;
4819                 return log_error_errno(errno, "Failed to create new process session: %m");
4820         }
4821
4822         /* Now, reset the TTY associated to this service "destructively" (i.e. possibly even hang up or
4823          * disallocate the VT), to get rid of any prior uses of the device. Note that we do not keep any fd
4824          * open here, hence some of the settings made here might vanish again, depending on the TTY driver
4825          * used. A 2nd ("constructive") initialization after we opened the input/output fds we actually want
4826          * will fix this. Note that we pass a NULL invocation ID here – as exec_context_tty_reset() expects
4827          * the invocation ID associated with the OSC 3008 context ID to close. But we don't want to close any
4828          * OSC 3008 context here, and opening a fresh OSC 3008 context happens a bit further down. */
4829         exec_context_tty_reset(context, params, /* invocation_id= */ SD_ID128_NULL);
4830
4831         if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4832                 _cleanup_free_ char *cmdline = NULL;
4833
4834                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4835                 if (!cmdline) {
4836                         *exit_status = EXIT_MEMORY;
4837                         return log_oom();
4838                 }
4839
4840                 r = ask_for_confirmation(context, params, cmdline);
4841                 if (r != CONFIRM_EXECUTE) {
4842                         if (r == CONFIRM_PRETEND_SUCCESS) {
4843                                 *exit_status = EXIT_SUCCESS;
4844                                 return 0;
4845                         }
4846
4847                         *exit_status = EXIT_CONFIRM;
4848                         return log_error_errno(SYNTHETIC_ERRNO(ECANCELED), "Execution cancelled by the user.");
4849                 }
4850         }
4851
4852         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4853          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4854          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4855          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4856          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4857         if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4858             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4859                 *exit_status = EXIT_MEMORY;
4860                 return log_error_errno(errno, "Failed to update environment: %m");
4861         }
4862
4863         if (context->dynamic_user && runtime->dynamic_creds) {
4864                 _cleanup_strv_free_ char **suggested_paths = NULL;
4865
4866                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4867                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4868                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4869                         *exit_status = EXIT_USER;
4870                         return log_error_errno(errno, "Failed to update environment: %m");
4871                 }
4872
4873                 r = compile_suggested_paths(context, params, &suggested_paths);
4874                 if (r < 0) {
4875                         *exit_status = EXIT_MEMORY;
4876                         return log_oom();
4877                 }
4878
4879                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4880                 if (r < 0) {
4881                         *exit_status = EXIT_USER;
4882                         if (r == -EILSEQ)
4883                                 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
4884                                                        "Failed to update dynamic user credentials: User or group with specified name already exists.");
4885                         return log_error_errno(r, "Failed to update dynamic user credentials: %m");
4886                 }
4887
4888                 if (!uid_is_valid(uid)) {
4889                         *exit_status = EXIT_USER;
4890                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
4891                 }
4892
4893                 if (!gid_is_valid(gid)) {
4894                         *exit_status = EXIT_USER;
4895                         return log_error_errno(SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
4896                 }
4897
4898                 if (runtime->dynamic_creds->user)
4899                         username = runtime->dynamic_creds->user->name;
4900
4901         } else {
4902                 const char *u;
4903
4904                 if (context->user)
4905                         u = context->user;
4906                 else if (context->pam_name || FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
4907                         /* If PAM is enabled but no user name is explicitly selected, then use our own one. */
4908                         own_user = getusername_malloc();
4909                         if (!own_user) {
4910                                 *exit_status = EXIT_USER;
4911                                 return log_error_errno(r, "Failed to determine my own user ID: %m");
4912                         }
4913                         u = own_user;
4914                 } else
4915                         u = NULL;
4916
4917                 if (u) {
4918                         /* We can't use nss unconditionally for root without risking deadlocks if some IPC services
4919                          * will be started by pid1 and are ordered after us. But if SetLoginEnvironment= is
4920                          * enabled *explicitly* (i.e. no exec_context_get_set_login_environment() here),
4921                          * or PAM shall be invoked, let's consult NSS even for root, so that the user
4922                          * gets accurate $SHELL in session(-like) contexts. */
4923                         r = get_fixed_user(u,
4924                                            /* prefer_nss = */ context->set_login_environment > 0 || context->pam_name,
4925                                            &username, &uid, &gid, &pwent_home, &shell);
4926                         if (r < 0) {
4927                                 *exit_status = EXIT_USER;
4928                                 return log_error_errno(r, "Failed to determine user credentials: %m");
4929                         }
4930                 }
4931
4932                 if (context->group) {
4933                         r = get_fixed_group(context->group, &groupname, &gid);
4934                         if (r < 0) {
4935                                 *exit_status = EXIT_GROUP;
4936                                 return log_error_errno(r, "Failed to determine group credentials: %m");
4937                         }
4938                 }
4939         }
4940
4941         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4942         ngids = get_supplementary_groups(context, username, gid, &gids);
4943         if (ngids < 0) {
4944                 *exit_status = EXIT_GROUP;
4945                 return log_error_errno(ngids, "Failed to determine supplementary groups: %m");
4946         }
4947
4948         r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4949         if (r < 0) {
4950                 *exit_status = EXIT_USER;
4951                 return log_error_errno(r, "Failed to send user credentials to PID1: %m");
4952         }
4953
4954         params->user_lookup_fd = safe_close(params->user_lookup_fd);
4955
4956         r = acquire_home(context, &pwent_home, &home_buffer);
4957         if (r < 0) {
4958                 *exit_status = EXIT_CHDIR;
4959                 return log_error_errno(r, "Failed to determine $HOME for the invoking user: %m");
4960         }
4961
4962         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4963         if (socket_fd >= 0)
4964                 (void) fd_nonblock(socket_fd, false);
4965
4966         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4967          * from it. */
4968         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4969
4970         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4971          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4972         if (params->cgroup_path) {
4973                 _cleanup_free_ char *subcgroup = NULL;
4974
4975                 r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &subcgroup);
4976                 if (r < 0) {
4977                         *exit_status = EXIT_CGROUP;
4978                         return log_error_errno(r, "Failed to acquire cgroup path: %m");
4979                 }
4980                 if (r > 0) {
4981                         /* If there is a subcgroup required, let's make sure to create it now. */
4982                         r = cg_create(subcgroup);
4983                         if (r < 0)
4984                                 return log_error_errno(r, "Failed to create subcgroup '%s': %m", subcgroup);
4985                 }
4986
4987                 /* If we need a cgroup namespace, we cannot yet move the service to its configured subgroup,
4988                  * as unsharing the cgroup namespace later on makes the current cgroup the root of the
4989                  * namespace and we want the root of the namespace to be the main service cgroup and not the
4990                  * subgroup. One edge case is if we're a control process that needs to be spawned in a
4991                  * subgroup, in this case, we have no choice as moving into the main service cgroup might
4992                  * violate the no inner processes rule of cgroupv2. */
4993                 const char *cgtarget = needs_sandboxing && exec_needs_cgroup_namespace(context) &&
4994                                                            !exec_params_needs_control_subcgroup(params)
4995                                                            ? params->cgroup_path : subcgroup;
4996
4997                 r = cg_attach(cgtarget, 0);
4998                 if (r == -EUCLEAN) {
4999                         *exit_status = EXIT_CGROUP;
5000                         return log_error_errno(r,
5001                                                "Failed to attach process to cgroup '%s', "
5002                                                "because the cgroup or one of its parents or "
5003                                                "siblings is in the threaded mode.", cgtarget);
5004                 }
5005                 if (r < 0) {
5006                         *exit_status = EXIT_CGROUP;
5007                         return log_error_errno(r, "Failed to attach to cgroup %s: %m", cgtarget);
5008                 }
5009         }
5010
5011         if (context->network_namespace_path && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5012                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5013                 if (r < 0) {
5014                         *exit_status = EXIT_NETWORK;
5015                         return log_error_errno(r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5016                 }
5017         }
5018
5019         if (context->ipc_namespace_path && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5020                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5021                 if (r < 0) {
5022                         *exit_status = EXIT_NAMESPACE;
5023                         return log_error_errno(r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5024                 }
5025         }
5026
5027         r = setup_input(context, params, socket_fd, named_iofds);
5028         if (r < 0) {
5029                 *exit_status = EXIT_STDIN;
5030                 return log_error_errno(r, "Failed to set up standard input: %m");
5031         }
5032
5033         _cleanup_free_ char *fname = NULL;
5034         r = path_extract_filename(command->path, &fname);
5035         if (r < 0) {
5036                 *exit_status = EXIT_STDOUT;
5037                 return log_error_errno(r, "Failed to extract filename from path %s: %m", command->path);
5038         }
5039
5040         r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
5041         if (r < 0) {
5042                 *exit_status = EXIT_STDOUT;
5043                 return log_error_errno(r, "Failed to set up standard output: %m");
5044         }
5045
5046         r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
5047         if (r < 0) {
5048                 *exit_status = EXIT_STDERR;
5049                 return log_error_errno(r, "Failed to set up standard error output: %m");
5050         }
5051
5052         /* Now that stdin/stdout are definitely opened, properly initialize it with our desired
5053          * settings. Note: this is a "constructive" reset, it prepares things for us to use. This is
5054          * different from the "destructive" TTY reset further up. Also note: we apply this on stdin/stdout in
5055          * case this is a tty, regardless if we opened it ourselves or got it passed in pre-opened. */
5056         prepare_terminal(context, params);
5057
5058         if (context->oom_score_adjust_set) {
5059                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
5060                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
5061                 r = set_oom_score_adjust(context->oom_score_adjust);
5062                 if (ERRNO_IS_NEG_PRIVILEGE(r))
5063                         log_debug_errno(r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5064                 else if (r < 0) {
5065                         *exit_status = EXIT_OOM_ADJUST;
5066                         return log_error_errno(r, "Failed to adjust OOM setting: %m");
5067                 }
5068         }
5069
5070         if (context->coredump_filter_set) {
5071                 r = set_coredump_filter(context->coredump_filter);
5072                 if (ERRNO_IS_NEG_PRIVILEGE(r))
5073                         log_debug_errno(r, "Failed to adjust coredump_filter, ignoring: %m");
5074                 else if (r < 0) {
5075                         *exit_status = EXIT_LIMITS;
5076                         return log_error_errno(r, "Failed to adjust coredump_filter: %m");
5077                 }
5078         }
5079
5080         if (context->cpu_sched_set) {
5081                 struct sched_attr attr = {
5082                         .size = sizeof(attr),
5083                         .sched_policy = context->cpu_sched_policy,
5084                         .sched_priority = context->cpu_sched_priority,
5085                         .sched_flags = context->cpu_sched_reset_on_fork ? SCHED_FLAG_RESET_ON_FORK : 0,
5086                 };
5087
5088                 r = sched_setattr(/* pid= */ 0, &attr, /* flags= */ 0);
5089                 if (r < 0) {
5090                         *exit_status = EXIT_SETSCHEDULER;
5091                         return log_error_errno(errno, "Failed to set up CPU scheduling: %m");
5092                 }
5093         }
5094
5095         /*
5096          * Set nice value _after_ the call to sched_setattr() because struct sched_attr includes sched_nice
5097          * which we do not set, thus it will clobber any previously set nice value. Scheduling policy might
5098          * be reasonably set together with nice value e.g. in case of SCHED_BATCH (see sched(7)).
5099          * It would be ideal to set both with the same call, but we cannot easily do so because of all the
5100          * extra logic in setpriority_closest().
5101          */
5102         if (context->nice_set) {
5103                 r = setpriority_closest(context->nice);
5104                 if (r < 0) {
5105                         *exit_status = EXIT_NICE;
5106                         return log_error_errno(r, "Failed to set up process scheduling priority (nice level): %m");
5107                 }
5108         }
5109
5110         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5111                 _cleanup_(cpu_set_done) CPUSet converted_cpu_set = {};
5112                 const CPUSet *cpu_set;
5113
5114                 if (context->cpu_affinity_from_numa) {
5115                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5116                         if (r < 0) {
5117                                 *exit_status = EXIT_CPUAFFINITY;
5118                                 return log_error_errno(r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5119                         }
5120
5121                         cpu_set = &converted_cpu_set;
5122                 } else
5123                         cpu_set = &context->cpu_set;
5124
5125                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5126                         *exit_status = EXIT_CPUAFFINITY;
5127                         return log_error_errno(errno, "Failed to set up CPU affinity: %m");
5128                 }
5129         }
5130
5131         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5132                 r = apply_numa_policy(&context->numa_policy);
5133                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
5134                         log_debug_errno(r, "NUMA support not available, ignoring.");
5135                 else if (r < 0) {
5136                         *exit_status = EXIT_NUMA_POLICY;
5137                         return log_error_errno(r, "Failed to set NUMA memory policy: %m");
5138                 }
5139         }
5140
5141         if (context->ioprio_set)
5142                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5143                         *exit_status = EXIT_IOPRIO;
5144                         return log_error_errno(errno, "Failed to set up IO scheduling priority: %m");
5145                 }
5146
5147         if (context->timer_slack_nsec != NSEC_INFINITY)
5148                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5149                         *exit_status = EXIT_TIMERSLACK;
5150                         return log_error_errno(errno, "Failed to set up timer slack: %m");
5151                 }
5152
5153         if (context->personality != PERSONALITY_INVALID) {
5154                 r = safe_personality(context->personality);
5155                 if (r < 0) {
5156                         *exit_status = EXIT_PERSONALITY;
5157                         return log_error_errno(r, "Failed to set up execution domain (personality): %m");
5158                 }
5159         }
5160
5161         if (context->memory_ksm >= 0)
5162                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
5163                         if (ERRNO_IS_NOT_SUPPORTED(errno))
5164                                 log_debug_errno(errno, "KSM support not available, ignoring.");
5165                         else {
5166                                 *exit_status = EXIT_KSM;
5167                                 return log_error_errno(errno, "Failed to set KSM: %m");
5168                         }
5169                 }
5170
5171 #if ENABLE_UTMP
5172         if (context->utmp_id) {
5173                 _cleanup_free_ char *username_alloc = NULL;
5174
5175                 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
5176                         username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
5177                         if (!username_alloc) {
5178                                 *exit_status = EXIT_USER;
5179                                 return log_oom();
5180                         }
5181                 }
5182
5183                 const char *line = context->tty_path ?
5184                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5185                         NULL;
5186                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5187                                       line,
5188                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
5189                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5190                                       USER_PROCESS,
5191                                       username ?: username_alloc);
5192         }
5193 #endif
5194
5195         if (uid_is_valid(uid)) {
5196                 r = chown_terminal(STDIN_FILENO, uid);
5197                 if (r < 0) {
5198                         *exit_status = EXIT_STDIN;
5199                         return log_error_errno(r, "Failed to change ownership of terminal: %m");
5200                 }
5201         }
5202
5203         if (params->cgroup_path) {
5204                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5205                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5206                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5207                  * touch a single hierarchy too. */
5208
5209                 if (params->flags & EXEC_CGROUP_DELEGATE) {
5210                         _cleanup_free_ char *p = NULL;
5211
5212                         r = cg_set_access(params->cgroup_path, uid, gid);
5213                         if (r < 0) {
5214                                 *exit_status = EXIT_CGROUP;
5215                                 return log_error_errno(r, "Failed to adjust control group access: %m");
5216                         }
5217
5218                         r = exec_params_get_cgroup_path(params, cgroup_context, params->cgroup_path, &p);
5219                         if (r < 0) {
5220                                 *exit_status = EXIT_CGROUP;
5221                                 return log_error_errno(r, "Failed to acquire cgroup path: %m");
5222                         }
5223                         if (r > 0) {
5224                                 r = cg_set_access_recursive(p, uid, gid);
5225                                 if (r < 0) {
5226                                         *exit_status = EXIT_CGROUP;
5227                                         return log_error_errno(r, "Failed to adjust control subgroup access: %m");
5228                                 }
5229                         }
5230                 }
5231
5232                 if (is_pressure_supported() > 0) {
5233                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
5234                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5235                                 if (r < 0) {
5236                                         *exit_status = EXIT_MEMORY;
5237                                         return log_oom();
5238                                 }
5239
5240                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5241                                 if (r < 0) {
5242                                         log_full_errno(r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5243                                                        "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5244                                         memory_pressure_path = mfree(memory_pressure_path);
5245                                 }
5246                                 /* First we use the current cgroup path to chmod and chown the memory pressure path, then pass the path relative
5247                                  * to the cgroup namespace to environment variables and mounts. If chown/chmod fails, we should not pass memory
5248                                  * pressure path environment variable or read-write mount to the unit. This is why we check if
5249                                  * memory_pressure_path != NULL in the conditional below. */
5250                                 if (memory_pressure_path && needs_sandboxing && exec_needs_cgroup_namespace(context)) {
5251                                         memory_pressure_path = mfree(memory_pressure_path);
5252                                         r = cg_get_path("memory", "", "memory.pressure", &memory_pressure_path);
5253                                         if (r < 0) {
5254                                                 *exit_status = EXIT_MEMORY;
5255                                                 return log_oom();
5256                                         }
5257                                 }
5258                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_NO) {
5259                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5260                                 if (!memory_pressure_path) {
5261                                         *exit_status = EXIT_MEMORY;
5262                                         return log_oom();
5263                                 }
5264                         }
5265                 }
5266         }
5267
5268         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5269
5270         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5271                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5272                 if (r < 0)
5273                         return log_error_errno(r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5274         }
5275
5276         r = exec_setup_credentials(context, cgroup_context, params, params->unit_id, uid, gid);
5277         if (r < 0) {
5278                 *exit_status = EXIT_CREDENTIALS;
5279                 return log_error_errno(r, "Failed to set up credentials: %m");
5280         }
5281
5282         r = build_environment(
5283                         context,
5284                         params,
5285                         cgroup_context,
5286                         n_fds,
5287                         pwent_home,
5288                         username,
5289                         shell,
5290                         journal_stream_dev,
5291                         journal_stream_ino,
5292                         memory_pressure_path,
5293                         needs_sandboxing,
5294                         &our_env);
5295         if (r < 0) {
5296                 *exit_status = EXIT_MEMORY;
5297                 return log_oom();
5298         }
5299
5300         r = build_pass_environment(context, &pass_env);
5301         if (r < 0) {
5302                 *exit_status = EXIT_MEMORY;
5303                 return log_oom();
5304         }
5305
5306         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5307          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5308          * not specify PATH but the unit has ExecSearchPath. */
5309         if (!strv_isempty(context->exec_search_path)) {
5310                 _cleanup_free_ char *joined = NULL;
5311
5312                 joined = strv_join(context->exec_search_path, ":");
5313                 if (!joined) {
5314                         *exit_status = EXIT_MEMORY;
5315                         return log_oom();
5316                 }
5317
5318                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5319                 if (r < 0) {
5320                         *exit_status = EXIT_MEMORY;
5321                         return log_oom();
5322                 }
5323         }
5324
5325         accum_env = strv_env_merge(params->environment,
5326                                    our_env,
5327                                    joined_exec_search_path,
5328                                    pass_env,
5329                                    context->environment,
5330                                    params->files_env);
5331         if (!accum_env) {
5332                 *exit_status = EXIT_MEMORY;
5333                 return log_oom();
5334         }
5335         strv_env_clean(accum_env);
5336
5337         (void) umask(context->umask);
5338
5339         r = setup_term_environment(context, &accum_env);
5340         if (r < 0) {
5341                 *exit_status = EXIT_MEMORY;
5342                 return log_error_errno(r, "Failed to construct $TERM: %m");
5343         }
5344
5345         r = setup_keyring(context, params, uid, gid);
5346         if (r < 0) {
5347                 *exit_status = EXIT_KEYRING;
5348                 return log_error_errno(r, "Failed to set up kernel keyring: %m");
5349         }
5350
5351         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5352          * excepted from either whole sandboxing or just setresuid() itself. */
5353         needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5354
5355         uint64_t capability_ambient_set = context->capability_ambient_set;
5356
5357         /* Check CAP_SYS_ADMIN before we enter user namespace to see if we can mount /proc even though it's masked. */
5358         have_cap_sys_admin = have_effective_cap(CAP_SYS_ADMIN) > 0;
5359
5360         if (needs_sandboxing) {
5361                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5362                  * /sys being present. The actual MAC context application will happen later, as late as
5363                  * possible, to avoid impacting our own code paths. */
5364
5365 #if HAVE_SELINUX
5366                 use_selinux = mac_selinux_use();
5367 #endif
5368 #if ENABLE_SMACK
5369                 use_smack = mac_smack_use();
5370 #endif
5371 #if HAVE_APPARMOR
5372                 if (mac_apparmor_use()) {
5373                         r = dlopen_libapparmor();
5374                         if (r < 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r))
5375                                 log_warning_errno(r, "Failed to load libapparmor, ignoring: %m");
5376                         use_apparmor = r >= 0;
5377                 }
5378 #endif
5379         }
5380
5381         if (needs_sandboxing) {
5382                 int which_failed;
5383
5384                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5385                  * is set here. (See below.) */
5386
5387                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5388                 if (r < 0) {
5389                         *exit_status = EXIT_LIMITS;
5390                         return log_error_errno(r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5391                 }
5392         }
5393
5394         if (needs_setuid && context->pam_name && username) {
5395                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5396                  * wins here. (See above.) */
5397
5398                 /* All fds passed in the fds array will be closed in the pam child process. */
5399                 r = setup_pam(context, cgroup_context, params, username, uid, gid, &accum_env,
5400                               params->fds, n_fds, needs_sandboxing, params->exec_fd);
5401                 if (r < 0) {
5402                         *exit_status = EXIT_PAM;
5403                         return log_error_errno(r, "Failed to set up PAM session: %m");
5404                 }
5405
5406                 /* PAM modules might have set some ambient caps. Query them here and merge them into
5407                  * the caps we want to set in the end, so that we don't end up unsetting them. */
5408                 uint64_t ambient_after_pam;
5409                 r = capability_get_ambient(&ambient_after_pam);
5410                 if (r < 0) {
5411                         *exit_status = EXIT_CAPABILITIES;
5412                         return log_error_errno(r, "Failed to query ambient caps: %m");
5413                 }
5414
5415                 capability_ambient_set |= ambient_after_pam;
5416
5417                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5418                 if (ngids_after_pam < 0) {
5419                         *exit_status = EXIT_GROUP;
5420                         return log_error_errno(ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5421                 }
5422         }
5423
5424         if (needs_sandboxing && !have_cap_sys_admin && exec_needs_cap_sys_admin(context, params)) {
5425                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5426                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5427                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5428                 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
5429                 if (pu == PRIVATE_USERS_NO)
5430                         pu = PRIVATE_USERS_SELF;
5431
5432                 /* The kernel requires /proc/pid/setgroups be set to "deny" prior to writing /proc/pid/gid_map in
5433                  * unprivileged user namespaces. */
5434                 r = setup_private_users(pu, saved_uid, saved_gid, uid, gid, /* allow_setgroups= */ false);
5435                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5436                  * the actual requested operations fail (or silently continue). */
5437                 if (r < 0 && context->private_users != PRIVATE_USERS_NO) {
5438                         *exit_status = EXIT_USER;
5439                         return log_error_errno(r, "Failed to set up user namespacing for unprivileged user: %m");
5440                 }
5441                 if (r < 0)
5442                         log_info_errno(r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5443                 else {
5444                         assert(r > 0);
5445                         userns_set_up = true;
5446                         log_debug("Set up unprivileged user namespace");
5447                 }
5448         }
5449
5450         /* Call setup_delegated_namespaces() the first time to unshare all non-delegated namespaces. */
5451         r = setup_delegated_namespaces(
5452                         context,
5453                         params,
5454                         runtime,
5455                         /* delegate= */ false,
5456                         memory_pressure_path,
5457                         uid,
5458                         gid,
5459                         command,
5460                         needs_sandboxing,
5461                         have_cap_sys_admin,
5462                         exit_status);
5463         if (r < 0)
5464                 return r;
5465
5466         /* Drop groups as early as possible.
5467          * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
5468          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5469         if (needs_setuid) {
5470                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5471                 int ngids_to_enforce;
5472
5473                 ngids_to_enforce = merge_gid_lists(gids,
5474                                                    ngids,
5475                                                    gids_after_pam,
5476                                                    ngids_after_pam,
5477                                                    &gids_to_enforce);
5478                 if (ngids_to_enforce < 0) {
5479                         *exit_status = EXIT_GROUP;
5480                         return log_error_errno(ngids_to_enforce, "Failed to merge group lists. Group membership might be incorrect: %m");
5481                 }
5482
5483                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5484                 if (r < 0) {
5485                         *exit_status = EXIT_GROUP;
5486                         return log_error_errno(r, "Changing group credentials failed: %m");
5487                 }
5488         }
5489
5490         /* If the user namespace was not set up above, try to do it now.
5491          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5492          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5493          * case of mount namespaces being less privileged when the mount point list is copied from a
5494          * different user namespace). */
5495
5496         if (needs_sandboxing && !userns_set_up) {
5497                 PrivateUsers pu = exec_context_get_effective_private_users(context, params);
5498
5499                 r = setup_private_users(pu, saved_uid, saved_gid, uid, gid,
5500                                         /* allow_setgroups= */ pu == PRIVATE_USERS_FULL);
5501                 if (r < 0) {
5502                         *exit_status = EXIT_USER;
5503                         return log_error_errno(r, "Failed to set up user namespacing: %m");
5504                 }
5505                 if (r > 0)
5506                         log_debug("Set up privileged user namespace");
5507         }
5508
5509         /* Call setup_delegated_namespaces() the second time to unshare all delegated namespaces. */
5510         r = setup_delegated_namespaces(
5511                         context,
5512                         params,
5513                         runtime,
5514                         /* delegate= */ true,
5515                         memory_pressure_path,
5516                         uid,
5517                         gid,
5518                         command,
5519                         needs_sandboxing,
5520                         have_cap_sys_admin,
5521                         exit_status);
5522         if (r < 0)
5523                 return r;
5524
5525         if (needs_sandboxing && exec_needs_cgroup_namespace(context) && params->cgroup_path) {
5526                 /* Move ourselves into the subcgroup now *after* we've unshared the cgroup namespace, which
5527                  * ensures the root of the cgroup namespace is the top level service cgroup and not the
5528                  * subcgroup. Adjust the prefix accordingly since we're in a cgroup namespace now. */
5529                 r = attach_to_subcgroup(context, cgroup_context, params, /* prefix= */ NULL);
5530                 if (r < 0) {
5531                         *exit_status = EXIT_CGROUP;
5532                         return r;
5533                 }
5534         }
5535
5536         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5537          * shall execute. */
5538
5539         const char *path = command->path;
5540
5541         if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
5542                 if (shell_is_placeholder(shell)) {
5543                         log_debug("Shell prefixing requested for user without default shell, using /bin/sh: %s",
5544                                   strna(username));
5545                         assert(streq(path, _PATH_BSHELL));
5546                 } else
5547                         path = shell;
5548         }
5549
5550         _cleanup_free_ char *executable = NULL;
5551         _cleanup_close_ int executable_fd = -EBADF;
5552         r = find_executable_full(path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5553         if (r < 0) {
5554                 *exit_status = EXIT_EXEC;
5555                 log_struct_errno(LOG_NOTICE, r,
5556                                  LOG_MESSAGE_ID(SD_MESSAGE_SPAWN_FAILED_STR),
5557                                  LOG_EXEC_MESSAGE(params, "Unable to locate executable '%s': %m", path),
5558                                  LOG_ITEM("EXECUTABLE=%s", path));
5559                 /* If the error will be ignored by manager, tune down the log level here. Missing executable
5560                  * is very much expected in this case. */
5561                 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
5562         }
5563
5564         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
5565         if (r < 0) {
5566                 *exit_status = EXIT_FDS;
5567                 return log_error_errno(r, "Failed to collect shifted fd: %m");
5568         }
5569
5570 #if HAVE_SELINUX
5571         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5572                 int fd = -EBADF;
5573
5574                 if (socket_fd >= 0)
5575                         fd = socket_fd;
5576                 else if (params->n_socket_fds == 1)
5577                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5578                          * use context from that fd to compute the label. */
5579                         fd = params->fds[0];
5580
5581                 if (fd >= 0) {
5582                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5583                         if (r < 0) {
5584                                 if (!context->selinux_context_ignore) {
5585                                         *exit_status = EXIT_SELINUX_CONTEXT;
5586                                         return log_error_errno(r, "Failed to determine SELinux context: %m");
5587                                 }
5588                                 log_debug_errno(r, "Failed to determine SELinux context, ignoring: %m");
5589                         }
5590                 }
5591         }
5592 #endif
5593
5594         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5595          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5596          * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
5597          * them open until the final execve(). But first, close the remaining sockets in the context
5598          * objects. */
5599
5600         exec_runtime_close(runtime);
5601         exec_params_close(params);
5602
5603         r = close_all_fds(keep_fds, n_keep_fds);
5604         if (r >= 0)
5605                 r = pack_fds(params->fds, n_fds);
5606         if (r >= 0)
5607                 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
5608         if (r < 0) {
5609                 *exit_status = EXIT_FDS;
5610                 return log_error_errno(r, "Failed to adjust passed file descriptors: %m");
5611         }
5612
5613         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5614          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5615          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5616          * came this far. */
5617
5618         secure_bits = context->secure_bits;
5619
5620         if (needs_sandboxing) {
5621                 uint64_t bset;
5622
5623                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5624                  * (Note this is placed after the general resource limit initialization, see above, in order
5625                  * to take precedence.) */
5626                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5627                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5628                                 *exit_status = EXIT_LIMITS;
5629                                 return log_error_errno(errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5630                         }
5631                 }
5632
5633 #if ENABLE_SMACK
5634                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5635                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5636                 if (use_smack) {
5637                         r = setup_smack(context, params, executable_fd);
5638                         if (r < 0 && !context->smack_process_label_ignore) {
5639                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5640                                 return log_error_errno(r, "Failed to set SMACK process label: %m");
5641                         }
5642                 }
5643 #endif
5644
5645                 bset = context->capability_bounding_set;
5646
5647 #if HAVE_SECCOMP
5648                 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
5649                  * keep the needed privileges to apply it even if we're not root. */
5650                 if (needs_setuid &&
5651                     uid_is_valid(uid) &&
5652                     context_has_seccomp(context) &&
5653                     seccomp_allows_drop_privileges(context)) {
5654                         keep_seccomp_privileges = true;
5655
5656                         if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
5657                                 *exit_status = EXIT_USER;
5658                                 return log_error_errno(errno, "Failed to enable keep capabilities flag: %m");
5659                         }
5660
5661                         /* Save the current bounding set so we can restore it after applying the seccomp
5662                          * filter */
5663                         saved_bset = bset;
5664                         bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
5665                                 (UINT64_C(1) << CAP_SETPCAP);
5666                 }
5667 #endif
5668
5669                 if (!cap_test_all(bset)) {
5670                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
5671                         if (r < 0) {
5672                                 *exit_status = EXIT_CAPABILITIES;
5673                                 return log_error_errno(r, "Failed to drop capabilities: %m");
5674                         }
5675                 }
5676
5677                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5678                  * keep-caps set.
5679                  *
5680                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
5681                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
5682                  * the ambient capabilities can be raised as they are present in the permitted and
5683                  * inheritable set. However it is possible that someone wants to set ambient capabilities
5684                  * without changing the user, so we also set the ambient capabilities here.
5685                  *
5686                  * The requested ambient capabilities are raised in the inheritable set if the second
5687                  * argument is true. */
5688                 if (capability_ambient_set != 0) {
5689                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5690                         if (r < 0) {
5691                                 *exit_status = EXIT_CAPABILITIES;
5692                                 return log_error_errno(r, "Failed to apply ambient capabilities (before UID change): %m");
5693                         }
5694                 }
5695         }
5696
5697         /* chroot to root directory first, before we lose the ability to chroot */
5698         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5699         if (r < 0)
5700                 return log_error_errno(r, "Chrooting to the requested root directory failed: %m");
5701
5702         if (needs_setuid) {
5703                 if (uid_is_valid(uid)) {
5704                         r = enforce_user(context, uid, capability_ambient_set);
5705                         if (r < 0) {
5706                                 *exit_status = EXIT_USER;
5707                                 return log_error_errno(r, "Failed to change UID to " UID_FMT ": %m", uid);
5708                         }
5709
5710                         if (keep_seccomp_privileges) {
5711                                 if (!BIT_SET(capability_ambient_set, CAP_SETUID)) {
5712                                         r = drop_capability(CAP_SETUID);
5713                                         if (r < 0) {
5714                                                 *exit_status = EXIT_USER;
5715                                                 return log_error_errno(r, "Failed to drop CAP_SETUID: %m");
5716                                         }
5717                                 }
5718
5719                                 r = keep_capability(CAP_SYS_ADMIN);
5720                                 if (r < 0) {
5721                                         *exit_status = EXIT_USER;
5722                                         return log_error_errno(r, "Failed to keep CAP_SYS_ADMIN: %m");
5723                                 }
5724
5725                                 r = keep_capability(CAP_SETPCAP);
5726                                 if (r < 0) {
5727                                         *exit_status = EXIT_USER;
5728                                         return log_error_errno(r, "Failed to keep CAP_SETPCAP: %m");
5729                                 }
5730                         }
5731
5732                         if (capability_ambient_set != 0) {
5733
5734                                 /* Raise the ambient capabilities after user change. */
5735                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5736                                 if (r < 0) {
5737                                         *exit_status = EXIT_CAPABILITIES;
5738                                         return log_error_errno(r, "Failed to apply ambient capabilities (after UID change): %m");
5739                                 }
5740                         }
5741                 }
5742         }
5743
5744         /* Apply working directory here, because the working directory might be on NFS and only the user
5745          * running this service might have the correct privilege to change to the working directory. Also, it
5746          * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5747          * the cwd cannot be used to pin directories outside of the sandbox. */
5748         r = apply_working_directory(context, params, runtime, pwent_home, accum_env);
5749         if (r < 0) {
5750                 *exit_status = EXIT_CHDIR;
5751                 return log_error_errno(r, "Changing to the requested working directory failed: %m");
5752         }
5753
5754         if (needs_sandboxing) {
5755                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5756                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5757                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5758                  * are restricted. */
5759
5760 #if HAVE_SELINUX
5761                 if (use_selinux) {
5762                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5763
5764                         if (exec_context) {
5765                                 r = setexeccon(exec_context);
5766                                 if (r < 0) {
5767                                         if (!context->selinux_context_ignore) {
5768                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5769                                                 return log_error_errno(r, "Failed to change SELinux context to %s: %m", exec_context);
5770                                         }
5771                                         log_debug_errno(r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5772                                 }
5773                         }
5774                 }
5775 #endif
5776
5777 #if HAVE_APPARMOR
5778                 if (use_apparmor && context->apparmor_profile) {
5779                         r = ASSERT_PTR(sym_aa_change_onexec)(context->apparmor_profile);
5780                         if (r < 0 && !context->apparmor_profile_ignore) {
5781                                 *exit_status = EXIT_APPARMOR_PROFILE;
5782                                 return log_error_errno(errno, "Failed to prepare AppArmor profile change to %s: %m",
5783                                                        context->apparmor_profile);
5784                         }
5785                 }
5786 #endif
5787
5788                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5789                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5790                  * requires CAP_SETPCAP. */
5791                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5792                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5793                          * effective set here.
5794                          *
5795                          * The effective set is overwritten during execve() with the following values:
5796                          *
5797                          * - ambient set (for non-root processes)
5798                          *
5799                          * - (inheritable | bounding) set (for root processes)
5800                          *
5801                          * Hence there is no security impact to raise it in the effective set before execve
5802                          */
5803                         r = capability_gain_cap_setpcap(/* ret_before_caps = */ NULL);
5804                         if (r < 0) {
5805                                 *exit_status = EXIT_CAPABILITIES;
5806                                 return log_error_errno(r, "Failed to gain CAP_SETPCAP for setting secure bits");
5807                         }
5808                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5809                                 *exit_status = EXIT_SECUREBITS;
5810                                 return log_error_errno(errno, "Failed to set process secure bits: %m");
5811                         }
5812                 }
5813
5814                 if (context_has_no_new_privileges(context))
5815                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5816                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5817                                 return log_error_errno(errno, "Failed to disable new privileges: %m");
5818                         }
5819
5820 #if HAVE_SECCOMP
5821                 r = apply_address_families(context, params);
5822                 if (r < 0) {
5823                         *exit_status = EXIT_ADDRESS_FAMILIES;
5824                         return log_error_errno(r, "Failed to restrict address families: %m");
5825                 }
5826
5827                 r = apply_memory_deny_write_execute(context, params);
5828                 if (r < 0) {
5829                         *exit_status = EXIT_SECCOMP;
5830                         return log_error_errno(r, "Failed to disable writing to executable memory: %m");
5831                 }
5832
5833                 r = apply_restrict_realtime(context, params);
5834                 if (r < 0) {
5835                         *exit_status = EXIT_SECCOMP;
5836                         return log_error_errno(r, "Failed to apply realtime restrictions: %m");
5837                 }
5838
5839                 r = apply_restrict_suid_sgid(context, params);
5840                 if (r < 0) {
5841                         *exit_status = EXIT_SECCOMP;
5842                         return log_error_errno(r, "Failed to apply SUID/SGID restrictions: %m");
5843                 }
5844
5845                 r = apply_restrict_namespaces(context, params);
5846                 if (r < 0) {
5847                         *exit_status = EXIT_SECCOMP;
5848                         return log_error_errno(r, "Failed to apply namespace restrictions: %m");
5849                 }
5850
5851                 r = apply_protect_sysctl(context, params);
5852                 if (r < 0) {
5853                         *exit_status = EXIT_SECCOMP;
5854                         return log_error_errno(r, "Failed to apply sysctl restrictions: %m");
5855                 }
5856
5857                 r = apply_protect_kernel_modules(context, params);
5858                 if (r < 0) {
5859                         *exit_status = EXIT_SECCOMP;
5860                         return log_error_errno(r, "Failed to apply module loading restrictions: %m");
5861                 }
5862
5863                 r = apply_protect_kernel_logs(context, params);
5864                 if (r < 0) {
5865                         *exit_status = EXIT_SECCOMP;
5866                         return log_error_errno(r, "Failed to apply kernel log restrictions: %m");
5867                 }
5868
5869                 r = apply_protect_clock(context, params);
5870                 if (r < 0) {
5871                         *exit_status = EXIT_SECCOMP;
5872                         return log_error_errno(r, "Failed to apply clock restrictions: %m");
5873                 }
5874
5875                 r = apply_private_devices(context, params);
5876                 if (r < 0) {
5877                         *exit_status = EXIT_SECCOMP;
5878                         return log_error_errno(r, "Failed to set up private devices: %m");
5879                 }
5880
5881                 r = apply_syscall_archs(context, params);
5882                 if (r < 0) {
5883                         *exit_status = EXIT_SECCOMP;
5884                         return log_error_errno(r, "Failed to apply syscall architecture restrictions: %m");
5885                 }
5886
5887                 r = apply_lock_personality(context, params);
5888                 if (r < 0) {
5889                         *exit_status = EXIT_SECCOMP;
5890                         return log_error_errno(r, "Failed to lock personalities: %m");
5891                 }
5892
5893                 r = apply_syscall_log(context, params);
5894                 if (r < 0) {
5895                         *exit_status = EXIT_SECCOMP;
5896                         return log_error_errno(r, "Failed to apply system call log filters: %m");
5897                 }
5898 #endif
5899
5900 #if HAVE_LIBBPF
5901                 r = apply_restrict_filesystems(context, params);
5902                 if (r < 0) {
5903                         *exit_status = EXIT_BPF;
5904                         return log_error_errno(r, "Failed to restrict filesystems: %m");
5905                 }
5906 #endif
5907
5908 #if HAVE_SECCOMP
5909                 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5910                  * by the filter as little as possible. */
5911                 r = apply_syscall_filter(context, params);
5912                 if (r < 0) {
5913                         *exit_status = EXIT_SECCOMP;
5914                         return log_error_errno(r, "Failed to apply system call filters: %m");
5915                 }
5916
5917                 if (keep_seccomp_privileges) {
5918                         /* Restore the capability bounding set with what's expected from the service + the
5919                          * ambient capabilities hack */
5920                         if (!cap_test_all(saved_bset)) {
5921                                 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5922                                 if (r < 0) {
5923                                         *exit_status = EXIT_CAPABILITIES;
5924                                         return log_error_errno(r, "Failed to drop bset capabilities: %m");
5925                                 }
5926                         }
5927
5928                         /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5929                          * applications that use it. */
5930                         if (!BIT_SET(saved_bset, CAP_SYS_ADMIN)) {
5931                                 r = drop_capability(CAP_SYS_ADMIN);
5932                                 if (r < 0) {
5933                                         *exit_status = EXIT_USER;
5934                                         return log_error_errno(r, "Failed to drop CAP_SYS_ADMIN: %m");
5935                                 }
5936                         }
5937
5938                         /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5939                          * applications that use it. */
5940                         if (!BIT_SET(saved_bset, CAP_SETPCAP)) {
5941                                 r = drop_capability(CAP_SETPCAP);
5942                                 if (r < 0) {
5943                                         *exit_status = EXIT_USER;
5944                                         return log_error_errno(r, "Failed to drop CAP_SETPCAP: %m");
5945                                 }
5946                         }
5947
5948                         if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5949                                 *exit_status = EXIT_USER;
5950                                 return log_error_errno(errno, "Failed to drop keep capabilities flag: %m");
5951                         }
5952                 }
5953 #endif
5954
5955         }
5956
5957         if (!strv_isempty(context->unset_environment)) {
5958                 char **ee = NULL;
5959
5960                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5961                 if (!ee) {
5962                         *exit_status = EXIT_MEMORY;
5963                         return log_oom();
5964                 }
5965
5966                 strv_free_and_replace(accum_env, ee);
5967         }
5968
5969         _cleanup_strv_free_ char **replaced_argv = NULL, **argv_via_shell = NULL;
5970         char **final_argv = FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL) ? strv_skip(command->argv, 1) : command->argv;
5971
5972         if (final_argv && !FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5973                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5974
5975                 r = replace_env_argv(final_argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5976                 if (r < 0) {
5977                         *exit_status = EXIT_MEMORY;
5978                         return log_error_errno(r, "Failed to replace environment variables: %m");
5979                 }
5980                 final_argv = replaced_argv;
5981
5982                 if (!strv_isempty(unset_variables)) {
5983                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5984                         log_warning("Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5985                 }
5986
5987                 if (!strv_isempty(bad_variables)) {
5988                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5989                         log_warning("Invalid environment variable name evaluates to an empty string: %s", strna(jb));
5990                 }
5991         }
5992
5993         if (FLAGS_SET(command->flags, EXEC_COMMAND_VIA_SHELL)) {
5994                 r = strv_extendf(&argv_via_shell, "%s%s", command->argv[0][0] == '-' ? "-" : "", path);
5995                 if (r < 0) {
5996                         *exit_status = EXIT_MEMORY;
5997                         return log_oom();
5998                 }
5999
6000                 if (!strv_isempty(final_argv)) {
6001                         _cleanup_free_ char *cmdline_joined = NULL;
6002
6003                         cmdline_joined = strv_join(final_argv, " ");
6004                         if (!cmdline_joined) {
6005                                 *exit_status = EXIT_MEMORY;
6006                                 return log_oom();
6007                         }
6008
6009                         r = strv_extend_many(&argv_via_shell, "-c", cmdline_joined);
6010                         if (r < 0) {
6011                                 *exit_status = EXIT_MEMORY;
6012                                 return log_oom();
6013                         }
6014                 }
6015
6016                 final_argv = argv_via_shell;
6017         }
6018
6019         log_command_line(context, params, "Executing", executable, final_argv);
6020
6021         /* We have finished with all our initializations. Let's now let the manager know that. From this
6022          * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
6023
6024         r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
6025         if (r < 0)
6026                 return r;
6027
6028         /* As last thing before the execve(), let's send the handoff timestamp */
6029         r = send_handoff_timestamp(context, params, exit_status);
6030         if (r < 0) {
6031                 /* If this handoff timestamp failed, let's undo the marking as hot */
6032                 (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
6033                 return r;
6034         }
6035
6036         /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
6037          * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
6038          * exec_fd this is pretty much the whole raison d'etre. */
6039
6040         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
6041
6042         /* The execve() failed, let's undo the marking as hot */
6043         (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
6044
6045         *exit_status = EXIT_EXEC;
6046         return log_error_errno(r, "Failed to execute %s: %m", executable);
6047 }