src/core/exec-invoke.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/eventfd.h>
   4 #include <sys/ioctl.h>
   5 #include <sys/mount.h>
   6 #include <sys/prctl.h>
   7
   8 #if HAVE_PAM
   9 #include <security/pam_appl.h>
  10 #include <security/pam_misc.h>
  11 #endif
  12
  13 #if HAVE_APPARMOR
  14 #include <sys/apparmor.h>
  15 #endif
  16
  17 #include "sd-messages.h"
  18
  19 #if HAVE_APPARMOR
  20 #include "apparmor-util.h"
  21 #endif
  22 #include "argv-util.h"
  23 #include "barrier.h"
  24 #include "bpf-dlopen.h"
  25 #include "bpf-restrict-fs.h"
  26 #include "btrfs-util.h"
  27 #include "capability-util.h"
  28 #include "cgroup-setup.h"
  29 #include "chase.h"
  30 #include "chattr-util.h"
  31 #include "chown-recursive.h"
  32 #include "copy.h"
  33 #include "data-fd-util.h"
  34 #include "env-util.h"
  35 #include "escape.h"
  36 #include "exec-credential.h"
  37 #include "exec-invoke.h"
  38 #include "execute.h"
  39 #include "exit-status.h"
  40 #include "fd-util.h"
  41 #include "hexdecoct.h"
  42 #include "io-util.h"
  43 #include "iovec-util.h"
  44 #include "journal-send.h"
  45 #include "missing_ioprio.h"
  46 #include "missing_prctl.h"
  47 #include "missing_securebits.h"
  48 #include "missing_syscall.h"
  49 #include "mkdir-label.h"
  50 #include "proc-cmdline.h"
  51 #include "process-util.h"
  52 #include "psi-util.h"
  53 #include "rlimit-util.h"
  54 #include "seccomp-util.h"
  55 #include "selinux-util.h"
  56 #include "signal-util.h"
  57 #include "smack-util.h"
  58 #include "socket-util.h"
  59 #include "string-table.h"
  60 #include "strv.h"
  61 #include "terminal-util.h"
  62 #include "utmp-wtmp.h"
  63 #include "vpick.h"
  64
  65 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  66 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  67
  68 #define SNDBUF_SIZE (8*1024*1024)
  69
  70 static int flag_fds(
  71                 const int fds[],
  72                 size_t n_socket_fds,
  73                 size_t n_fds,
  74                 bool nonblock) {
  75
  76         int r;
  77
  78         assert(fds || n_fds == 0);
  79
  80         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
  81          * O_NONBLOCK only applies to socket activation though. */
  82
  83         for (size_t i = 0; i < n_fds; i++) {
  84
  85                 if (i < n_socket_fds) {
  86                         r = fd_nonblock(fds[i], nonblock);
  87                         if (r < 0)
  88                                 return r;
  89                 }
  90
  91                 /* We unconditionally drop FD_CLOEXEC from the fds,
  92                  * since after all we want to pass these fds to our
  93                  * children */
  94
  95                 r = fd_cloexec(fds[i], false);
  96                 if (r < 0)
  97                         return r;
  98         }
  99
 100         return 0;
 101 }
 102
 103 static bool is_terminal_input(ExecInput i) {
 104         return IN_SET(i,
 105                       EXEC_INPUT_TTY,
 106                       EXEC_INPUT_TTY_FORCE,
 107                       EXEC_INPUT_TTY_FAIL);
 108 }
 109
 110 static bool is_terminal_output(ExecOutput o) {
 111         return IN_SET(o,
 112                       EXEC_OUTPUT_TTY,
 113                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 114                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 115 }
 116
 117 static bool is_kmsg_output(ExecOutput o) {
 118         return IN_SET(o,
 119                       EXEC_OUTPUT_KMSG,
 120                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 121 }
 122
 123 static bool exec_context_needs_term(const ExecContext *c) {
 124         assert(c);
 125
 126         /* Return true if the execution context suggests we should set $TERM to something useful. */
 127
 128         if (is_terminal_input(c->std_input))
 129                 return true;
 130
 131         if (is_terminal_output(c->std_output))
 132                 return true;
 133
 134         if (is_terminal_output(c->std_error))
 135                 return true;
 136
 137         return !!c->tty_path;
 138 }
 139
 140 static int open_null_as(int flags, int nfd) {
 141         int fd;
 142
 143         assert(nfd >= 0);
 144
 145         fd = open("/dev/null", flags|O_NOCTTY);
 146         if (fd < 0)
 147                 return -errno;
 148
 149         return move_fd(fd, nfd, false);
 150 }
 151
 152 static int connect_journal_socket(
 153                 int fd,
 154                 const char *log_namespace,
 155                 uid_t uid,
 156                 gid_t gid) {
 157
 158         uid_t olduid = UID_INVALID;
 159         gid_t oldgid = GID_INVALID;
 160         const char *j;
 161         int r;
 162
 163         assert(fd >= 0);
 164
 165         j = journal_stream_path(log_namespace);
 166         if (!j)
 167                 return -EINVAL;
 168
 169         if (gid_is_valid(gid)) {
 170                 oldgid = getgid();
 171
 172                 if (setegid(gid) < 0)
 173                         return -errno;
 174         }
 175
 176         if (uid_is_valid(uid)) {
 177                 olduid = getuid();
 178
 179                 if (seteuid(uid) < 0) {
 180                         r = -errno;
 181                         goto restore_gid;
 182                 }
 183         }
 184
 185         r = connect_unix_path(fd, AT_FDCWD, j);
 186
 187         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 188            an LSM interferes. */
 189
 190         if (uid_is_valid(uid))
 191                 (void) seteuid(olduid);
 192
 193  restore_gid:
 194         if (gid_is_valid(gid))
 195                 (void) setegid(oldgid);
 196
 197         return r;
 198 }
 199
 200 static int connect_logger_as(
 201                 const ExecContext *context,
 202                 const ExecParameters *params,
 203                 ExecOutput output,
 204                 const char *ident,
 205                 int nfd,
 206                 uid_t uid,
 207                 gid_t gid) {
 208
 209         _cleanup_close_ int fd = -EBADF;
 210         int r;
 211
 212         assert(context);
 213         assert(params);
 214         assert(output < _EXEC_OUTPUT_MAX);
 215         assert(ident);
 216         assert(nfd >= 0);
 217
 218         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 219         if (fd < 0)
 220                 return -errno;
 221
 222         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 223         if (r < 0)
 224                 return r;
 225
 226         if (shutdown(fd, SHUT_RD) < 0)
 227                 return -errno;
 228
 229         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 230
 231         if (dprintf(fd,
 232                 "%s\n"
 233                 "%s\n"
 234                 "%i\n"
 235                 "%i\n"
 236                 "%i\n"
 237                 "%i\n"
 238                 "%i\n",
 239                 context->syslog_identifier ?: ident,
 240                 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
 241                 context->syslog_priority,
 242                 !!context->syslog_level_prefix,
 243                 false,
 244                 is_kmsg_output(output),
 245                 is_terminal_output(output)) < 0)
 246                 return -errno;
 247
 248         return move_fd(TAKE_FD(fd), nfd, false);
 249 }
 250
 251 static int open_terminal_as(const char *path, int flags, int nfd) {
 252         int fd;
 253
 254         assert(path);
 255         assert(nfd >= 0);
 256
 257         fd = open_terminal(path, flags | O_NOCTTY);
 258         if (fd < 0)
 259                 return fd;
 260
 261         return move_fd(fd, nfd, false);
 262 }
 263
 264 static int acquire_path(const char *path, int flags, mode_t mode) {
 265         _cleanup_close_ int fd = -EBADF;
 266         int r;
 267
 268         assert(path);
 269
 270         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 271                 flags |= O_CREAT;
 272
 273         fd = open(path, flags|O_NOCTTY, mode);
 274         if (fd >= 0)
 275                 return TAKE_FD(fd);
 276
 277         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 278                 return -errno;
 279
 280         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 281
 282         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 283         if (fd < 0)
 284                 return -errno;
 285
 286         r = connect_unix_path(fd, AT_FDCWD, path);
 287         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 288                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 289                  * wasn't an AF_UNIX socket after all */
 290                 return -ENXIO;
 291         if (r < 0)
 292                 return r;
 293
 294         if ((flags & O_ACCMODE) == O_RDONLY)
 295                 r = shutdown(fd, SHUT_WR);
 296         else if ((flags & O_ACCMODE) == O_WRONLY)
 297                 r = shutdown(fd, SHUT_RD);
 298         else
 299                 r = 0;
 300         if (r < 0)
 301                 return -errno;
 302
 303         return TAKE_FD(fd);
 304 }
 305
 306 static int fixup_input(
 307                 const ExecContext *context,
 308                 int socket_fd,
 309                 bool apply_tty_stdin) {
 310
 311         ExecInput std_input;
 312
 313         assert(context);
 314
 315         std_input = context->std_input;
 316
 317         if (is_terminal_input(std_input) && !apply_tty_stdin)
 318                 return EXEC_INPUT_NULL;
 319
 320         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 321                 return EXEC_INPUT_NULL;
 322
 323         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 324                 return EXEC_INPUT_NULL;
 325
 326         return std_input;
 327 }
 328
 329 static int fixup_output(ExecOutput output, int socket_fd) {
 330
 331         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 332                 return EXEC_OUTPUT_INHERIT;
 333
 334         return output;
 335 }
 336
 337 static int setup_input(
 338                 const ExecContext *context,
 339                 const ExecParameters *params,
 340                 int socket_fd,
 341                 const int named_iofds[static 3]) {
 342
 343         ExecInput i;
 344         int r;
 345
 346         assert(context);
 347         assert(params);
 348         assert(named_iofds);
 349
 350         if (params->stdin_fd >= 0) {
 351                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 352                         return -errno;
 353
 354                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 355                 if (isatty(STDIN_FILENO)) {
 356                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 357
 358                         if (context->tty_reset)
 359                                 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
 360
 361                         (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
 362                 }
 363
 364                 return STDIN_FILENO;
 365         }
 366
 367         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 368
 369         switch (i) {
 370
 371         case EXEC_INPUT_NULL:
 372                 return open_null_as(O_RDONLY, STDIN_FILENO);
 373
 374         case EXEC_INPUT_TTY:
 375         case EXEC_INPUT_TTY_FORCE:
 376         case EXEC_INPUT_TTY_FAIL: {
 377                 _cleanup_close_ int tty_fd = -EBADF;
 378                 const char *tty_path;
 379
 380                 tty_path = ASSERT_PTR(exec_context_tty_path(context));
 381
 382                 tty_fd = acquire_terminal(tty_path,
 383                                           i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 384                                           i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 385                                                                       ACQUIRE_TERMINAL_WAIT,
 386                                           USEC_INFINITY);
 387                 if (tty_fd < 0)
 388                         return tty_fd;
 389
 390                 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
 391                 if (r < 0)
 392                         return r;
 393
 394                 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
 395                 if (r < 0)
 396                         return r;
 397
 398                 TAKE_FD(tty_fd);
 399                 return r;
 400         }
 401
 402         case EXEC_INPUT_SOCKET:
 403                 assert(socket_fd >= 0);
 404
 405                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 406
 407         case EXEC_INPUT_NAMED_FD:
 408                 assert(named_iofds[STDIN_FILENO] >= 0);
 409
 410                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 411                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 412
 413         case EXEC_INPUT_DATA: {
 414                 int fd;
 415
 416                 fd = acquire_data_fd_full(context->stdin_data, context->stdin_data_size, /* flags = */ 0);
 417                 if (fd < 0)
 418                         return fd;
 419
 420                 return move_fd(fd, STDIN_FILENO, false);
 421         }
 422
 423         case EXEC_INPUT_FILE: {
 424                 bool rw;
 425                 int fd;
 426
 427                 assert(context->stdio_file[STDIN_FILENO]);
 428
 429                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 430                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 431
 432                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 433                 if (fd < 0)
 434                         return fd;
 435
 436                 return move_fd(fd, STDIN_FILENO, false);
 437         }
 438
 439         default:
 440                 assert_not_reached();
 441         }
 442 }
 443
 444 static bool can_inherit_stderr_from_stdout(
 445                 const ExecContext *context,
 446                 ExecOutput o,
 447                 ExecOutput e) {
 448
 449         assert(context);
 450
 451         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 452          * stderr fd */
 453
 454         if (e == EXEC_OUTPUT_INHERIT)
 455                 return true;
 456         if (e != o)
 457                 return false;
 458
 459         if (e == EXEC_OUTPUT_NAMED_FD)
 460                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 461
 462         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 463                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 464
 465         return true;
 466 }
 467
 468 static int setup_output(
 469                 const ExecContext *context,
 470                 const ExecParameters *params,
 471                 int fileno,
 472                 int socket_fd,
 473                 const int named_iofds[static 3],
 474                 const char *ident,
 475                 uid_t uid,
 476                 gid_t gid,
 477                 dev_t *journal_stream_dev,
 478                 ino_t *journal_stream_ino) {
 479
 480         ExecOutput o;
 481         ExecInput i;
 482         int r;
 483
 484         assert(context);
 485         assert(params);
 486         assert(ident);
 487         assert(journal_stream_dev);
 488         assert(journal_stream_ino);
 489
 490         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 491
 492                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 493                         return -errno;
 494
 495                 return STDOUT_FILENO;
 496         }
 497
 498         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 499                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 500                         return -errno;
 501
 502                 return STDERR_FILENO;
 503         }
 504
 505         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 506         o = fixup_output(context->std_output, socket_fd);
 507
 508         if (fileno == STDERR_FILENO) {
 509                 ExecOutput e;
 510                 e = fixup_output(context->std_error, socket_fd);
 511
 512                 /* This expects the input and output are already set up */
 513
 514                 /* Don't change the stderr file descriptor if we inherit all
 515                  * the way and are not on a tty */
 516                 if (e == EXEC_OUTPUT_INHERIT &&
 517                     o == EXEC_OUTPUT_INHERIT &&
 518                     i == EXEC_INPUT_NULL &&
 519                     !is_terminal_input(context->std_input) &&
 520                     getppid() != 1)
 521                         return fileno;
 522
 523                 /* Duplicate from stdout if possible */
 524                 if (can_inherit_stderr_from_stdout(context, o, e))
 525                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 526
 527                 o = e;
 528
 529         } else if (o == EXEC_OUTPUT_INHERIT) {
 530                 /* If input got downgraded, inherit the original value */
 531                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 532                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 533
 534                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 535                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 536                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 537
 538                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 539                 if (getppid() != 1)
 540                         return fileno;
 541
 542                 /* We need to open /dev/null here anew, to get the right access mode. */
 543                 return open_null_as(O_WRONLY, fileno);
 544         }
 545
 546         switch (o) {
 547
 548         case EXEC_OUTPUT_NULL:
 549                 return open_null_as(O_WRONLY, fileno);
 550
 551         case EXEC_OUTPUT_TTY:
 552                 if (is_terminal_input(i))
 553                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 554
 555                 /* We don't reset the terminal if this is just about output */
 556                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 557
 558         case EXEC_OUTPUT_KMSG:
 559         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 560         case EXEC_OUTPUT_JOURNAL:
 561         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 562                 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
 563                 if (r < 0) {
 564                         log_exec_warning_errno(context,
 565                                                params,
 566                                                r,
 567                                                "Failed to connect %s to the journal socket, ignoring: %m",
 568                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 569                         r = open_null_as(O_WRONLY, fileno);
 570                 } else {
 571                         struct stat st;
 572
 573                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 574                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 575                          * services to detect whether they are connected to the journal or not.
 576                          *
 577                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 578                          * about STDERR as that's usually the best way to do logging. */
 579
 580                         if (fstat(fileno, &st) >= 0 &&
 581                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 582                                 *journal_stream_dev = st.st_dev;
 583                                 *journal_stream_ino = st.st_ino;
 584                         }
 585                 }
 586                 return r;
 587
 588         case EXEC_OUTPUT_SOCKET:
 589                 assert(socket_fd >= 0);
 590
 591                 return RET_NERRNO(dup2(socket_fd, fileno));
 592
 593         case EXEC_OUTPUT_NAMED_FD:
 594                 assert(named_iofds[fileno] >= 0);
 595
 596                 (void) fd_nonblock(named_iofds[fileno], false);
 597                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 598
 599         case EXEC_OUTPUT_FILE:
 600         case EXEC_OUTPUT_FILE_APPEND:
 601         case EXEC_OUTPUT_FILE_TRUNCATE: {
 602                 bool rw;
 603                 int fd, flags;
 604
 605                 assert(context->stdio_file[fileno]);
 606
 607                 rw = context->std_input == EXEC_INPUT_FILE &&
 608                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 609
 610                 if (rw)
 611                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 612
 613                 flags = O_WRONLY;
 614                 if (o == EXEC_OUTPUT_FILE_APPEND)
 615                         flags |= O_APPEND;
 616                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 617                         flags |= O_TRUNC;
 618
 619                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 620                 if (fd < 0)
 621                         return fd;
 622
 623                 return move_fd(fd, fileno, 0);
 624         }
 625
 626         default:
 627                 assert_not_reached();
 628         }
 629 }
 630
 631 static int chown_terminal(int fd, uid_t uid) {
 632         int r;
 633
 634         assert(fd >= 0);
 635
 636         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 637         if (!isatty_safe(fd))
 638                 return 0;
 639
 640         /* This might fail. What matters are the results. */
 641         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 642         if (r < 0)
 643                 return r;
 644
 645         return 1;
 646 }
 647
 648 static int setup_confirm_stdio(
 649                 const ExecContext *context,
 650                 const char *vc,
 651                 int *ret_saved_stdin,
 652                 int *ret_saved_stdout) {
 653
 654         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 655         int r;
 656
 657         assert(ret_saved_stdin);
 658         assert(ret_saved_stdout);
 659
 660         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 661         if (saved_stdin < 0)
 662                 return -errno;
 663
 664         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 665         if (saved_stdout < 0)
 666                 return -errno;
 667
 668         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 669         if (fd < 0)
 670                 return fd;
 671
 672         r = chown_terminal(fd, getuid());
 673         if (r < 0)
 674                 return r;
 675
 676         r = reset_terminal_fd(fd, /* switch_to_text= */ true);
 677         if (r < 0)
 678                 return r;
 679
 680         r = exec_context_apply_tty_size(context, fd, vc);
 681         if (r < 0)
 682                 return r;
 683
 684         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 685         TAKE_FD(fd);
 686         if (r < 0)
 687                 return r;
 688
 689         *ret_saved_stdin = TAKE_FD(saved_stdin);
 690         *ret_saved_stdout = TAKE_FD(saved_stdout);
 691         return 0;
 692 }
 693
 694 static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
 695         assert(err < 0);
 696         assert(unit_id);
 697
 698         if (err == -ETIMEDOUT)
 699                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
 700         else {
 701                 errno = -err;
 702                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
 703         }
 704 }
 705
 706 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
 707         _cleanup_close_ int fd = -EBADF;
 708
 709         assert(vc);
 710
 711         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 712         if (fd < 0)
 713                 return;
 714
 715         write_confirm_error_fd(err, fd, unit_id);
 716 }
 717
 718 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 719         int r = 0;
 720
 721         assert(saved_stdin);
 722         assert(saved_stdout);
 723
 724         release_terminal();
 725
 726         if (*saved_stdin >= 0)
 727                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 728                         r = -errno;
 729
 730         if (*saved_stdout >= 0)
 731                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 732                         r = -errno;
 733
 734         *saved_stdin = safe_close(*saved_stdin);
 735         *saved_stdout = safe_close(*saved_stdout);
 736
 737         return r;
 738 }
 739
 740 enum {
 741         CONFIRM_PRETEND_FAILURE = -1,
 742         CONFIRM_PRETEND_SUCCESS =  0,
 743         CONFIRM_EXECUTE = 1,
 744 };
 745
 746 static bool confirm_spawn_disabled(void) {
 747         return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
 748 }
 749
 750 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
 751         int saved_stdout = -1, saved_stdin = -1, r;
 752         _cleanup_free_ char *e = NULL;
 753         char c;
 754
 755         assert(context);
 756         assert(params);
 757
 758         /* For any internal errors, assume a positive response. */
 759         r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
 760         if (r < 0) {
 761                 write_confirm_error(r, params->confirm_spawn, params->unit_id);
 762                 return CONFIRM_EXECUTE;
 763         }
 764
 765         /* confirm_spawn might have been disabled while we were sleeping. */
 766         if (!params->confirm_spawn || confirm_spawn_disabled()) {
 767                 r = 1;
 768                 goto restore_stdio;
 769         }
 770
 771         e = ellipsize(cmdline, 60, 100);
 772         if (!e) {
 773                 log_oom();
 774                 r = CONFIRM_EXECUTE;
 775                 goto restore_stdio;
 776         }
 777
 778         for (;;) {
 779                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 780                 if (r < 0) {
 781                         write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
 782                         r = CONFIRM_EXECUTE;
 783                         goto restore_stdio;
 784                 }
 785
 786                 switch (c) {
 787                 case 'c':
 788                         printf("Resuming normal execution.\n");
 789                         manager_disable_confirm_spawn();
 790                         r = 1;
 791                         break;
 792                 case 'D':
 793                         printf("  Unit: %s\n",
 794                                params->unit_id);
 795                         exec_context_dump(context, stdout, "  ");
 796                         exec_params_dump(params, stdout, "  ");
 797                         continue; /* ask again */
 798                 case 'f':
 799                         printf("Failing execution.\n");
 800                         r = CONFIRM_PRETEND_FAILURE;
 801                         break;
 802                 case 'h':
 803                         printf("  c - continue, proceed without asking anymore\n"
 804                                "  D - dump, show the state of the unit\n"
 805                                "  f - fail, don't execute the command and pretend it failed\n"
 806                                "  h - help\n"
 807                                "  i - info, show a short summary of the unit\n"
 808                                "  j - jobs, show jobs that are in progress\n"
 809                                "  s - skip, don't execute the command and pretend it succeeded\n"
 810                                "  y - yes, execute the command\n");
 811                         continue; /* ask again */
 812                 case 'i':
 813                         printf("  Unit:        %s\n"
 814                                "  Command:     %s\n",
 815                                params->unit_id, cmdline);
 816                         continue; /* ask again */
 817                 case 'j':
 818                         if (sigqueue(getppid(),
 819                                      SIGRTMIN+18,
 820                                      (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
 821                                 return -errno;
 822
 823                         continue; /* ask again */
 824                 case 'n':
 825                         /* 'n' was removed in favor of 'f'. */
 826                         printf("Didn't understand 'n', did you mean 'f'?\n");
 827                         continue; /* ask again */
 828                 case 's':
 829                         printf("Skipping execution.\n");
 830                         r = CONFIRM_PRETEND_SUCCESS;
 831                         break;
 832                 case 'y':
 833                         r = CONFIRM_EXECUTE;
 834                         break;
 835                 default:
 836                         assert_not_reached();
 837                 }
 838                 break;
 839         }
 840
 841 restore_stdio:
 842         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 843         return r;
 844 }
 845
 846 static int get_fixed_user(
 847                 const char *user_or_uid,
 848                 const char **ret_username,
 849                 uid_t *ret_uid,
 850                 gid_t *ret_gid,
 851                 const char **ret_home,
 852                 const char **ret_shell) {
 853
 854         int r;
 855
 856         assert(user_or_uid);
 857         assert(ret_username);
 858
 859         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 860          * (i.e. are "/" or "/bin/nologin"). */
 861
 862         r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
 863         if (r < 0)
 864                 return r;
 865
 866         /* user_or_uid is normalized by get_user_creds to username */
 867         *ret_username = user_or_uid;
 868
 869         return 0;
 870 }
 871
 872 static int get_fixed_group(
 873                 const char *group_or_gid,
 874                 const char **ret_groupname,
 875                 gid_t *ret_gid) {
 876
 877         int r;
 878
 879         assert(group_or_gid);
 880         assert(ret_groupname);
 881
 882         r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
 883         if (r < 0)
 884                 return r;
 885
 886         /* group_or_gid is normalized by get_group_creds to groupname */
 887         *ret_groupname = group_or_gid;
 888
 889         return 0;
 890 }
 891
 892 static int get_supplementary_groups(const ExecContext *c, const char *user,
 893                                     const char *group, gid_t gid,
 894                                     gid_t **supplementary_gids, int *ngids) {
 895         int r, k = 0;
 896         int ngroups_max;
 897         bool keep_groups = false;
 898         gid_t *groups = NULL;
 899         _cleanup_free_ gid_t *l_gids = NULL;
 900
 901         assert(c);
 902
 903         /*
 904          * If user is given, then lookup GID and supplementary groups list.
 905          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 906          * here and as early as possible so we keep the list of supplementary
 907          * groups of the caller.
 908          */
 909         if (user && gid_is_valid(gid) && gid != 0) {
 910                 /* First step, initialize groups from /etc/groups */
 911                 if (initgroups(user, gid) < 0)
 912                         return -errno;
 913
 914                 keep_groups = true;
 915         }
 916
 917         if (strv_isempty(c->supplementary_groups))
 918                 return 0;
 919
 920         /*
 921          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 922          * be positive, otherwise fail.
 923          */
 924         errno = 0;
 925         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 926         if (ngroups_max <= 0)
 927                 return errno_or_else(EOPNOTSUPP);
 928
 929         l_gids = new(gid_t, ngroups_max);
 930         if (!l_gids)
 931                 return -ENOMEM;
 932
 933         if (keep_groups) {
 934                 /*
 935                  * Lookup the list of groups that the user belongs to, we
 936                  * avoid NSS lookups here too for gid=0.
 937                  */
 938                 k = ngroups_max;
 939                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 940                         return -EINVAL;
 941         } else
 942                 k = 0;
 943
 944         STRV_FOREACH(i, c->supplementary_groups) {
 945                 const char *g;
 946
 947                 if (k >= ngroups_max)
 948                         return -E2BIG;
 949
 950                 g = *i;
 951                 r = get_group_creds(&g, l_gids+k, 0);
 952                 if (r < 0)
 953                         return r;
 954
 955                 k++;
 956         }
 957
 958         /*
 959          * Sets ngids to zero to drop all supplementary groups, happens
 960          * when we are under root and SupplementaryGroups= is empty.
 961          */
 962         if (k == 0) {
 963                 *ngids = 0;
 964                 return 0;
 965         }
 966
 967         /* Otherwise get the final list of supplementary groups */
 968         groups = memdup(l_gids, sizeof(gid_t) * k);
 969         if (!groups)
 970                 return -ENOMEM;
 971
 972         *supplementary_gids = groups;
 973         *ngids = k;
 974
 975         groups = NULL;
 976
 977         return 0;
 978 }
 979
 980 static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
 981         int r;
 982
 983         /* Handle SupplementaryGroups= if it is not empty */
 984         if (ngids > 0) {
 985                 r = maybe_setgroups(ngids, supplementary_gids);
 986                 if (r < 0)
 987                         return r;
 988         }
 989
 990         if (gid_is_valid(gid)) {
 991                 /* Then set our gids */
 992                 if (setresgid(gid, gid, gid) < 0)
 993                         return -errno;
 994         }
 995
 996         return 0;
 997 }
 998
 999 static int set_securebits(unsigned bits, unsigned mask) {
1000         unsigned applied;
1001         int current;
1002
1003         current = prctl(PR_GET_SECUREBITS);
1004         if (current < 0)
1005                 return -errno;
1006
1007         /* Clear all securebits defined in mask and set bits */
1008         applied = ((unsigned) current & ~mask) | bits;
1009         if ((unsigned) current == applied)
1010                 return 0;
1011
1012         if (prctl(PR_SET_SECUREBITS, applied) < 0)
1013                 return -errno;
1014
1015         return 1;
1016 }
1017
1018 static int enforce_user(
1019                 const ExecContext *context,
1020                 uid_t uid,
1021                 uint64_t capability_ambient_set) {
1022         assert(context);
1023         int r;
1024
1025         if (!uid_is_valid(uid))
1026                 return 0;
1027
1028         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1029          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1030          * case. */
1031
1032         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1033
1034                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1035                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1036                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1037                 if (r < 0)
1038                         return r;
1039         }
1040
1041         /* Second step: actually set the uids */
1042         if (setresuid(uid, uid, uid) < 0)
1043                 return -errno;
1044
1045         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1046          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1047          * outside of this call. */
1048         return 0;
1049 }
1050
1051 #if HAVE_PAM
1052
1053 static int null_conv(
1054                 int num_msg,
1055                 const struct pam_message **msg,
1056                 struct pam_response **resp,
1057                 void *appdata_ptr) {
1058
1059         /* We don't support conversations */
1060
1061         return PAM_CONV_ERR;
1062 }
1063
1064 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1065         int r, s;
1066
1067         assert(handle);
1068
1069         r = pam_close_session(handle, flags);
1070         if (r != PAM_SUCCESS)
1071                 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1072
1073         s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1074         if (s != PAM_SUCCESS)
1075                 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1076
1077         return r != PAM_SUCCESS ? r : s;
1078 }
1079
1080 #endif
1081
/* Opens a PAM session for 'user' under the PAM service 'name' and forks off a "(sd-pam)" helper process
 * that closes the session again once this process goes away.
 *
 * name     PAM service name handed to pam_start()
 * user     user name handed to pam_start()
 * uid/gid  credentials the helper child drops to via fully_set_uid_gid()
 * tty      value for PAM_TTY; if NULL, the TTY name of stdin is used if one can be determined
 * env      environment block; its entries are exported into PAM, and on success it is replaced by the
 *          list returned by pam_getenvlist()
 * fds      file descriptors (n_fds of them) that are closed in the helper child
 * exec_fd  additional file descriptor closed in the helper child
 *
 * Returns the result of strv_free_and_replace() on success and a negative errno-style value on failure;
 * PAM error codes are logged and reported as -EPERM. If built without PAM support this is a no-op that
 * returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds,
                int exec_fd) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to end a handle that was never set up */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is only logged, not treated as fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);

        parent_pid = getpid_cached();

        /* NOTE(review): if safe_fork() fails we jump to 'fail' with SIGTERM still blocked, since old_ss is
         * only restored on the success path below — confirm this is acceptable for the caller. */
        r = safe_fork("(sd-pam)", 0, NULL);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
                 * we'd never signal completion. */
                exec_fd = safe_close(exec_fd);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the privilege drop above succeeded,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;
                        int sig;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until the (blocked) SIGTERM is delivered to us */
                        assert_se(sigwait(&ss, &sig) == 0);
                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM-provided environment to the caller, releasing the previous one */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code set) or a regular call failed (r holds the negative errno) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1288
1289 static void rename_process_from_path(const char *path) {
1290         _cleanup_free_ char *buf = NULL;
1291         const char *p;
1292
1293         assert(path);
1294
1295         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1296          * /bin/ps */
1297
1298         if (path_extract_filename(path, &buf) < 0) {
1299                 rename_process("(...)");
1300                 return;
1301         }
1302
1303         size_t l = strlen(buf);
1304         if (l > 8) {
1305                 /* The end of the process name is usually more interesting, since the first bit might just be
1306                  * "systemd-" */
1307                 p = buf + l - 8;
1308                 l = 8;
1309         } else
1310                 p = buf;
1311
1312         char process_name[11];
1313         process_name[0] = '(';
1314         memcpy(process_name+1, p, l);
1315         process_name[1+l] = ')';
1316         process_name[1+l+1] = 0;
1317
1318         (void) rename_process(process_name);
1319 }
1320
1321 static bool context_has_address_families(const ExecContext *c) {
1322         assert(c);
1323
1324         return c->address_families_allow_list ||
1325                 !set_isempty(c->address_families);
1326 }
1327
1328 static bool context_has_syscall_filters(const ExecContext *c) {
1329         assert(c);
1330
1331         return c->syscall_allow_list ||
1332                 !hashmap_isempty(c->syscall_filter);
1333 }
1334
1335 static bool context_has_syscall_logs(const ExecContext *c) {
1336         assert(c);
1337
1338         return c->syscall_log_allow_list ||
1339                 !hashmap_isempty(c->syscall_log);
1340 }
1341
1342 static bool context_has_seccomp(const ExecContext *c) {
1343         /* We need NNP if we have any form of seccomp and are unprivileged */
1344         return c->lock_personality ||
1345                 c->memory_deny_write_execute ||
1346                 c->private_devices ||
1347                 c->protect_clock ||
1348                 c->protect_hostname ||
1349                 c->protect_kernel_tunables ||
1350                 c->protect_kernel_modules ||
1351                 c->protect_kernel_logs ||
1352                 context_has_address_families(c) ||
1353                 exec_context_restrict_namespaces_set(c) ||
1354                 c->restrict_realtime ||
1355                 c->restrict_suid_sgid ||
1356                 !set_isempty(c->syscall_archs) ||
1357                 context_has_syscall_filters(c) ||
1358                 context_has_syscall_logs(c);
1359 }
1360
1361 static bool context_has_no_new_privileges(const ExecContext *c) {
1362         assert(c);
1363
1364         if (c->no_new_privileges)
1365                 return true;
1366
1367         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1368                 return false;
1369
1370         return context_has_seccomp(c);
1371 }
1372
1373 #if HAVE_SECCOMP
1374
1375 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1376         void *id, *val;
1377         bool has_capget = false, has_capset = false, has_prctl = false;
1378
1379         assert(c);
1380
1381         /* No syscall filter, we are allowed to drop privileges */
1382         if (hashmap_isempty(c->syscall_filter))
1383                 return true;
1384
1385         HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1386                 _cleanup_free_ char *name = NULL;
1387
1388                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1389
1390                 if (streq(name, "capget"))
1391                         has_capget = true;
1392                 else if (streq(name, "capset"))
1393                         has_capset = true;
1394                 else if (streq(name, "prctl"))
1395                         has_prctl = true;
1396         }
1397
1398         if (c->syscall_allow_list)
1399                 return has_capget && has_capset && has_prctl;
1400         else
1401                 return !(has_capget || has_capset || has_prctl);
1402 }
1403
1404 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1405
1406         if (is_seccomp_available())
1407                 return false;
1408
1409         log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1410         return true;
1411 }
1412
1413 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1414         uint32_t negative_action, default_action, action;
1415         int r;
1416
1417         assert(c);
1418         assert(p);
1419
1420         if (!context_has_syscall_filters(c))
1421                 return 0;
1422
1423         if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1424                 return 0;
1425
1426         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1427
1428         if (c->syscall_allow_list) {
1429                 default_action = negative_action;
1430                 action = SCMP_ACT_ALLOW;
1431         } else {
1432                 default_action = SCMP_ACT_ALLOW;
1433                 action = negative_action;
1434         }
1435
1436         if (needs_ambient_hack) {
1437                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1438                 if (r < 0)
1439                         return r;
1440         }
1441
1442         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1443 }
1444
1445 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1446 #ifdef SCMP_ACT_LOG
1447         uint32_t default_action, action;
1448 #endif
1449
1450         assert(c);
1451         assert(p);
1452
1453         if (!context_has_syscall_logs(c))
1454                 return 0;
1455
1456 #ifdef SCMP_ACT_LOG
1457         if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1458                 return 0;
1459
1460         if (c->syscall_log_allow_list) {
1461                 /* Log nothing but the ones listed */
1462                 default_action = SCMP_ACT_ALLOW;
1463                 action = SCMP_ACT_LOG;
1464         } else {
1465                 /* Log everything but the ones listed */
1466                 default_action = SCMP_ACT_LOG;
1467                 action = SCMP_ACT_ALLOW;
1468         }
1469
1470         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1471 #else
1472         /* old libseccomp */
1473         log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1474         return 0;
1475 #endif
1476 }
1477
1478 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1479         assert(c);
1480         assert(p);
1481
1482         if (set_isempty(c->syscall_archs))
1483                 return 0;
1484
1485         if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1486                 return 0;
1487
1488         return seccomp_restrict_archs(c->syscall_archs);
1489 }
1490
1491 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1492         assert(c);
1493         assert(p);
1494
1495         if (!context_has_address_families(c))
1496                 return 0;
1497
1498         if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1499                 return 0;
1500
1501         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1502 }
1503
1504 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1505         int r;
1506
1507         assert(c);
1508         assert(p);
1509
1510         if (!c->memory_deny_write_execute)
1511                 return 0;
1512
1513         /* use prctl() if kernel supports it (6.3) */
1514         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1515         if (r == 0) {
1516                 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1517                 return 0;
1518         }
1519         if (r < 0 && errno != EINVAL)
1520                 return log_exec_debug_errno(c,
1521                                             p,
1522                                             errno,
1523                                             "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1524         /* else use seccomp */
1525         log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1526
1527         if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1528                 return 0;
1529
1530         return seccomp_memory_deny_write_execute();
1531 }
1532
1533 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1534         assert(c);
1535         assert(p);
1536
1537         if (!c->restrict_realtime)
1538                 return 0;
1539
1540         if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1541                 return 0;
1542
1543         return seccomp_restrict_realtime();
1544 }
1545
1546 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1547         assert(c);
1548         assert(p);
1549
1550         if (!c->restrict_suid_sgid)
1551                 return 0;
1552
1553         if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1554                 return 0;
1555
1556         return seccomp_restrict_suid_sgid();
1557 }
1558
1559 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1560         assert(c);
1561         assert(p);
1562
1563         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1564          * let's protect even those systems where this is left on in the kernel. */
1565
1566         if (!c->protect_kernel_tunables)
1567                 return 0;
1568
1569         if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1570                 return 0;
1571
1572         return seccomp_protect_sysctl();
1573 }
1574
1575 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1576         assert(c);
1577         assert(p);
1578
1579         /* Turn off module syscalls on ProtectKernelModules=yes */
1580
1581         if (!c->protect_kernel_modules)
1582                 return 0;
1583
1584         if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1585                 return 0;
1586
1587         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1588 }
1589
1590 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1591         assert(c);
1592         assert(p);
1593
1594         if (!c->protect_kernel_logs)
1595                 return 0;
1596
1597         if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1598                 return 0;
1599
1600         return seccomp_protect_syslog();
1601 }
1602
1603 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1604         assert(c);
1605         assert(p);
1606
1607         if (!c->protect_clock)
1608                 return 0;
1609
1610         if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1611                 return 0;
1612
1613         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1614 }
1615
1616 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1617         assert(c);
1618         assert(p);
1619
1620         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1621
1622         if (!c->private_devices)
1623                 return 0;
1624
1625         if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1626                 return 0;
1627
1628         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1629 }
1630
1631 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1632         assert(c);
1633         assert(p);
1634
1635         if (!exec_context_restrict_namespaces_set(c))
1636                 return 0;
1637
1638         if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1639                 return 0;
1640
1641         return seccomp_restrict_namespaces(c->restrict_namespaces);
1642 }
1643
1644 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1645         unsigned long personality;
1646         int r;
1647
1648         assert(c);
1649         assert(p);
1650
1651         if (!c->lock_personality)
1652                 return 0;
1653
1654         if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1655                 return 0;
1656
1657         personality = c->personality;
1658
1659         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1660         if (personality == PERSONALITY_INVALID) {
1661
1662                 r = opinionated_personality(&personality);
1663                 if (r < 0)
1664                         return r;
1665         }
1666
1667         return seccomp_lock_personality(personality);
1668 }
1669
1670 #endif
1671
1672 #if HAVE_LIBBPF
1673 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1674         int r;
1675
1676         assert(c);
1677         assert(p);
1678
1679         if (!exec_context_restrict_filesystems_set(c))
1680                 return 0;
1681
1682         if (p->bpf_restrict_fs_map_fd < 0) {
1683                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1684                 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1685                 return 0;
1686         }
1687
1688         /* We are in a new binary, so dl-open again */
1689         r = dlopen_bpf();
1690         if (r < 0)
1691                 return r;
1692
1693         return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1694 }
1695 #endif
1696
1697 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1698         assert(c);
1699         assert(p);
1700
1701         if (!c->protect_hostname)
1702                 return 0;
1703
1704         if (ns_type_supported(NAMESPACE_UTS)) {
1705                 if (unshare(CLONE_NEWUTS) < 0) {
1706                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1707                                 *ret_exit_status = EXIT_NAMESPACE;
1708                                 return log_exec_error_errno(c,
1709                                                             p,
1710                                                             errno,
1711                                                             "Failed to set up UTS namespacing: %m");
1712                         }
1713
1714                         log_exec_warning(c,
1715                                          p,
1716                                          "ProtectHostname=yes is configured, but UTS namespace setup is "
1717                                          "prohibited (container manager?), ignoring namespace setup.");
1718                 }
1719         } else
1720                 log_exec_warning(c,
1721                                  p,
1722                                  "ProtectHostname=yes is configured, but the kernel does not "
1723                                  "support UTS namespaces, ignoring namespace setup.");
1724
1725 #if HAVE_SECCOMP
1726         int r;
1727
1728         if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1729                 return 0;
1730
1731         r = seccomp_protect_hostname();
1732         if (r < 0) {
1733                 *ret_exit_status = EXIT_SECCOMP;
1734                 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1735         }
1736 #endif
1737
1738         return 0;
1739 }
1740
1741 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1742         assert(idle_pipe);
1743
1744         idle_pipe[1] = safe_close(idle_pipe[1]);
1745         idle_pipe[2] = safe_close(idle_pipe[2]);
1746
1747         if (idle_pipe[0] >= 0) {
1748                 int r;
1749
1750                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1751
1752                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1753                         ssize_t n;
1754
1755                         /* Signal systemd that we are bored and want to continue. */
1756                         n = write(idle_pipe[3], "x", 1);
1757                         if (n > 0)
1758                                 /* Wait for systemd to react to the signal above. */
1759                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1760                 }
1761
1762                 idle_pipe[0] = safe_close(idle_pipe[0]);
1763
1764         }
1765
1766         idle_pipe[3] = safe_close(idle_pipe[3]);
1767 }
1768
/* Prototype for the lookup function that the DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation
 * below is expected to provide. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* This table maps each ExecDirectoryType to the name of the environment variable through which the
 * selected directory is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1782
1783 static int build_environment(
1784                 const ExecContext *c,
1785                 const ExecParameters *p,
1786                 const CGroupContext *cgroup_context,
1787                 size_t n_fds,
1788                 const char *home,
1789                 const char *username,
1790                 const char *shell,
1791                 dev_t journal_stream_dev,
1792                 ino_t journal_stream_ino,
1793                 const char *memory_pressure_path,
1794                 char ***ret) {
1795
1796         _cleanup_strv_free_ char **our_env = NULL;
1797         size_t n_env = 0;
1798         char *x;
1799         int r;
1800
1801         assert(c);
1802         assert(p);
1803         assert(ret);
1804
1805 #define N_ENV_VARS 19
1806         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1807         if (!our_env)
1808                 return -ENOMEM;
1809
1810         if (n_fds > 0) {
1811                 _cleanup_free_ char *joined = NULL;
1812
1813                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1814                         return -ENOMEM;
1815                 our_env[n_env++] = x;
1816
1817                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1818                         return -ENOMEM;
1819                 our_env[n_env++] = x;
1820
1821                 joined = strv_join(p->fd_names, ":");
1822                 if (!joined)
1823                         return -ENOMEM;
1824
1825                 x = strjoin("LISTEN_FDNAMES=", joined);
1826                 if (!x)
1827                         return -ENOMEM;
1828                 our_env[n_env++] = x;
1829         }
1830
1831         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1832                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1833                         return -ENOMEM;
1834                 our_env[n_env++] = x;
1835
1836                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1837                         return -ENOMEM;
1838                 our_env[n_env++] = x;
1839         }
1840
1841         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1842          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1843          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1844         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1845                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1846                 if (!x)
1847                         return -ENOMEM;
1848                 our_env[n_env++] = x;
1849         }
1850
1851         /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1852          * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1853          * really make much sense since we're not logged in. Hence we conditionalize the three based on
1854          * SetLoginEnvironment= switch. */
1855         if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1856                 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1857                 if (r < 0)
1858                         return log_exec_debug_errno(c,
1859                                                     p,
1860                                                     r,
1861                                                     "Failed to determine user credentials for root: %m");
1862         }
1863
1864         bool set_user_login_env = exec_context_get_set_login_environment(c);
1865
1866         if (username) {
1867                 x = strjoin("USER=", username);
1868                 if (!x)
1869                         return -ENOMEM;
1870                 our_env[n_env++] = x;
1871
1872                 if (set_user_login_env) {
1873                         x = strjoin("LOGNAME=", username);
1874                         if (!x)
1875                                 return -ENOMEM;
1876                         our_env[n_env++] = x;
1877                 }
1878         }
1879
1880         if (home && set_user_login_env) {
1881                 x = strjoin("HOME=", home);
1882                 if (!x)
1883                         return -ENOMEM;
1884
1885                 path_simplify(x + 5);
1886                 our_env[n_env++] = x;
1887         }
1888
1889         if (shell && set_user_login_env) {
1890                 x = strjoin("SHELL=", shell);
1891                 if (!x)
1892                         return -ENOMEM;
1893
1894                 path_simplify(x + 6);
1895                 our_env[n_env++] = x;
1896         }
1897
1898         if (!sd_id128_is_null(p->invocation_id)) {
1899                 assert(p->invocation_id_string);
1900
1901                 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1902                 if (!x)
1903                         return -ENOMEM;
1904
1905                 our_env[n_env++] = x;
1906         }
1907
1908         if (exec_context_needs_term(c)) {
1909                 _cleanup_free_ char *cmdline = NULL;
1910                 const char *tty_path, *term = NULL;
1911
1912                 tty_path = exec_context_tty_path(c);
1913
1914                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1917
1918                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1919                         term = getenv("TERM");
1920                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1921                         _cleanup_free_ char *key = NULL;
1922
1923                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1924                         if (!key)
1925                                 return -ENOMEM;
1926
1927                         r = proc_cmdline_get_key(key, 0, &cmdline);
1928                         if (r < 0)
1929                                 log_exec_debug_errno(c,
1930                                                      p,
1931                                                      r,
1932                                                      "Failed to read %s from kernel cmdline, ignoring: %m",
1933                                                      key);
1934                         else if (r > 0)
1935                                 term = cmdline;
1936                 }
1937
1938                 if (!term)
1939                         term = default_term_for_tty(tty_path);
1940
1941                 x = strjoin("TERM=", term);
1942                 if (!x)
1943                         return -ENOMEM;
1944                 our_env[n_env++] = x;
1945         }
1946
1947         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1948                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1949                         return -ENOMEM;
1950
1951                 our_env[n_env++] = x;
1952         }
1953
1954         if (c->log_namespace) {
1955                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1956                 if (!x)
1957                         return -ENOMEM;
1958
1959                 our_env[n_env++] = x;
1960         }
1961
1962         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1963                 _cleanup_free_ char *joined = NULL;
1964                 const char *n;
1965
1966                 if (!p->prefix[t])
1967                         continue;
1968
1969                 if (c->directories[t].n_items == 0)
1970                         continue;
1971
1972                 n = exec_directory_env_name_to_string(t);
1973                 if (!n)
1974                         continue;
1975
1976                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1977                         _cleanup_free_ char *prefixed = NULL;
1978
1979                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1980                         if (!prefixed)
1981                                 return -ENOMEM;
1982
1983                         if (!strextend_with_separator(&joined, ":", prefixed))
1984                                 return -ENOMEM;
1985                 }
1986
1987                 x = strjoin(n, "=", joined);
1988                 if (!x)
1989                         return -ENOMEM;
1990
1991                 our_env[n_env++] = x;
1992         }
1993
1994         _cleanup_free_ char *creds_dir = NULL;
1995         r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
1996         if (r < 0)
1997                 return r;
1998         if (r > 0) {
1999                 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2000                 if (!x)
2001                         return -ENOMEM;
2002
2003                 our_env[n_env++] = x;
2004         }
2005
2006         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2007                 return -ENOMEM;
2008
2009         our_env[n_env++] = x;
2010
2011         if (memory_pressure_path) {
2012                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2013                 if (!x)
2014                         return -ENOMEM;
2015
2016                 our_env[n_env++] = x;
2017
2018                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2019                         _cleanup_free_ char *b = NULL, *e = NULL;
2020
2021                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2022                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2023                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2024                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2025                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2026                                 return -ENOMEM;
2027
2028                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2029                                 return -ENOMEM;
2030
2031                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2032                         if (!x)
2033                                 return -ENOMEM;
2034
2035                         our_env[n_env++] = x;
2036                 }
2037         }
2038
2039         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2040 #undef N_ENV_VARS
2041
2042         *ret = TAKE_PTR(our_env);
2043
2044         return 0;
2045 }
2046
2047 static int build_pass_environment(const ExecContext *c, char ***ret) {
2048         _cleanup_strv_free_ char **pass_env = NULL;
2049         size_t n_env = 0;
2050
2051         STRV_FOREACH(i, c->pass_environment) {
2052                 _cleanup_free_ char *x = NULL;
2053                 char *v;
2054
2055                 v = getenv(*i);
2056                 if (!v)
2057                         continue;
2058                 x = strjoin(*i, "=", v);
2059                 if (!x)
2060                         return -ENOMEM;
2061
2062                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2063                         return -ENOMEM;
2064
2065                 pass_env[n_env++] = TAKE_PTR(x);
2066                 pass_env[n_env] = NULL;
2067         }
2068
2069         *ret = TAKE_PTR(pass_env);
2070
2071         return 0;
2072 }
2073
2074 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2075         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2076         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2077         _cleanup_close_ int unshare_ready_fd = -EBADF;
2078         _cleanup_(sigkill_waitp) pid_t pid = 0;
2079         uint64_t c = 1;
2080         ssize_t n;
2081         int r;
2082
2083         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2084          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2085          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2086          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2087          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2088          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2089          * continues execution normally.
2090          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2091          * does not need CAP_SETUID to write the single line mapping to itself. */
2092
2093         /* Can only set up multiple mappings with CAP_SETUID. */
2094         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2095                 r = asprintf(&uid_map,
2096                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2097                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2098                              ouid, ouid, uid, uid);
2099         else
2100                 r = asprintf(&uid_map,
2101                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2102                              ouid, ouid);
2103
2104         if (r < 0)
2105                 return -ENOMEM;
2106
2107         /* Can only set up multiple mappings with CAP_SETGID. */
2108         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2109                 r = asprintf(&gid_map,
2110                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2111                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2112                              ogid, ogid, gid, gid);
2113         else
2114                 r = asprintf(&gid_map,
2115                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2116                              ogid, ogid);
2117
2118         if (r < 0)
2119                 return -ENOMEM;
2120
2121         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2122          * namespace. */
2123         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2124         if (unshare_ready_fd < 0)
2125                 return -errno;
2126
2127         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2128          * failed. */
2129         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2130                 return -errno;
2131
2132         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2133         if (r < 0)
2134                 return r;
2135         if (r == 0) {
2136                 _cleanup_close_ int fd = -EBADF;
2137                 const char *a;
2138                 pid_t ppid;
2139
2140                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2141                  * here, after the parent opened its own user namespace. */
2142
2143                 ppid = getppid();
2144                 errno_pipe[0] = safe_close(errno_pipe[0]);
2145
2146                 /* Wait until the parent unshared the user namespace */
2147                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2148                         r = -errno;
2149                         goto child_fail;
2150                 }
2151
2152                 /* Disable the setgroups() system call in the child user namespace, for good. */
2153                 a = procfs_file_alloca(ppid, "setgroups");
2154                 fd = open(a, O_WRONLY|O_CLOEXEC);
2155                 if (fd < 0) {
2156                         if (errno != ENOENT) {
2157                                 r = -errno;
2158                                 goto child_fail;
2159                         }
2160
2161                         /* If the file is missing the kernel is too old, let's continue anyway. */
2162                 } else {
2163                         if (write(fd, "deny\n", 5) < 0) {
2164                                 r = -errno;
2165                                 goto child_fail;
2166                         }
2167
2168                         fd = safe_close(fd);
2169                 }
2170
2171                 /* First write the GID map */
2172                 a = procfs_file_alloca(ppid, "gid_map");
2173                 fd = open(a, O_WRONLY|O_CLOEXEC);
2174                 if (fd < 0) {
2175                         r = -errno;
2176                         goto child_fail;
2177                 }
2178                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2179                         r = -errno;
2180                         goto child_fail;
2181                 }
2182                 fd = safe_close(fd);
2183
2184                 /* The write the UID map */
2185                 a = procfs_file_alloca(ppid, "uid_map");
2186                 fd = open(a, O_WRONLY|O_CLOEXEC);
2187                 if (fd < 0) {
2188                         r = -errno;
2189                         goto child_fail;
2190                 }
2191                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2192                         r = -errno;
2193                         goto child_fail;
2194                 }
2195
2196                 _exit(EXIT_SUCCESS);
2197
2198         child_fail:
2199                 (void) write(errno_pipe[1], &r, sizeof(r));
2200                 _exit(EXIT_FAILURE);
2201         }
2202
2203         errno_pipe[1] = safe_close(errno_pipe[1]);
2204
2205         if (unshare(CLONE_NEWUSER) < 0)
2206                 return -errno;
2207
2208         /* Let the child know that the namespace is ready now */
2209         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2210                 return -errno;
2211
2212         /* Try to read an error code from the child */
2213         n = read(errno_pipe[0], &r, sizeof(r));
2214         if (n < 0)
2215                 return -errno;
2216         if (n == sizeof(r)) { /* an error code was sent to us */
2217                 if (r < 0)
2218                         return r;
2219                 return -EIO;
2220         }
2221         if (n != 0) /* on success we should have read 0 bytes */
2222                 return -EIO;
2223
2224         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2225         if (r < 0)
2226                 return r;
2227         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2228                 return -EIO;
2229
2230         return 0;
2231 }
2232
2233 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2234         _cleanup_free_ char *src_abs = NULL;
2235         int r;
2236
2237         assert(source);
2238
2239         src_abs = path_join(root, source);
2240         if (!src_abs)
2241                 return -ENOMEM;
2242
2243         STRV_FOREACH(dst, symlinks) {
2244                 _cleanup_free_ char *dst_abs = NULL;
2245
2246                 dst_abs = path_join(root, *dst);
2247                 if (!dst_abs)
2248                         return -ENOMEM;
2249
2250                 r = mkdir_parents_label(dst_abs, 0755);
2251                 if (r < 0)
2252                         return r;
2253
2254                 r = symlink_idempotent(src_abs, dst_abs, true);
2255                 if (r < 0)
2256                         return r;
2257         }
2258
2259         return 0;
2260 }
2261
2262 static int setup_exec_directory(
2263                 const ExecContext *context,
2264                 const ExecParameters *params,
2265                 uid_t uid,
2266                 gid_t gid,
2267                 ExecDirectoryType type,
2268                 bool needs_mount_namespace,
2269                 int *exit_status) {
2270
2271         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2272                 [EXEC_DIRECTORY_RUNTIME]       = EXIT_RUNTIME_DIRECTORY,
2273                 [EXEC_DIRECTORY_STATE]         = EXIT_STATE_DIRECTORY,
2274                 [EXEC_DIRECTORY_CACHE]         = EXIT_CACHE_DIRECTORY,
2275                 [EXEC_DIRECTORY_LOGS]          = EXIT_LOGS_DIRECTORY,
2276                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2277         };
2278         int r;
2279
2280         assert(context);
2281         assert(params);
2282         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2283         assert(exit_status);
2284
2285         if (!params->prefix[type])
2286                 return 0;
2287
2288         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2289                 if (!uid_is_valid(uid))
2290                         uid = 0;
2291                 if (!gid_is_valid(gid))
2292                         gid = 0;
2293         }
2294
2295         FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
2296                 _cleanup_free_ char *p = NULL, *pp = NULL;
2297
2298                 p = path_join(params->prefix[type], i->path);
2299                 if (!p) {
2300                         r = -ENOMEM;
2301                         goto fail;
2302                 }
2303
2304                 r = mkdir_parents_label(p, 0755);
2305                 if (r < 0)
2306                         goto fail;
2307
2308                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2309
2310                         /* If we are in user mode, and a configuration directory exists but a state directory
2311                          * doesn't exist, then we likely are upgrading from an older systemd version that
2312                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2313                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2314                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2315                          * separated. If a service has both dirs configured but only the configuration dir
2316                          * exists and the state dir does not, we assume we are looking at an update
2317                          * situation. Hence, create a compatibility symlink, so that all expectations are
2318                          * met.
2319                          *
2320                          * (We also do something similar with the log directory, which still doesn't exist in
2321                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2322
2323                         /* this assumes the state dir is always created before the configuration dir */
2324                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2325                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2326
2327                         r = laccess(p, F_OK);
2328                         if (r == -ENOENT) {
2329                                 _cleanup_free_ char *q = NULL;
2330
2331                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2332                                  * under the configuration hierarchy. */
2333
2334                                 if (type == EXEC_DIRECTORY_STATE)
2335                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], i->path);
2336                                 else if (type == EXEC_DIRECTORY_LOGS)
2337                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", i->path);
2338                                 else
2339                                         assert_not_reached();
2340                                 if (!q) {
2341                                         r = -ENOMEM;
2342                                         goto fail;
2343                                 }
2344
2345                                 r = laccess(q, F_OK);
2346                                 if (r >= 0) {
2347                                         /* It does exist! This hence looks like an update. Symlink the
2348                                          * configuration directory into the state directory. */
2349
2350                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2351                                         if (r < 0)
2352                                                 goto fail;
2353
2354                                         log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2355                                         continue;
2356                                 } else if (r != -ENOENT)
2357                                         log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2358
2359                         } else if (r < 0)
2360                                 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2361                 }
2362
2363                 if (exec_directory_is_private(context, type)) {
2364                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2365                          * case we want to avoid leaving a directory around fully accessible that is owned by
2366                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2367                          * trick used by container managers to prohibit host users to get access to files of
2368                          * the same UID in containers: we place everything inside a directory that has an
2369                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2370                          * for unprivileged host code. We then use fs namespacing to make this directory
2371                          * permeable for the service itself.
2372                          *
2373                          * Specifically: for a service which wants a special directory "foo/" we first create
2374                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2375                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2376                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2377                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2378                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2379                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2380                          * for the service and making sure it only gets access to the dirs it needs but no
2381                          * others. Tricky? Yes, absolutely, but it works!
2382                          *
2383                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2384                          * to be owned by the service itself.
2385                          *
2386                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2387                          * for sharing files or sockets with other services. */
2388
2389                         pp = path_join(params->prefix[type], "private");
2390                         if (!pp) {
2391                                 r = -ENOMEM;
2392                                 goto fail;
2393                         }
2394
2395                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2396                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2397                         if (r < 0)
2398                                 goto fail;
2399
2400                         if (!path_extend(&pp, i->path)) {
2401                                 r = -ENOMEM;
2402                                 goto fail;
2403                         }
2404
2405                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2406                         r = mkdir_parents_label(pp, 0755);
2407                         if (r < 0)
2408                                 goto fail;
2409
2410                         if (is_dir(p, false) > 0 &&
2411                             (laccess(pp, F_OK) == -ENOENT)) {
2412
2413                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2414                                  * it over. Most likely the service has been upgraded from one that didn't use
2415                                  * DynamicUser=1, to one that does. */
2416
2417                                 log_exec_info(context,
2418                                               params,
2419                                               "Found pre-existing public %s= directory %s, migrating to %s.\n"
2420                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2421                                               exec_directory_type_to_string(type), p, pp);
2422
2423                                 r = RET_NERRNO(rename(p, pp));
2424                                 if (r < 0)
2425                                         goto fail;
2426                         } else {
2427                                 /* Otherwise, create the actual directory for the service */
2428
2429                                 r = mkdir_label(pp, context->directories[type].mode);
2430                                 if (r < 0 && r != -EEXIST)
2431                                         goto fail;
2432                         }
2433
2434                         if (!i->only_create) {
2435                                 /* And link it up from the original place.
2436                                  * Notes
2437                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2438                                  *    the host, and a new one for the child namespace will be created later.
2439                                  * 2) It is not necessary to create this symlink when one of its parent
2440                                  *    directories is specified and already created. E.g.
2441                                  *        StateDirectory=foo foo/bar
2442                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2443                                  *        pp = "/var/lib/private/foo/bar"
2444                                  *        p = "/var/lib/foo/bar"
2445                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2446                                  *    we do not need to create the symlink, but we cannot create the symlink.
2447                                  *    See issue #24783. */
2448                                 r = symlink_idempotent(pp, p, true);
2449                                 if (r < 0)
2450                                         goto fail;
2451                         }
2452
2453                 } else {
2454                         _cleanup_free_ char *target = NULL;
2455
2456                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2457                             readlink_and_make_absolute(p, &target) >= 0) {
2458                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2459
2460                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2461                                  * by DynamicUser=1 (see above)?
2462                                  *
2463                                  * We do this for all directory types except for ConfigurationDirectory=,
2464                                  * since they all support the private/ symlink logic at least in some
2465                                  * configurations, see above. */
2466
2467                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2468                                 if (r < 0)
2469                                         goto fail;
2470
2471                                 q = path_join(params->prefix[type], "private", i->path);
2472                                 if (!q) {
2473                                         r = -ENOMEM;
2474                                         goto fail;
2475                                 }
2476
2477                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2478                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2479                                 if (r < 0)
2480                                         goto fail;
2481
2482                                 if (path_equal(q_resolved, target_resolved)) {
2483
2484                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2485                                          * but is no longer. Let's move the directory back up. */
2486
2487                                         log_exec_info(context,
2488                                                       params,
2489                                                       "Found pre-existing private %s= directory %s, migrating to %s.\n"
2490                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2491                                                       exec_directory_type_to_string(type), q, p);
2492
2493                                         r = RET_NERRNO(unlink(p));
2494                                         if (r < 0)
2495                                                 goto fail;
2496
2497                                         r = RET_NERRNO(rename(q, p));
2498                                         if (r < 0)
2499                                                 goto fail;
2500                                 }
2501                         }
2502
2503                         r = mkdir_label(p, context->directories[type].mode);
2504                         if (r < 0) {
2505                                 if (r != -EEXIST)
2506                                         goto fail;
2507
2508                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2509                                         struct stat st;
2510
2511                                         /* Don't change the owner/access mode of the configuration directory,
2512                                          * as in the common case it is not written to by a service, and shall
2513                                          * not be writable. */
2514
2515                                         r = RET_NERRNO(stat(p, &st));
2516                                         if (r < 0)
2517                                                 goto fail;
2518
2519                                         /* Still complain if the access mode doesn't match */
2520                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2521                                                 log_exec_warning(context,
2522                                                                  params,
2523                                                                  "%s \'%s\' already exists but the mode is different. "
2524                                                                  "(File system: %o %sMode: %o)",
2525                                                                  exec_directory_type_to_string(type), i->path,
2526                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2527
2528                                         continue;
2529                                 }
2530                         }
2531                 }
2532
2533                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2534                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2535                  * current UID/GID ownership.) */
2536                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2537                 if (r < 0)
2538                         goto fail;
2539
2540                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2541                  * available to user code anyway */
2542                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2543                         continue;
2544
2545                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2546                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2547                  * assignments to exist. */
2548                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2549                 if (r < 0)
2550                         goto fail;
2551         }
2552
2553         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2554          * they are set up later, to allow configuring empty var/run/etc. */
2555         if (!needs_mount_namespace)
2556                 FOREACH_ARRAY(i, context->directories[type].items, context->directories[type].n_items) {
2557                         r = create_many_symlinks(params->prefix[type], i->path, i->symlinks);
2558                         if (r < 0)
2559                                 goto fail;
2560                 }
2561
2562         return 0;
2563
2564 fail:
2565         *exit_status = exit_status_table[type];
2566         return r;
2567 }
2568
2569 #if ENABLE_SMACK
2570 static int setup_smack(
2571                 const ExecParameters *params,
2572                 const ExecContext *context,
2573                 int executable_fd) {
2574         int r;
2575
2576         assert(params);
2577         assert(executable_fd >= 0);
2578
2579         if (context->smack_process_label) {
2580                 r = mac_smack_apply_pid(0, context->smack_process_label);
2581                 if (r < 0)
2582                         return r;
2583         } else if (params->fallback_smack_process_label) {
2584                 _cleanup_free_ char *exec_label = NULL;
2585
2586                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2587                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2588                         return r;
2589
2590                 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
2591                 if (r < 0)
2592                         return r;
2593         }
2594
2595         return 0;
2596 }
2597 #endif
2598
2599 static int compile_bind_mounts(
2600                 const ExecContext *context,
2601                 const ExecParameters *params,
2602                 BindMount **ret_bind_mounts,
2603                 size_t *ret_n_bind_mounts,
2604                 char ***ret_empty_directories) {
2605
2606         _cleanup_strv_free_ char **empty_directories = NULL;
2607         BindMount *bind_mounts = NULL;
2608         size_t n, h = 0;
2609         int r;
2610
2611         assert(context);
2612         assert(params);
2613         assert(ret_bind_mounts);
2614         assert(ret_n_bind_mounts);
2615         assert(ret_empty_directories);
2616
2617         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2618
2619         n = context->n_bind_mounts;
2620         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2621                 if (!params->prefix[t])
2622                         continue;
2623
2624                 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items)
2625                         n += !i->only_create;
2626         }
2627
2628         if (n <= 0) {
2629                 *ret_bind_mounts = NULL;
2630                 *ret_n_bind_mounts = 0;
2631                 *ret_empty_directories = NULL;
2632                 return 0;
2633         }
2634
2635         bind_mounts = new(BindMount, n);
2636         if (!bind_mounts)
2637                 return -ENOMEM;
2638
2639         FOREACH_ARRAY(item, context->bind_mounts, context->n_bind_mounts) {
2640                 _cleanup_free_ char *s = NULL, *d = NULL;
2641
2642                 s = strdup(item->source);
2643                 if (!s)
2644                         return -ENOMEM;
2645
2646                 d = strdup(item->destination);
2647                 if (!d)
2648                         return -ENOMEM;
2649
2650                 bind_mounts[h++] = (BindMount) {
2651                         .source = TAKE_PTR(s),
2652                         .destination = TAKE_PTR(d),
2653                         .read_only = item->read_only,
2654                         .recursive = item->recursive,
2655                         .ignore_enoent = item->ignore_enoent,
2656                 };
2657         }
2658
2659         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2660                 if (!params->prefix[t])
2661                         continue;
2662
2663                 if (context->directories[t].n_items == 0)
2664                         continue;
2665
2666                 if (exec_directory_is_private(context, t) &&
2667                     !exec_context_with_rootfs(context)) {
2668                         char *private_root;
2669
2670                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2671                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2672                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2673
2674                         private_root = path_join(params->prefix[t], "private");
2675                         if (!private_root)
2676                                 return -ENOMEM;
2677
2678                         r = strv_consume(&empty_directories, private_root);
2679                         if (r < 0)
2680                                 return r;
2681                 }
2682
2683                 FOREACH_ARRAY(i, context->directories[t].items, context->directories[t].n_items) {
2684                         _cleanup_free_ char *s = NULL, *d = NULL;
2685
2686                         /* When one of the parent directories is in the list, we cannot create the symlink
2687                          * for the child directory. See also the comments in setup_exec_directory(). */
2688                         if (i->only_create)
2689                                 continue;
2690
2691                         if (exec_directory_is_private(context, t))
2692                                 s = path_join(params->prefix[t], "private", i->path);
2693                         else
2694                                 s = path_join(params->prefix[t], i->path);
2695                         if (!s)
2696                                 return -ENOMEM;
2697
2698                         if (exec_directory_is_private(context, t) &&
2699                             exec_context_with_rootfs(context))
2700                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2701                                  * directory is not created on the root directory. So, let's bind-mount the directory
2702                                  * on the 'non-private' place. */
2703                                 d = path_join(params->prefix[t], i->path);
2704                         else
2705                                 d = strdup(s);
2706                         if (!d)
2707                                 return -ENOMEM;
2708
2709                         bind_mounts[h++] = (BindMount) {
2710                                 .source = TAKE_PTR(s),
2711                                 .destination = TAKE_PTR(d),
2712                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2713                                 .recursive = true,
2714                         };
2715                 }
2716         }
2717
2718         assert(h == n);
2719
2720         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2721         *ret_n_bind_mounts = n;
2722         *ret_empty_directories = TAKE_PTR(empty_directories);
2723
2724         return (int) n;
2725 }
2726
2727 /* ret_symlinks will contain a list of pairs src:dest that describes
2728  * the symlinks to create later on. For example, the symlinks needed
2729  * to safely give private directories to DynamicUser=1 users. */
2730 static int compile_symlinks(
2731                 const ExecContext *context,
2732                 const ExecParameters *params,
2733                 bool setup_os_release_symlink,
2734                 char ***ret_symlinks) {
2735
2736         _cleanup_strv_free_ char **symlinks = NULL;
2737         int r;
2738
2739         assert(context);
2740         assert(params);
2741         assert(ret_symlinks);
2742
2743         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++)
2744                 FOREACH_ARRAY(i, context->directories[dt].items, context->directories[dt].n_items) {
2745                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2746
2747                         STRV_FOREACH(symlink, i->symlinks) {
2748                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2749
2750                                 src_abs = path_join(params->prefix[dt], i->path);
2751                                 dst_abs = path_join(params->prefix[dt], *symlink);
2752                                 if (!src_abs || !dst_abs)
2753                                         return -ENOMEM;
2754
2755                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2756                                 if (r < 0)
2757                                         return r;
2758                         }
2759
2760                         if (!exec_directory_is_private(context, dt) ||
2761                             exec_context_with_rootfs(context) ||
2762                             i->only_create)
2763                                 continue;
2764
2765                         private_path = path_join(params->prefix[dt], "private", i->path);
2766                         if (!private_path)
2767                                 return -ENOMEM;
2768
2769                         path = path_join(params->prefix[dt], i->path);
2770                         if (!path)
2771                                 return -ENOMEM;
2772
2773                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2774                         if (r < 0)
2775                                 return r;
2776                 }
2777
2778         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2779          * and readers will never get a half-written version. Note that, while the paths specified here are
2780          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2781          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2782         if (setup_os_release_symlink) {
2783                 r = strv_extend_many(
2784                                 &symlinks,
2785                                 "/run/host/.os-release-stage/os-release",
2786                                 "/run/host/os-release");
2787                 if (r < 0)
2788                         return r;
2789         }
2790
2791         *ret_symlinks = TAKE_PTR(symlinks);
2792
2793         return 0;
2794 }
2795
2796 static bool insist_on_sandboxing(
2797                 const ExecContext *context,
2798                 const char *root_dir,
2799                 const char *root_image,
2800                 const BindMount *bind_mounts,
2801                 size_t n_bind_mounts) {
2802
2803         assert(context);
2804         assert(n_bind_mounts == 0 || bind_mounts);
2805
2806         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2807          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2808          * rearrange stuff in a way we cannot ignore gracefully. */
2809
2810         if (context->n_temporary_filesystems > 0)
2811                 return true;
2812
2813         if (root_dir || root_image)
2814                 return true;
2815
2816         if (context->n_mount_images > 0)
2817                 return true;
2818
2819         if (context->dynamic_user)
2820                 return true;
2821
2822         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2823                 return true;
2824
2825         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2826          * essential. */
2827         FOREACH_ARRAY(i, bind_mounts, n_bind_mounts)
2828                 if (!path_equal(i->source, i->destination))
2829                         return true;
2830
2831         if (context->log_namespace)
2832                 return true;
2833
2834         return false;
2835 }
2836
2837 static int setup_ephemeral(
2838                 const ExecContext *context,
2839                 ExecRuntime *runtime,
2840                 char **root_image,            /* both input and output! modified if ephemeral logic enabled */
2841                 char **root_directory) {      /* ditto */
2842
2843         _cleanup_close_ int fd = -EBADF;
2844         _cleanup_free_ char *new_root = NULL;
2845         int r;
2846
2847         assert(context);
2848         assert(root_image);
2849         assert(root_directory);
2850
2851         if (!*root_image && !*root_directory)
2852                 return 0;
2853
2854         if (!runtime || !runtime->ephemeral_copy)
2855                 return 0;
2856
2857         assert(runtime->ephemeral_storage_socket[0] >= 0);
2858         assert(runtime->ephemeral_storage_socket[1] >= 0);
2859
2860         new_root = strdup(runtime->ephemeral_copy);
2861         if (!new_root)
2862                 return log_oom_debug();
2863
2864         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2865         if (r < 0)
2866                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2867
2868         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2869
2870         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2871         if (fd >= 0)
2872                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2873                 return 0;
2874         if (fd != -EAGAIN)
2875                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2876
2877         if (*root_image) {
2878                 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2879
2880                 fd = copy_file(*root_image,
2881                                new_root,
2882                                O_EXCL,
2883                                0600,
2884                                COPY_LOCK_BSD|
2885                                COPY_REFLINK|
2886                                COPY_CRTIME);
2887                 if (fd < 0)
2888                         return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2889                                                *root_image, new_root);
2890
2891                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2892                  * which tends to not perform well in combination with lots of random writes.
2893                  *
2894                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2895                  * copy, but we at least want to make the intention clear.
2896                  */
2897                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2898                 if (r < 0)
2899                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2900         } else {
2901                 assert(*root_directory);
2902
2903                 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2904
2905                 fd = btrfs_subvol_snapshot_at(
2906                                 AT_FDCWD, *root_directory,
2907                                 AT_FDCWD, new_root,
2908                                 BTRFS_SNAPSHOT_FALLBACK_COPY |
2909                                 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2910                                 BTRFS_SNAPSHOT_RECURSIVE |
2911                                 BTRFS_SNAPSHOT_LOCK_BSD);
2912                 if (fd < 0)
2913                         return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2914                                                *root_directory, new_root);
2915         }
2916
2917         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2918         if (r < 0)
2919                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2920
2921         if (*root_image)
2922                 free_and_replace(*root_image, new_root);
2923         else {
2924                 assert(*root_directory);
2925                 free_and_replace(*root_directory, new_root);
2926         }
2927
2928         return 1;
2929 }
2930
2931 static int verity_settings_prepare(
2932                 VeritySettings *verity,
2933                 const char *root_image,
2934                 const void *root_hash,
2935                 size_t root_hash_size,
2936                 const char *root_hash_path,
2937                 const void *root_hash_sig,
2938                 size_t root_hash_sig_size,
2939                 const char *root_hash_sig_path,
2940                 const char *verity_data_path) {
2941
2942         int r;
2943
2944         assert(verity);
2945
2946         if (root_hash) {
2947                 void *d;
2948
2949                 d = memdup(root_hash, root_hash_size);
2950                 if (!d)
2951                         return -ENOMEM;
2952
2953                 free_and_replace(verity->root_hash, d);
2954                 verity->root_hash_size = root_hash_size;
2955                 verity->designator = PARTITION_ROOT;
2956         }
2957
2958         if (root_hash_sig) {
2959                 void *d;
2960
2961                 d = memdup(root_hash_sig, root_hash_sig_size);
2962                 if (!d)
2963                         return -ENOMEM;
2964
2965                 free_and_replace(verity->root_hash_sig, d);
2966                 verity->root_hash_sig_size = root_hash_sig_size;
2967                 verity->designator = PARTITION_ROOT;
2968         }
2969
2970         if (verity_data_path) {
2971                 r = free_and_strdup(&verity->data_path, verity_data_path);
2972                 if (r < 0)
2973                         return r;
2974         }
2975
2976         r = verity_settings_load(
2977                         verity,
2978                         root_image,
2979                         root_hash_path,
2980                         root_hash_sig_path);
2981         if (r < 0)
2982                 return log_debug_errno(r, "Failed to load root hash: %m");
2983
2984         return 0;
2985 }
2986
2987 static int pick_versions(
2988                 const ExecContext *context,
2989                 const ExecParameters *params,
2990                 char **ret_root_image,
2991                 char **ret_root_directory) {
2992
2993         int r;
2994
2995         assert(context);
2996         assert(params);
2997         assert(ret_root_image);
2998         assert(ret_root_directory);
2999
3000         if (context->root_image) {
3001                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3002
3003                 r = path_pick(/* toplevel_path= */ NULL,
3004                               /* toplevel_fd= */ AT_FDCWD,
3005                               context->root_image,
3006                               &pick_filter_image_raw,
3007                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3008                               &result);
3009                 if (r < 0)
3010                         return r;
3011
3012                 if (!result.path)
3013                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3014
3015                 *ret_root_image = TAKE_PTR(result.path);
3016                 *ret_root_directory = NULL;
3017                 return r;
3018         }
3019
3020         if (context->root_directory) {
3021                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3022
3023                 r = path_pick(/* toplevel_path= */ NULL,
3024                               /* toplevel_fd= */ AT_FDCWD,
3025                               context->root_directory,
3026                               &pick_filter_image_dir,
3027                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3028                               &result);
3029                 if (r < 0)
3030                         return r;
3031
3032                 if (!result.path)
3033                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3034
3035                 *ret_root_image = NULL;
3036                 *ret_root_directory = TAKE_PTR(result.path);
3037                 return r;
3038         }
3039
3040         *ret_root_image = *ret_root_directory = NULL;
3041         return 0;
3042 }
3043
3044 static int apply_mount_namespace(
3045                 ExecCommandFlags command_flags,
3046                 const ExecContext *context,
3047                 const ExecParameters *params,
3048                 ExecRuntime *runtime,
3049                 const char *memory_pressure_path,
3050                 bool needs_sandboxing,
3051                 char **error_path) {
3052
3053         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3054         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3055                         **read_write_paths_cleanup = NULL;
3056         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3057                 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3058         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3059         char **read_write_paths;
3060         bool setup_os_release_symlink;
3061         BindMount *bind_mounts = NULL;
3062         size_t n_bind_mounts = 0;
3063         int r;
3064
3065         assert(context);
3066
3067         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3068
3069         if (params->flags & EXEC_APPLY_CHROOT) {
3070                 r = pick_versions(
3071                                 context,
3072                                 params,
3073                                 &root_image,
3074                                 &root_dir);
3075                 if (r < 0)
3076                         return r;
3077
3078                 r = setup_ephemeral(
3079                                 context,
3080                                 runtime,
3081                                 &root_image,
3082                                 &root_dir);
3083                 if (r < 0)
3084                         return r;
3085         }
3086
3087         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3088         if (r < 0)
3089                 return r;
3090
3091         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3092          * service will need to write to it in order to start the notifications. */
3093         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3094                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3095                 if (!read_write_paths_cleanup)
3096                         return -ENOMEM;
3097
3098                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3099                 if (r < 0)
3100                         return r;
3101
3102                 read_write_paths = read_write_paths_cleanup;
3103         } else
3104                 read_write_paths = context->read_write_paths;
3105
3106         if (needs_sandboxing) {
3107                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3108                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3109                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3110
3111                 if (context->private_tmp && runtime && runtime->shared) {
3112                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3113                                 tmp_dir = runtime->shared->tmp_dir;
3114                         else if (runtime->shared->tmp_dir)
3115                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3116
3117                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3118                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3119                         else if (runtime->shared->var_tmp_dir)
3120                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3121                 }
3122         }
3123
3124         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3125         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3126         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3127         if (r < 0)
3128                 return r;
3129
3130         if (context->mount_propagation_flag == MS_SHARED)
3131                 log_exec_debug(context,
3132                                params,
3133                                "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3134
3135         r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3136         if (r < 0)
3137                 return r;
3138
3139         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3140                 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3141                 if (!propagate_dir)
3142                         return -ENOMEM;
3143
3144                 incoming_dir = strdup("/run/systemd/incoming");
3145                 if (!incoming_dir)
3146                         return -ENOMEM;
3147
3148                 extension_dir = strdup("/run/systemd/unit-extensions");
3149                 if (!extension_dir)
3150                         return -ENOMEM;
3151
3152                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3153                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3154                 if (setup_os_release_symlink) {
3155                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3156                         if (!host_os_release_stage)
3157                                 return -ENOMEM;
3158                 }
3159         } else {
3160                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3161
3162                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3163                         return -ENOMEM;
3164
3165                 if (setup_os_release_symlink) {
3166                         if (asprintf(&host_os_release_stage,
3167                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3168                                      geteuid()) < 0)
3169                                 return -ENOMEM;
3170                 }
3171         }
3172
3173         if (root_image) {
3174                 r = verity_settings_prepare(
3175                         &verity,
3176                         root_image,
3177                         context->root_hash, context->root_hash_size, context->root_hash_path,
3178                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3179                         context->root_verity);
3180                 if (r < 0)
3181                         return r;
3182         }
3183
3184         NamespaceParameters parameters = {
3185                 .runtime_scope = params->runtime_scope,
3186
3187                 .root_directory = root_dir,
3188                 .root_image = root_image,
3189                 .root_image_options = context->root_image_options,
3190                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3191
3192                 .read_write_paths = read_write_paths,
3193                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3194                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3195
3196                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3197                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3198
3199                 .empty_directories = empty_directories,
3200                 .symlinks = symlinks,
3201
3202                 .bind_mounts = bind_mounts,
3203                 .n_bind_mounts = n_bind_mounts,
3204
3205                 .temporary_filesystems = context->temporary_filesystems,
3206                 .n_temporary_filesystems = context->n_temporary_filesystems,
3207
3208                 .mount_images = context->mount_images,
3209                 .n_mount_images = context->n_mount_images,
3210                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3211
3212                 .tmp_dir = tmp_dir,
3213                 .var_tmp_dir = var_tmp_dir,
3214
3215                 .creds_path = creds_path,
3216                 .log_namespace = context->log_namespace,
3217                 .mount_propagation_flag = context->mount_propagation_flag,
3218
3219                 .verity = &verity,
3220
3221                 .extension_images = context->extension_images,
3222                 .n_extension_images = context->n_extension_images,
3223                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3224                 .extension_directories = context->extension_directories,
3225
3226                 .propagate_dir = propagate_dir,
3227                 .incoming_dir = incoming_dir,
3228                 .extension_dir = extension_dir,
3229                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3230                 .host_os_release_stage = host_os_release_stage,
3231
                /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
                 * sandbox inside the mount namespace. */
3235                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3236
3237                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3238                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3239                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3240                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3241                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3242
3243                 .private_dev = needs_sandboxing && context->private_devices,
3244                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3245                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3246
3247                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3248
3249                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3250                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3251
3252                 .protect_home = needs_sandboxing ? context->protect_home : false,
3253                 .protect_system = needs_sandboxing ? context->protect_system : false,
3254                 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3255                 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3256         };
3257
3258         r = setup_namespace(&parameters, error_path);
3259         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3260          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3261          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3262          * completely different execution environment. */
3263         if (r == -ENOANO) {
3264                 if (insist_on_sandboxing(
3265                                     context,
3266                                     root_dir, root_image,
3267                                     bind_mounts,
3268                                     n_bind_mounts))
3269                         return log_exec_debug_errno(context,
3270                                                     params,
3271                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3272                                                     "Failed to set up namespace, and refusing to continue since "
3273                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3274                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3275                                                     n_bind_mounts,
3276                                                     context->n_temporary_filesystems,
3277                                                     yes_no(root_dir),
3278                                                     yes_no(root_image),
3279                                                     yes_no(context->dynamic_user));
3280
3281                 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3282                 return 0;
3283         }
3284
3285         return r;
3286 }
3287
3288 static int apply_working_directory(
3289                 const ExecContext *context,
3290                 const ExecParameters *params,
3291                 ExecRuntime *runtime,
3292                 const char *home,
3293                 int *exit_status) {
3294
3295         const char *wd;
3296         int r;
3297
3298         assert(context);
3299         assert(exit_status);
3300
3301         if (context->working_directory_home) {
3302                 if (!home) {
3303                         *exit_status = EXIT_CHDIR;
3304                         return -ENXIO;
3305                 }
3306
3307                 wd = home;
3308         } else
3309                 wd = empty_to_root(context->working_directory);
3310
3311         if (params->flags & EXEC_APPLY_CHROOT)
3312                 r = RET_NERRNO(chdir(wd));
3313         else {
3314                 _cleanup_close_ int dfd = -EBADF;
3315
3316                 r = chase(wd,
3317                           (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3318                           CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3319                           /* ret_path= */ NULL,
3320                           &dfd);
3321                 if (r >= 0)
3322                         r = RET_NERRNO(fchdir(dfd));
3323         }
3324
3325         if (r < 0 && !context->working_directory_missing_ok) {
3326                 *exit_status = EXIT_CHDIR;
3327                 return r;
3328         }
3329
3330         return 0;
3331 }
3332
3333 static int apply_root_directory(
3334                 const ExecContext *context,
3335                 const ExecParameters *params,
3336                 ExecRuntime *runtime,
3337                 const bool needs_mount_ns,
3338                 int *exit_status) {
3339
3340         assert(context);
3341         assert(exit_status);
3342
3343         if (params->flags & EXEC_APPLY_CHROOT)
3344                 if (!needs_mount_ns && context->root_directory)
3345                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3346                                 *exit_status = EXIT_CHROOT;
3347                                 return -errno;
3348                         }
3349
3350         return 0;
3351 }
3352
3353 static int setup_keyring(
3354                 const ExecContext *context,
3355                 const ExecParameters *p,
3356                 uid_t uid, gid_t gid) {
3357
3358         key_serial_t keyring;
3359         int r = 0;
3360         uid_t saved_uid;
3361         gid_t saved_gid;
3362
3363         assert(context);
3364         assert(p);
3365
3366         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3367          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3368          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3369          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3370          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3371          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3372
3373         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3374                 return 0;
3375
3376         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3377          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3378          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3379          * & group is just as nasty as acquiring a reference to the user keyring. */
3380
3381         saved_uid = getuid();
3382         saved_gid = getgid();
3383
3384         if (gid_is_valid(gid) && gid != saved_gid) {
3385                 if (setregid(gid, -1) < 0)
3386                         return log_exec_error_errno(context,
3387                                                     p,
3388                                                     errno,
3389                                                     "Failed to change GID for user keyring: %m");
3390         }
3391
3392         if (uid_is_valid(uid) && uid != saved_uid) {
3393                 if (setreuid(uid, -1) < 0) {
3394                         r = log_exec_error_errno(context,
3395                                                  p,
3396                                                  errno,
3397                                                  "Failed to change UID for user keyring: %m");
3398                         goto out;
3399                 }
3400         }
3401
3402         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3403         if (keyring == -1) {
3404                 if (errno == ENOSYS)
3405                         log_exec_debug_errno(context,
3406                                              p,
3407                                              errno,
3408                                              "Kernel keyring not supported, ignoring.");
3409                 else if (ERRNO_IS_PRIVILEGE(errno))
3410                         log_exec_debug_errno(context,
3411                                              p,
3412                                              errno,
3413                                              "Kernel keyring access prohibited, ignoring.");
3414                 else if (errno == EDQUOT)
3415                         log_exec_debug_errno(context,
3416                                              p,
3417                                              errno,
3418                                              "Out of kernel keyrings to allocate, ignoring.");
3419                 else
3420                         r = log_exec_error_errno(context,
3421                                                  p,
3422                                                  errno,
3423                                                  "Setting up kernel keyring failed: %m");
3424
3425                 goto out;
3426         }
3427
3428         /* When requested link the user keyring into the session keyring. */
3429         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3430
3431                 if (keyctl(KEYCTL_LINK,
3432                            KEY_SPEC_USER_KEYRING,
3433                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3434                         r = log_exec_error_errno(context,
3435                                                  p,
3436                                                  errno,
3437                                                  "Failed to link user keyring into session keyring: %m");
3438                         goto out;
3439                 }
3440         }
3441
3442         /* Restore uid/gid back */
3443         if (uid_is_valid(uid) && uid != saved_uid) {
3444                 if (setreuid(saved_uid, -1) < 0) {
3445                         r = log_exec_error_errno(context,
3446                                                  p,
3447                                                  errno,
3448                                                  "Failed to change UID back for user keyring: %m");
3449                         goto out;
3450                 }
3451         }
3452
3453         if (gid_is_valid(gid) && gid != saved_gid) {
3454                 if (setregid(saved_gid, -1) < 0)
3455                         return log_exec_error_errno(context,
3456                                                     p,
3457                                                     errno,
3458                                                     "Failed to change GID back for user keyring: %m");
3459         }
3460
3461         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3462         if (!sd_id128_is_null(p->invocation_id)) {
3463                 key_serial_t key;
3464
3465                 key = add_key("user",
3466                               "invocation_id",
3467                               &p->invocation_id,
3468                               sizeof(p->invocation_id),
3469                               KEY_SPEC_SESSION_KEYRING);
3470                 if (key == -1)
3471                         log_exec_debug_errno(context,
3472                                              p,
3473                                              errno,
3474                                              "Failed to add invocation ID to keyring, ignoring: %m");
3475                 else {
3476                         if (keyctl(KEYCTL_SETPERM, key,
3477                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3478                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3479                                 r = log_exec_error_errno(context,
3480                                                          p,
3481                                                          errno,
3482                                                          "Failed to restrict invocation ID permission: %m");
3483                 }
3484         }
3485
3486 out:
3487         /* Revert back uid & gid for the last time, and exit */
3488         /* no extra logging, as only the first already reported error matters */
3489         if (getuid() != saved_uid)
3490                 (void) setreuid(saved_uid, -1);
3491
3492         if (getgid() != saved_gid)
3493                 (void) setregid(saved_gid, -1);
3494
3495         return r;
3496 }
3497
3498 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
3499         assert(array);
3500         assert(n);
3501         assert(pair);
3502
3503         if (pair[0] >= 0)
3504                 array[(*n)++] = pair[0];
3505         if (pair[1] >= 0)
3506                 array[(*n)++] = pair[1];
3507 }
3508
3509 static int close_remaining_fds(
3510                 const ExecParameters *params,
3511                 const ExecRuntime *runtime,
3512                 int socket_fd,
3513                 const int *fds, size_t n_fds) {
3514
3515         size_t n_dont_close = 0;
3516         int dont_close[n_fds + 16];
3517
3518         assert(params);
3519
3520         if (params->stdin_fd >= 0)
3521                 dont_close[n_dont_close++] = params->stdin_fd;
3522         if (params->stdout_fd >= 0)
3523                 dont_close[n_dont_close++] = params->stdout_fd;
3524         if (params->stderr_fd >= 0)
3525                 dont_close[n_dont_close++] = params->stderr_fd;
3526
3527         if (socket_fd >= 0)
3528                 dont_close[n_dont_close++] = socket_fd;
3529         if (n_fds > 0) {
3530                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3531                 n_dont_close += n_fds;
3532         }
3533
3534         if (runtime)
3535                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3536
3537         if (runtime && runtime->shared) {
3538                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3539                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3540         }
3541
3542         if (runtime && runtime->dynamic_creds) {
3543                 if (runtime->dynamic_creds->user)
3544                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3545                 if (runtime->dynamic_creds->group)
3546                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3547         }
3548
3549         if (params->user_lookup_fd >= 0)
3550                 dont_close[n_dont_close++] = params->user_lookup_fd;
3551
3552         if (params->handoff_timestamp_fd >= 0)
3553                 dont_close[n_dont_close++] = params->handoff_timestamp_fd;
3554
3555         assert(n_dont_close <= ELEMENTSOF(dont_close));
3556
3557         return close_all_fds(dont_close, n_dont_close);
3558 }
3559
/* Writes the resolved 'uid' and 'gid', followed by the unit name, to 'user_lookup_fd' with a single
 * writev() call. Nothing is written if the fd is negative or if neither ID is valid.
 *
 * Returns 0 on success or if nothing was sent, -errno if writev() failed. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
         * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
         * specified. */

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3587
3588 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3589         int r;
3590
3591         assert(c);
3592         assert(home);
3593         assert(buf);
3594
3595         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3596
3597         if (*home)
3598                 return 0;
3599
3600         if (!c->working_directory_home)
3601                 return 0;
3602
3603         r = get_home_dir(buf);
3604         if (r < 0)
3605                 return r;
3606
3607         *home = *buf;
3608         return 1;
3609 }
3610
3611 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3612         _cleanup_strv_free_ char ** list = NULL;
3613         int r;
3614
3615         assert(c);
3616         assert(p);
3617         assert(ret);
3618
3619         assert(c->dynamic_user);
3620
3621         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3622          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3623          * directories. */
3624
3625         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3626                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3627                         continue;
3628
3629                 if (!p->prefix[t])
3630                         continue;
3631
3632                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3633                         char *e;
3634
3635                         if (exec_directory_is_private(c, t))
3636                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3637                         else
3638                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3639                         if (!e)
3640                                 return -ENOMEM;
3641
3642                         r = strv_consume(&list, e);
3643                         if (r < 0)
3644                                 return r;
3645                 }
3646         }
3647
3648         *ret = TAKE_PTR(list);
3649
3650         return 0;
3651 }
3652
3653 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3654         _cleanup_(cpu_set_reset) CPUSet s = {};
3655         int r;
3656
3657         assert(c);
3658         assert(ret);
3659
3660         if (!c->numa_policy.nodes.set) {
3661                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3662                 return 0;
3663         }
3664
3665         r = numa_to_cpu_set(&c->numa_policy, &s);
3666         if (r < 0)
3667                 return r;
3668
3669         cpu_set_reset(ret);
3670
3671         return cpu_set_add_all(ret, &s);
3672 }
3673
3674 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3675         int r;
3676
3677         assert(fds);
3678         assert(n_fds);
3679         assert(*n_fds < fds_size);
3680         assert(fd);
3681
3682         if (*fd < 0)
3683                return 0;
3684
3685         if (*fd < 3 + (int) *n_fds) {
3686                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3687                  * the fds we pass to the process (or which are closed only during execve). */
3688
3689                 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3690                 if (r < 0)
3691                         return -errno;
3692
3693                 close_and_replace(*fd, r);
3694         }
3695
3696         fds[(*n_fds)++] = *fd;
3697         return 1;
3698 }
3699
3700 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3701         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3702
3703         union sockaddr_union addr = {
3704                 .un.sun_family = AF_UNIX,
3705         };
3706         socklen_t sa_len;
3707         int r;
3708
3709         assert(c);
3710         assert(p);
3711         assert(of);
3712         assert(ofd >= 0);
3713
3714         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3715         if (r < 0)
3716                 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for '%s': %m", of->path);
3717         sa_len = r;
3718
3719         FOREACH_ELEMENT(i, socket_types) {
3720                 _cleanup_close_ int fd = -EBADF;
3721
3722                 fd = socket(AF_UNIX, *i|SOCK_CLOEXEC, 0);
3723                 if (fd < 0)
3724                         return log_exec_error_errno(c, p,
3725                                                     errno, "Failed to create socket for '%s': %m",
3726                                                     of->path);
3727
3728                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3729                 if (r >= 0)
3730                         return TAKE_FD(fd);
3731                 if (r != -EPROTOTYPE)
3732                         return log_exec_error_errno(c, p,
3733                                                     r, "Failed to connect to socket for '%s': %m",
3734                                                     of->path);
3735         }
3736
3737         return log_exec_error_errno(c, p,
3738                                     SYNTHETIC_ERRNO(EPROTOTYPE), "No suitable socket type to connect to socket '%s'.",
3739                                     of->path);
3740 }
3741
3742 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3743         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3744         struct stat st;
3745
3746         assert(c);
3747         assert(p);
3748         assert(of);
3749
3750         ofd = open(of->path, O_PATH | O_CLOEXEC);
3751         if (ofd < 0)
3752                 return log_exec_error_errno(c, p, errno, "Failed to open '%s' as O_PATH: %m", of->path);
3753
3754         if (fstat(ofd, &st) < 0)
3755                 return log_exec_error_errno(c, p, errno, "Failed to stat '%s': %m", of->path);
3756
3757         if (S_ISSOCK(st.st_mode)) {
3758                 fd = connect_unix_harder(c, p, of, ofd);
3759                 if (fd < 0)
3760                         return fd;
3761
3762                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3763                         return log_exec_error_errno(c, p,
3764                                                     errno, "Failed to shutdown send for socket '%s': %m",
3765                                                     of->path);
3766
3767                 log_exec_debug(c, p, "Opened socket '%s' as fd %d.", of->path, fd);
3768         } else {
3769                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3770                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3771                         flags |= O_APPEND;
3772                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3773                         flags |= O_TRUNC;
3774
3775                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3776                 if (fd < 0)
3777                         return log_exec_error_errno(c, p, fd, "Failed to reopen file '%s': %m", of->path);
3778
3779                 log_exec_debug(c, p, "Opened file '%s' as fd %d.", of->path, fd);
3780         }
3781
3782         return TAKE_FD(fd);
3783 }
3784
3785 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3786         int r;
3787
3788         assert(c);
3789         assert(p);
3790         assert(n_fds);
3791
3792         LIST_FOREACH(open_files, of, p->open_files) {
3793                 _cleanup_close_ int fd = -EBADF;
3794
3795                 fd = get_open_file_fd(c, p, of);
3796                 if (fd < 0) {
3797                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3798                                 log_exec_warning_errno(c, p, fd,
3799                                                        "Failed to get OpenFile= file descriptor for '%s', ignoring: %m",
3800                                                        of->path);
3801                                 continue;
3802                         }
3803
3804                         return fd;
3805                 }
3806
3807                 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3808                         return -ENOMEM;
3809
3810                 r = strv_extend(&p->fd_names, of->fdname);
3811                 if (r < 0)
3812                         return r;
3813
3814                 p->fds[(*n_fds)++] = TAKE_FD(fd);
3815         }
3816
3817         return 0;
3818 }
3819
3820 static void log_command_line(
3821                 const ExecContext *context,
3822                 const ExecParameters *params,
3823                 const char *msg,
3824                 const char *executable,
3825                 char **argv) {
3826
3827         assert(context);
3828         assert(params);
3829         assert(msg);
3830         assert(executable);
3831
3832         if (!DEBUG_LOGGING)
3833                 return;
3834
3835         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3836
3837         log_exec_struct(context, params, LOG_DEBUG,
3838                         "EXECUTABLE=%s", executable,
3839                         LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3840                         LOG_EXEC_INVOCATION_ID(params));
3841 }
3842
3843 static bool exec_context_need_unprivileged_private_users(
3844                 const ExecContext *context,
3845                 const ExecParameters *params) {
3846
3847         assert(context);
3848         assert(params);
3849
3850         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3851          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3852          * (system manager) then we have privileges and don't need this. */
3853         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3854                 return false;
3855
3856         return context->private_users ||
3857                context->private_tmp ||
3858                context->private_devices ||
3859                context->private_network ||
3860                context->network_namespace_path ||
3861                context->private_ipc ||
3862                context->ipc_namespace_path ||
3863                context->private_mounts > 0 ||
3864                context->mount_apivfs > 0 ||
3865                context->n_bind_mounts > 0 ||
3866                context->n_temporary_filesystems > 0 ||
3867                context->root_directory ||
3868                !strv_isempty(context->extension_directories) ||
3869                context->protect_system != PROTECT_SYSTEM_NO ||
3870                context->protect_home != PROTECT_HOME_NO ||
3871                context->protect_kernel_tunables ||
3872                context->protect_kernel_modules ||
3873                context->protect_kernel_logs ||
3874                context->protect_control_groups ||
3875                context->protect_clock ||
3876                context->protect_hostname ||
3877                !strv_isempty(context->read_write_paths) ||
3878                !strv_isempty(context->read_only_paths) ||
3879                !strv_isempty(context->inaccessible_paths) ||
3880                !strv_isempty(context->exec_paths) ||
3881                !strv_isempty(context->no_exec_paths);
3882 }
3883
3884 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3885         assert(context);
3886
3887         if (confirm_spawn_disabled())
3888                 return false;
3889
3890         /* For some reasons units remaining in the same process group
3891          * as PID 1 fail to acquire the console even if it's not used
3892          * by any process. So skip the confirmation question for them. */
3893         return !context->same_pgrp;
3894 }
3895
3896 static int exec_context_named_iofds(
3897                 const ExecContext *c,
3898                 const ExecParameters *p,
3899                 int named_iofds[static 3]) {
3900
3901         size_t targets;
3902         const char* stdio_fdname[3];
3903         size_t n_fds;
3904
3905         assert(c);
3906         assert(p);
3907         assert(named_iofds);
3908
3909         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3910                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3911                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3912
3913         for (size_t i = 0; i < 3; i++)
3914                 stdio_fdname[i] = exec_context_fdname(c, i);
3915
3916         n_fds = p->n_storage_fds + p->n_socket_fds;
3917
3918         for (size_t i = 0; i < n_fds  && targets > 0; i++)
3919                 if (named_iofds[STDIN_FILENO] < 0 &&
3920                     c->std_input == EXEC_INPUT_NAMED_FD &&
3921                     stdio_fdname[STDIN_FILENO] &&
3922                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3923
3924                         named_iofds[STDIN_FILENO] = p->fds[i];
3925                         targets--;
3926
3927                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3928                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3929                            stdio_fdname[STDOUT_FILENO] &&
3930                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3931
3932                         named_iofds[STDOUT_FILENO] = p->fds[i];
3933                         targets--;
3934
3935                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3936                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3937                            stdio_fdname[STDERR_FILENO] &&
3938                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3939
3940                         named_iofds[STDERR_FILENO] = p->fds[i];
3941                         targets--;
3942                 }
3943
3944         return targets == 0 ? 0 : -ENOENT;
3945 }
3946
3947 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3948         if (!shared)
3949                 return;
3950
3951         safe_close_pair(shared->netns_storage_socket);
3952         safe_close_pair(shared->ipcns_storage_socket);
3953 }
3954
3955 static void exec_runtime_close(ExecRuntime *rt) {
3956         if (!rt)
3957                 return;
3958
3959         safe_close_pair(rt->ephemeral_storage_socket);
3960
3961         exec_shared_runtime_close(rt->shared);
3962         dynamic_creds_close(rt->dynamic_creds);
3963 }
3964
3965 static void exec_params_close(ExecParameters *p) {
3966         if (!p)
3967                 return;
3968
3969         p->stdin_fd = safe_close(p->stdin_fd);
3970         p->stdout_fd = safe_close(p->stdout_fd);
3971         p->stderr_fd = safe_close(p->stderr_fd);
3972 }
3973
3974 static int exec_fd_mark_hot(
3975                 const ExecContext *c,
3976                 ExecParameters *p,
3977                 bool hot,
3978                 int *reterr_exit_status) {
3979
3980         assert(c);
3981         assert(p);
3982
3983         if (p->exec_fd < 0)
3984                 return 0;
3985
3986         uint8_t x = hot;
3987
3988         if (write(p->exec_fd, &x, sizeof(x)) < 0) {
3989                 if (reterr_exit_status)
3990                         *reterr_exit_status = EXIT_EXEC;
3991                 return log_exec_error_errno(c, p, errno, "Failed to mark exec_fd as %s: %m", hot ? "hot" : "cold");
3992         }
3993
3994         return 1;
3995 }
3996
3997 static int send_handoff_timestamp(
3998                 const ExecContext *c,
3999                 ExecParameters *p,
4000                 int *reterr_exit_status) {
4001
4002         assert(c);
4003         assert(p);
4004
4005         if (p->handoff_timestamp_fd < 0)
4006                 return 0;
4007
4008         dual_timestamp dt;
4009         dual_timestamp_now(&dt);
4010
4011         if (send(p->handoff_timestamp_fd, (const usec_t[2]) { dt.realtime, dt.monotonic }, sizeof(usec_t) * 2, 0) < 0) {
4012                 if (reterr_exit_status)
4013                         *reterr_exit_status = EXIT_EXEC;
4014                 return log_exec_error_errno(c, p, errno, "Failed to send handoff timestamp: %m");
4015         }
4016
4017         return 1;
4018 }
4019
4020 int exec_invoke(
4021                 const ExecCommand *command,
4022                 const ExecContext *context,
4023                 ExecParameters *params,
4024                 ExecRuntime *runtime,
4025                 const CGroupContext *cgroup_context,
4026                 int *exit_status) {
4027
4028         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4029         int r, ngids = 0;
4030         _cleanup_free_ gid_t *supplementary_gids = NULL;
4031         const char *username = NULL, *groupname = NULL;
4032         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4033         const char *home = NULL, *shell = NULL;
4034         char **final_argv = NULL;
4035         dev_t journal_stream_dev = 0;
4036         ino_t journal_stream_ino = 0;
4037         bool userns_set_up = false;
4038         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4039                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4040                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4041                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4042         bool keep_seccomp_privileges = false;
4043 #if HAVE_SELINUX
4044         _cleanup_free_ char *mac_selinux_context_net = NULL;
4045         bool use_selinux = false;
4046 #endif
4047 #if ENABLE_SMACK
4048         bool use_smack = false;
4049 #endif
4050 #if HAVE_APPARMOR
4051         bool use_apparmor = false;
4052 #endif
4053 #if HAVE_SECCOMP
4054         uint64_t saved_bset = 0;
4055 #endif
4056         uid_t saved_uid = getuid();
4057         gid_t saved_gid = getgid();
4058         uid_t uid = UID_INVALID;
4059         gid_t gid = GID_INVALID;
4060         size_t n_fds, /* fds to pass to the child */
4061                n_keep_fds; /* total number of fds not to close */
4062         int secure_bits;
4063         _cleanup_free_ gid_t *gids_after_pam = NULL;
4064         int ngids_after_pam = 0;
4065
4066         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4067         size_t n_storage_fds, n_socket_fds;
4068
4069         assert(command);
4070         assert(context);
4071         assert(params);
4072         assert(exit_status);
4073
4074         /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4075          * and is already applied earlier. Just for safety. */
4076         if (context->log_level_max >= 0)
4077                 log_set_max_level(context->log_level_max);
4078
4079         /* Explicitly test for CVE-2021-4034 inspired invocations */
4080         if (!command->path || strv_isempty(command->argv)) {
4081                 *exit_status = EXIT_EXEC;
4082                 return log_exec_error_errno(
4083                                 context,
4084                                 params,
4085                                 SYNTHETIC_ERRNO(EINVAL),
4086                                 "Invalid command line arguments.");
4087         }
4088
4089         LOG_CONTEXT_PUSH_EXEC(context, params);
4090
4091         if (context->std_input == EXEC_INPUT_SOCKET ||
4092             context->std_output == EXEC_OUTPUT_SOCKET ||
4093             context->std_error == EXEC_OUTPUT_SOCKET) {
4094
4095                 if (params->n_socket_fds > 1)
4096                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4097
4098                 if (params->n_socket_fds == 0)
4099                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4100
4101                 socket_fd = params->fds[0];
4102                 n_storage_fds = n_socket_fds = 0;
4103         } else {
4104                 n_socket_fds = params->n_socket_fds;
4105                 n_storage_fds = params->n_storage_fds;
4106         }
4107         n_fds = n_socket_fds + n_storage_fds;
4108
4109         r = exec_context_named_iofds(context, params, named_iofds);
4110         if (r < 0)
4111                 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4112
4113         rename_process_from_path(command->path);
4114
4115         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4116          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4117          * both of which will be demoted to SIG_DFL. */
4118         (void) default_signals(SIGNALS_CRASH_HANDLER,
4119                                SIGNALS_IGNORE);
4120
4121         if (context->ignore_sigpipe)
4122                 (void) ignore_signals(SIGPIPE);
4123
4124         r = reset_signal_mask();
4125         if (r < 0) {
4126                 *exit_status = EXIT_SIGNAL_MASK;
4127                 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4128         }
4129
4130         if (params->idle_pipe)
4131                 do_idle_pipe_dance(params->idle_pipe);
4132
4133         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4134          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4135          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4136          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4137
4138         log_forget_fds();
4139         log_set_open_when_needed(true);
4140         log_settle_target();
4141
4142         /* In case anything used libc syslog(), close this here, too */
4143         closelog();
4144
4145         r = collect_open_file_fds(context, params, &n_fds);
4146         if (r < 0) {
4147                 *exit_status = EXIT_FDS;
4148                 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4149         }
4150
4151         int keep_fds[n_fds + 4];
4152         memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4153         n_keep_fds = n_fds;
4154
4155         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4156         if (r < 0) {
4157                 *exit_status = EXIT_FDS;
4158                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4159         }
4160
4161         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->handoff_timestamp_fd);
4162         if (r < 0) {
4163                 *exit_status = EXIT_FDS;
4164                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4165         }
4166
4167 #if HAVE_LIBBPF
4168         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4169         if (r < 0) {
4170                 *exit_status = EXIT_FDS;
4171                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4172         }
4173 #endif
4174
4175         r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4176         if (r < 0) {
4177                 *exit_status = EXIT_FDS;
4178                 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4179         }
4180
4181         if (!context->same_pgrp &&
4182             setsid() < 0) {
4183                 *exit_status = EXIT_SETSID;
4184                 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4185         }
4186
4187         exec_context_tty_reset(context, params);
4188
4189         if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4190                 _cleanup_free_ char *cmdline = NULL;
4191
4192                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4193                 if (!cmdline) {
4194                         *exit_status = EXIT_MEMORY;
4195                         return log_oom();
4196                 }
4197
4198                 r = ask_for_confirmation(context, params, cmdline);
4199                 if (r != CONFIRM_EXECUTE) {
4200                         if (r == CONFIRM_PRETEND_SUCCESS) {
4201                                 *exit_status = EXIT_SUCCESS;
4202                                 return 0;
4203                         }
4204
4205                         *exit_status = EXIT_CONFIRM;
4206                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4207                                                     "Execution cancelled by the user.");
4208                 }
4209         }
4210
4211         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4212          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4213          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4214          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4215          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4216         if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4217             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4218                 *exit_status = EXIT_MEMORY;
4219                 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4220         }
4221
4222         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4223                 _cleanup_strv_free_ char **suggested_paths = NULL;
4224
4225                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4226                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4227                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4228                         *exit_status = EXIT_USER;
4229                         return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4230                 }
4231
4232                 r = compile_suggested_paths(context, params, &suggested_paths);
4233                 if (r < 0) {
4234                         *exit_status = EXIT_MEMORY;
4235                         return log_oom();
4236                 }
4237
4238                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4239                 if (r < 0) {
4240                         *exit_status = EXIT_USER;
4241                         if (r == -EILSEQ)
4242                                 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4243                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4244                         return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4245                 }
4246
4247                 if (!uid_is_valid(uid)) {
4248                         *exit_status = EXIT_USER;
4249                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\".", uid);
4250                 }
4251
4252                 if (!gid_is_valid(gid)) {
4253                         *exit_status = EXIT_USER;
4254                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\".", gid);
4255                 }
4256
4257                 if (runtime->dynamic_creds->user)
4258                         username = runtime->dynamic_creds->user->name;
4259
4260         } else {
4261                 if (context->user) {
4262                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4263                         if (r < 0) {
4264                                 *exit_status = EXIT_USER;
4265                                 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4266                         }
4267                 }
4268
4269                 if (context->group) {
4270                         r = get_fixed_group(context->group, &groupname, &gid);
4271                         if (r < 0) {
4272                                 *exit_status = EXIT_GROUP;
4273                                 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4274                         }
4275                 }
4276         }
4277
4278         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4279         r = get_supplementary_groups(context, username, groupname, gid,
4280                                      &supplementary_gids, &ngids);
4281         if (r < 0) {
4282                 *exit_status = EXIT_GROUP;
4283                 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4284         }
4285
4286         r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4287         if (r < 0) {
4288                 *exit_status = EXIT_USER;
4289                 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4290         }
4291
4292         params->user_lookup_fd = safe_close(params->user_lookup_fd);
4293
4294         r = acquire_home(context, uid, &home, &home_buffer);
4295         if (r < 0) {
4296                 *exit_status = EXIT_CHDIR;
4297                 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4298         }
4299
4300         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4301         if (socket_fd >= 0)
4302                 (void) fd_nonblock(socket_fd, false);
4303
4304         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4305          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4306         if (params->cgroup_path) {
4307                 _cleanup_free_ char *p = NULL;
4308
4309                 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4310                 if (r < 0) {
4311                         *exit_status = EXIT_CGROUP;
4312                         return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4313                 }
4314
4315                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4316                 if (r == -EUCLEAN) {
4317                         *exit_status = EXIT_CGROUP;
4318                         return log_exec_error_errno(context, params, r,
4319                                                     "Failed to attach process to cgroup '%s', "
4320                                                     "because the cgroup or one of its parents or "
4321                                                     "siblings is in the threaded mode.", p);
4322                 }
4323                 if (r < 0) {
4324                         *exit_status = EXIT_CGROUP;
4325                         return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4326                 }
4327         }
4328
4329         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4330                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4331                 if (r < 0) {
4332                         *exit_status = EXIT_NETWORK;
4333                         return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4334                 }
4335         }
4336
4337         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4338                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4339                 if (r < 0) {
4340                         *exit_status = EXIT_NAMESPACE;
4341                         return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4342                 }
4343         }
4344
4345         r = setup_input(context, params, socket_fd, named_iofds);
4346         if (r < 0) {
4347                 *exit_status = EXIT_STDIN;
4348                 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4349         }
4350
4351         _cleanup_free_ char *fname = NULL;
4352         r = path_extract_filename(command->path, &fname);
4353         if (r < 0) {
4354                 *exit_status = EXIT_STDOUT;
4355                 return log_exec_error_errno(context, params, r, "Failed to extract filename from path %s: %m", command->path);
4356         }
4357
4358         r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
4359         if (r < 0) {
4360                 *exit_status = EXIT_STDOUT;
4361                 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4362         }
4363
4364         r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, fname, uid, gid, &journal_stream_dev, &journal_stream_ino);
4365         if (r < 0) {
4366                 *exit_status = EXIT_STDERR;
4367                 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4368         }
4369
4370         if (context->oom_score_adjust_set) {
4371                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4372                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4373                 r = set_oom_score_adjust(context->oom_score_adjust);
4374                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4375                         log_exec_debug_errno(context, params, r,
4376                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4377                 else if (r < 0) {
4378                         *exit_status = EXIT_OOM_ADJUST;
4379                         return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4380                 }
4381         }
4382
4383         if (context->coredump_filter_set) {
4384                 r = set_coredump_filter(context->coredump_filter);
4385                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4386                         log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4387                 else if (r < 0) {
4388                         *exit_status = EXIT_LIMITS;
4389                         return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4390                 }
4391         }
4392
4393         if (context->nice_set) {
4394                 r = setpriority_closest(context->nice);
4395                 if (r < 0) {
4396                         *exit_status = EXIT_NICE;
4397                         return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4398                 }
4399         }
4400
4401         if (context->cpu_sched_set) {
4402                 struct sched_param param = {
4403                         .sched_priority = context->cpu_sched_priority,
4404                 };
4405
4406                 r = sched_setscheduler(0,
4407                                        context->cpu_sched_policy |
4408                                        (context->cpu_sched_reset_on_fork ?
4409                                         SCHED_RESET_ON_FORK : 0),
4410                                        &param);
4411                 if (r < 0) {
4412                         *exit_status = EXIT_SETSCHEDULER;
4413                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4414                 }
4415         }
4416
4417         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4418                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4419                 const CPUSet *cpu_set;
4420
4421                 if (context->cpu_affinity_from_numa) {
4422                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4423                         if (r < 0) {
4424                                 *exit_status = EXIT_CPUAFFINITY;
4425                                 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4426                         }
4427
4428                         cpu_set = &converted_cpu_set;
4429                 } else
4430                         cpu_set = &context->cpu_set;
4431
4432                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4433                         *exit_status = EXIT_CPUAFFINITY;
4434                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4435                 }
4436         }
4437
4438         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4439                 r = apply_numa_policy(&context->numa_policy);
4440                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4441                         log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4442                 else if (r < 0) {
4443                         *exit_status = EXIT_NUMA_POLICY;
4444                         return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4445                 }
4446         }
4447
4448         if (context->ioprio_set)
4449                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4450                         *exit_status = EXIT_IOPRIO;
4451                         return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4452                 }
4453
4454         if (context->timer_slack_nsec != NSEC_INFINITY)
4455                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4456                         *exit_status = EXIT_TIMERSLACK;
4457                         return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4458                 }
4459
4460         if (context->personality != PERSONALITY_INVALID) {
4461                 r = safe_personality(context->personality);
4462                 if (r < 0) {
4463                         *exit_status = EXIT_PERSONALITY;
4464                         return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4465                 }
4466         }
4467
4468 #if ENABLE_UTMP
4469         if (context->utmp_id) {
4470                 _cleanup_free_ char *username_alloc = NULL;
4471
4472                 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
4473                         username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
4474                         if (!username_alloc) {
4475                                 *exit_status = EXIT_USER;
4476                                 return log_oom();
4477                         }
4478                 }
4479
4480                 const char *line = context->tty_path ?
4481                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4482                         NULL;
4483                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4484                                       line,
4485                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4486                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4487                                       USER_PROCESS,
4488                                       username ?: username_alloc);
4489         }
4490 #endif
4491
4492         if (uid_is_valid(uid)) {
4493                 r = chown_terminal(STDIN_FILENO, uid);
4494                 if (r < 0) {
4495                         *exit_status = EXIT_STDIN;
4496                         return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4497                 }
4498         }
4499
4500         if (params->cgroup_path) {
4501                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4502                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4503                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4504                  * touch a single hierarchy too. */
4505
4506                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4507                         _cleanup_free_ char *p = NULL;
4508
4509                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4510                         if (r < 0) {
4511                                 *exit_status = EXIT_CGROUP;
4512                                 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4513                         }
4514
4515                         r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4516                         if (r < 0) {
4517                                 *exit_status = EXIT_CGROUP;
4518                                 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4519                         }
4520                         if (r > 0) {
4521                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4522                                 if (r < 0) {
4523                                         *exit_status = EXIT_CGROUP;
4524                                         return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4525                                 }
4526                         }
4527                 }
4528
4529                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4530                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4531                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4532                                 if (r < 0) {
4533                                         *exit_status = EXIT_MEMORY;
4534                                         return log_oom();
4535                                 }
4536
4537                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4538                                 if (r < 0) {
4539                                         log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4540                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4541                                         memory_pressure_path = mfree(memory_pressure_path);
4542                                 }
4543                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4544                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4545                                 if (!memory_pressure_path) {
4546                                         *exit_status = EXIT_MEMORY;
4547                                         return log_oom();
4548                                 }
4549                         }
4550                 }
4551         }
4552
4553         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4554
4555         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4556                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4557                 if (r < 0)
4558                         return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4559         }
4560
4561         r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4562         if (r < 0) {
4563                 *exit_status = EXIT_CREDENTIALS;
4564                 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4565         }
4566
4567         r = build_environment(
4568                         context,
4569                         params,
4570                         cgroup_context,
4571                         n_fds,
4572                         home,
4573                         username,
4574                         shell,
4575                         journal_stream_dev,
4576                         journal_stream_ino,
4577                         memory_pressure_path,
4578                         &our_env);
4579         if (r < 0) {
4580                 *exit_status = EXIT_MEMORY;
4581                 return log_oom();
4582         }
4583
4584         r = build_pass_environment(context, &pass_env);
4585         if (r < 0) {
4586                 *exit_status = EXIT_MEMORY;
4587                 return log_oom();
4588         }
4589
4590         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4591          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4592          * not specify PATH but the unit has ExecSearchPath. */
4593         if (!strv_isempty(context->exec_search_path)) {
4594                 _cleanup_free_ char *joined = NULL;
4595
4596                 joined = strv_join(context->exec_search_path, ":");
4597                 if (!joined) {
4598                         *exit_status = EXIT_MEMORY;
4599                         return log_oom();
4600                 }
4601
4602                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4603                 if (r < 0) {
4604                         *exit_status = EXIT_MEMORY;
4605                         return log_oom();
4606                 }
4607         }
4608
4609         accum_env = strv_env_merge(params->environment,
4610                                    our_env,
4611                                    joined_exec_search_path,
4612                                    pass_env,
4613                                    context->environment,
4614                                    params->files_env);
4615         if (!accum_env) {
4616                 *exit_status = EXIT_MEMORY;
4617                 return log_oom();
4618         }
4619         accum_env = strv_env_clean(accum_env);
4620
4621         (void) umask(context->umask);
4622
4623         r = setup_keyring(context, params, uid, gid);
4624         if (r < 0) {
4625                 *exit_status = EXIT_KEYRING;
4626                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4627         }
4628
4629         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4630          * from it. */
4631         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4632
4633         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4634          * for it, and the kernel doesn't actually support ambient caps. */
4635         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4636
4637         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4638          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4639          * desired. */
4640         if (needs_ambient_hack)
4641                 needs_setuid = false;
4642         else
4643                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4644
4645         uint64_t capability_ambient_set = context->capability_ambient_set;
4646
4647         if (needs_sandboxing) {
4648                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4649                  * /sys being present. The actual MAC context application will happen later, as late as
4650                  * possible, to avoid impacting our own code paths. */
4651
4652 #if HAVE_SELINUX
4653                 use_selinux = mac_selinux_use();
4654 #endif
4655 #if ENABLE_SMACK
4656                 use_smack = mac_smack_use();
4657 #endif
4658 #if HAVE_APPARMOR
4659                 use_apparmor = mac_apparmor_use();
4660 #endif
4661         }
4662
4663         if (needs_sandboxing) {
4664                 int which_failed;
4665
4666                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4667                  * is set here. (See below.) */
4668
4669                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4670                 if (r < 0) {
4671                         *exit_status = EXIT_LIMITS;
4672                         return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4673                 }
4674         }
4675
4676         if (needs_setuid && context->pam_name && username) {
4677                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4678                  * wins here. (See above.) */
4679
4680                 /* All fds passed in the fds array will be closed in the pam child process. */
4681                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4682                 if (r < 0) {
4683                         *exit_status = EXIT_PAM;
4684                         return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4685                 }
4686
4687                 if (ambient_capabilities_supported()) {
4688                         uint64_t ambient_after_pam;
4689
4690                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4691                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4692                         r = capability_get_ambient(&ambient_after_pam);
4693                         if (r < 0) {
4694                                 *exit_status = EXIT_CAPABILITIES;
4695                                 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4696                         }
4697
4698                         capability_ambient_set |= ambient_after_pam;
4699                 }
4700
4701                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4702                 if (ngids_after_pam < 0) {
4703                         *exit_status = EXIT_GROUP;
4704                         return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4705                 }
4706         }
4707
4708         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4709                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4710                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4711                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4712
4713                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4714                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4715                  * the actual requested operations fail (or silently continue). */
4716                 if (r < 0 && context->private_users) {
4717                         *exit_status = EXIT_USER;
4718                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4719                 }
4720                 if (r < 0)
4721                         log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4722                 else
4723                         userns_set_up = true;
4724         }
4725
4726         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4727
4728                 /* Try to enable network namespacing if network namespacing is available and we have
4729                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4730                  * new network namespace. And if we don't have that, then we could only create a network
4731                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4732                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4733                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4734                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4735                                 log_exec_notice_errno(context, params, r,
4736                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4737                         else if (r < 0) {
4738                                 *exit_status = EXIT_NETWORK;
4739                                 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4740                         }
4741                 } else if (context->network_namespace_path) {
4742                         *exit_status = EXIT_NETWORK;
4743                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4744                                                     "NetworkNamespacePath= is not supported, refusing.");
4745                 } else
4746                         log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4747         }
4748
4749         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4750
4751                 if (ns_type_supported(NAMESPACE_IPC)) {
4752                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4753                         if (r == -EPERM)
4754                                 log_exec_warning_errno(context, params, r,
4755                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4756                         else if (r < 0) {
4757                                 *exit_status = EXIT_NAMESPACE;
4758                                 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4759                         }
4760                 } else if (context->ipc_namespace_path) {
4761                         *exit_status = EXIT_NAMESPACE;
4762                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4763                                                     "IPCNamespacePath= is not supported, refusing.");
4764                 } else
4765                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4766         }
4767
4768         if (needs_mount_namespace) {
4769                 _cleanup_free_ char *error_path = NULL;
4770
4771                 r = apply_mount_namespace(command->flags,
4772                                           context,
4773                                           params,
4774                                           runtime,
4775                                           memory_pressure_path,
4776                                           needs_sandboxing,
4777                                           &error_path);
4778                 if (r < 0) {
4779                         *exit_status = EXIT_NAMESPACE;
4780                         return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4781                                                     error_path ? ": " : "", strempty(error_path));
4782                 }
4783         }
4784
4785         if (needs_sandboxing) {
4786                 r = apply_protect_hostname(context, params, exit_status);
4787                 if (r < 0)
4788                         return r;
4789         }
4790
4791         if (context->memory_ksm >= 0)
4792                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm, 0, 0, 0) < 0) {
4793                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4794                                 log_exec_debug_errno(context,
4795                                                      params,
4796                                                      errno,
4797                                                      "KSM support not available, ignoring.");
4798                         else {
4799                                 *exit_status = EXIT_KSM;
4800                                 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4801                         }
4802                 }
4803
4804         /* Drop groups as early as possible.
4805          * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4806          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4807         if (needs_setuid) {
4808                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4809                 int ngids_to_enforce = 0;
4810
4811                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4812                                                    ngids,
4813                                                    gids_after_pam,
4814                                                    ngids_after_pam,
4815                                                    &gids_to_enforce);
4816                 if (ngids_to_enforce < 0) {
4817                         *exit_status = EXIT_GROUP;
4818                         return log_exec_error_errno(context, params,
4819                                                     ngids_to_enforce,
4820                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4821                 }
4822
4823                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4824                 if (r < 0) {
4825                         *exit_status = EXIT_GROUP;
4826                         return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4827                 }
4828         }
4829
4830         /* If the user namespace was not set up above, try to do it now.
4831          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4832          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4833          * case of mount namespaces being less privileged when the mount point list is copied from a
4834          * different user namespace). */
4835
4836         if (needs_sandboxing && context->private_users && !userns_set_up) {
4837                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4838                 if (r < 0) {
4839                         *exit_status = EXIT_USER;
4840                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4841                 }
4842         }
4843
4844         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4845          * shall execute. */
4846
4847         _cleanup_free_ char *executable = NULL;
4848         _cleanup_close_ int executable_fd = -EBADF;
4849         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4850         if (r < 0) {
4851                 *exit_status = EXIT_EXEC;
4852                 log_exec_struct_errno(context, params, LOG_NOTICE, r,
4853                                       "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4854                                       LOG_EXEC_MESSAGE(params,
4855                                                        "Unable to locate executable '%s': %m",
4856                                                        command->path),
4857                                       "EXECUTABLE=%s", command->path);
4858                 /* If the error will be ignored by manager, tune down the log level here. Missing executable
4859                  * is very much expected in this case. */
4860                 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
4861         }
4862
4863         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4864         if (r < 0) {
4865                 *exit_status = EXIT_FDS;
4866                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4867         }
4868
4869 #if HAVE_SELINUX
4870         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4871                 int fd = -EBADF;
4872
4873                 if (socket_fd >= 0)
4874                         fd = socket_fd;
4875                 else if (params->n_socket_fds == 1)
4876                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4877                          * use context from that fd to compute the label. */
4878                         fd = params->fds[0];
4879
4880                 if (fd >= 0) {
4881                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4882                         if (r < 0) {
4883                                 if (!context->selinux_context_ignore) {
4884                                         *exit_status = EXIT_SELINUX_CONTEXT;
4885                                         return log_exec_error_errno(context,
4886                                                                     params,
4887                                                                     r,
4888                                                                     "Failed to determine SELinux context: %m");
4889                                 }
4890                                 log_exec_debug_errno(context,
4891                                                      params,
4892                                                      r,
4893                                                      "Failed to determine SELinux context, ignoring: %m");
4894                         }
4895                 }
4896         }
4897 #endif
4898
4899         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4900          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4901          * more. We do keep exec_fd and handoff_timestamp_fd however, if we have it, since we need to keep
4902          * them open until the final execve(). But first, close the remaining sockets in the context
4903          * objects. */
4904
4905         exec_runtime_close(runtime);
4906         exec_params_close(params);
4907
4908         r = close_all_fds(keep_fds, n_keep_fds);
4909         if (r >= 0)
4910                 r = pack_fds(params->fds, n_fds);
4911         if (r >= 0)
4912                 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4913         if (r < 0) {
4914                 *exit_status = EXIT_FDS;
4915                 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4916         }
4917
4918         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4919          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4920          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4921          * came this far. */
4922
4923         secure_bits = context->secure_bits;
4924
4925         if (needs_sandboxing) {
4926                 uint64_t bset;
4927
4928                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4929                  * (Note this is placed after the general resource limit initialization, see above, in order
4930                  * to take precedence.) */
4931                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4932                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4933                                 *exit_status = EXIT_LIMITS;
4934                                 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4935                         }
4936                 }
4937
4938 #if ENABLE_SMACK
4939                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4940                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4941                 if (use_smack && context->smack_process_label) {
4942                         r = setup_smack(params, context, executable_fd);
4943                         if (r < 0 && !context->smack_process_label_ignore) {
4944                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4945                                 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4946                         }
4947                 }
4948 #endif
4949
4950                 bset = context->capability_bounding_set;
4951                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4952                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4953                  * instead of us doing that */
4954                 if (needs_ambient_hack)
4955                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4956                                 (UINT64_C(1) << CAP_SETUID) |
4957                                 (UINT64_C(1) << CAP_SETGID);
4958
4959 #if HAVE_SECCOMP
4960                 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4961                  * keep the needed privileges to apply it even if we're not root. */
4962                 if (needs_setuid &&
4963                     uid_is_valid(uid) &&
4964                     context_has_seccomp(context) &&
4965                     seccomp_allows_drop_privileges(context)) {
4966                         keep_seccomp_privileges = true;
4967
4968                         if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4969                                 *exit_status = EXIT_USER;
4970                                 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4971                         }
4972
4973                         /* Save the current bounding set so we can restore it after applying the seccomp
4974                          * filter */
4975                         saved_bset = bset;
4976                         bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4977                                 (UINT64_C(1) << CAP_SETPCAP);
4978                 }
4979 #endif
4980
4981                 if (!cap_test_all(bset)) {
4982                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4983                         if (r < 0) {
4984                                 *exit_status = EXIT_CAPABILITIES;
4985                                 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4986                         }
4987                 }
4988
4989                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4990                  * keep-caps set.
4991                  *
4992                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4993                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4994                  * the ambient capabilities can be raised as they are present in the permitted and
4995                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4996                  * without changing the user, so we also set the ambient capabilities here.
4997                  *
4998                  * The requested ambient capabilities are raised in the inheritable set if the second
4999                  * argument is true. */
5000                 if (!needs_ambient_hack) {
5001                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5002                         if (r < 0) {
5003                                 *exit_status = EXIT_CAPABILITIES;
5004                                 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
5005                         }
5006                 }
5007         }
5008
5009         /* chroot to root directory first, before we lose the ability to chroot */
5010         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5011         if (r < 0)
5012                 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
5013
5014         if (needs_setuid) {
5015                 if (uid_is_valid(uid)) {
5016                         r = enforce_user(context, uid, capability_ambient_set);
5017                         if (r < 0) {
5018                                 *exit_status = EXIT_USER;
5019                                 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
5020                         }
5021
5022                         if (keep_seccomp_privileges) {
5023                                 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
5024                                         r = drop_capability(CAP_SETUID);
5025                                         if (r < 0) {
5026                                                 *exit_status = EXIT_USER;
5027                                                 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
5028                                         }
5029                                 }
5030
5031                                 r = keep_capability(CAP_SYS_ADMIN);
5032                                 if (r < 0) {
5033                                         *exit_status = EXIT_USER;
5034                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
5035                                 }
5036
5037                                 r = keep_capability(CAP_SETPCAP);
5038                                 if (r < 0) {
5039                                         *exit_status = EXIT_USER;
5040                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
5041                                 }
5042                         }
5043
5044                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5045
5046                                 /* Raise the ambient capabilities after user change. */
5047                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5048                                 if (r < 0) {
5049                                         *exit_status = EXIT_CAPABILITIES;
5050                                         return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
5051                                 }
5052                         }
5053                 }
5054         }
5055
5056         /* Apply working directory here, because the working directory might be on NFS and only the user
5057          * running this service might have the correct privilege to change to the working directory. Also, it
5058          * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5059          * the cwd cannot be used to pin directories outside of the sandbox. */
5060         r = apply_working_directory(context, params, runtime, home, exit_status);
5061         if (r < 0)
5062                 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5063
5064         if (needs_sandboxing) {
5065                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5066                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5067                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5068                  * are restricted. */
5069
5070 #if HAVE_SELINUX
5071                 if (use_selinux) {
5072                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5073
5074                         if (exec_context) {
5075                                 r = setexeccon(exec_context);
5076                                 if (r < 0) {
5077                                         if (!context->selinux_context_ignore) {
5078                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5079                                                 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5080                                         }
5081                                         log_exec_debug_errno(context,
5082                                                              params,
5083                                                              r,
5084                                                              "Failed to change SELinux context to %s, ignoring: %m",
5085                                                              exec_context);
5086                                 }
5087                         }
5088                 }
5089 #endif
5090
5091 #if HAVE_APPARMOR
5092                 if (use_apparmor && context->apparmor_profile) {
5093                         r = aa_change_onexec(context->apparmor_profile);
5094                         if (r < 0 && !context->apparmor_profile_ignore) {
5095                                 *exit_status = EXIT_APPARMOR_PROFILE;
5096                                 return log_exec_error_errno(context,
5097                                                             params,
5098                                                             errno,
5099                                                             "Failed to prepare AppArmor profile change to %s: %m",
5100                                                             context->apparmor_profile);
5101                         }
5102                 }
5103 #endif
5104
5105                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5106                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5107                  * requires CAP_SETPCAP. */
5108                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5109                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5110                          * effective set here.
5111                          *
5112                          * The effective set is overwritten during execve() with the following values:
5113                          *
5114                          * - ambient set (for non-root processes)
5115                          *
5116                          * - (inheritable | bounding) set (for root processes)
5117                          *
5118                          * Hence there is no security impact to raise it in the effective set before execve
5119                          */
5120                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5121                         if (r < 0) {
5122                                 *exit_status = EXIT_CAPABILITIES;
5123                                 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5124                         }
5125                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5126                                 *exit_status = EXIT_SECUREBITS;
5127                                 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5128                         }
5129                 }
5130
5131                 if (context_has_no_new_privileges(context))
5132                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5133                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5134                                 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5135                         }
5136
5137 #if HAVE_SECCOMP
5138                 r = apply_address_families(context, params);
5139                 if (r < 0) {
5140                         *exit_status = EXIT_ADDRESS_FAMILIES;
5141                         return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5142                 }
5143
5144                 r = apply_memory_deny_write_execute(context, params);
5145                 if (r < 0) {
5146                         *exit_status = EXIT_SECCOMP;
5147                         return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5148                 }
5149
5150                 r = apply_restrict_realtime(context, params);
5151                 if (r < 0) {
5152                         *exit_status = EXIT_SECCOMP;
5153                         return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5154                 }
5155
5156                 r = apply_restrict_suid_sgid(context, params);
5157                 if (r < 0) {
5158                         *exit_status = EXIT_SECCOMP;
5159                         return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5160                 }
5161
5162                 r = apply_restrict_namespaces(context, params);
5163                 if (r < 0) {
5164                         *exit_status = EXIT_SECCOMP;
5165                         return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5166                 }
5167
5168                 r = apply_protect_sysctl(context, params);
5169                 if (r < 0) {
5170                         *exit_status = EXIT_SECCOMP;
5171                         return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5172                 }
5173
5174                 r = apply_protect_kernel_modules(context, params);
5175                 if (r < 0) {
5176                         *exit_status = EXIT_SECCOMP;
5177                         return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5178                 }
5179
5180                 r = apply_protect_kernel_logs(context, params);
5181                 if (r < 0) {
5182                         *exit_status = EXIT_SECCOMP;
5183                         return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5184                 }
5185
5186                 r = apply_protect_clock(context, params);
5187                 if (r < 0) {
5188                         *exit_status = EXIT_SECCOMP;
5189                         return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5190                 }
5191
5192                 r = apply_private_devices(context, params);
5193                 if (r < 0) {
5194                         *exit_status = EXIT_SECCOMP;
5195                         return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5196                 }
5197
5198                 r = apply_syscall_archs(context, params);
5199                 if (r < 0) {
5200                         *exit_status = EXIT_SECCOMP;
5201                         return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5202                 }
5203
5204                 r = apply_lock_personality(context, params);
5205                 if (r < 0) {
5206                         *exit_status = EXIT_SECCOMP;
5207                         return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5208                 }
5209
5210                 r = apply_syscall_log(context, params);
5211                 if (r < 0) {
5212                         *exit_status = EXIT_SECCOMP;
5213                         return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5214                 }
5215 #endif
5216
5217 #if HAVE_LIBBPF
5218                 r = apply_restrict_filesystems(context, params);
5219                 if (r < 0) {
5220                         *exit_status = EXIT_BPF;
5221                         return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5222                 }
5223 #endif
5224
5225 #if HAVE_SECCOMP
5226                 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5227                  * by the filter as little as possible. */
5228                 r = apply_syscall_filter(context, params, needs_ambient_hack);
5229                 if (r < 0) {
5230                         *exit_status = EXIT_SECCOMP;
5231                         return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5232                 }
5233
5234                 if (keep_seccomp_privileges) {
5235                         /* Restore the capability bounding set with what's expected from the service + the
5236                          * ambient capabilities hack */
5237                         if (!cap_test_all(saved_bset)) {
5238                                 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5239                                 if (r < 0) {
5240                                         *exit_status = EXIT_CAPABILITIES;
5241                                         return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5242                                 }
5243                         }
5244
5245                         /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5246                          * applications that use it. */
5247                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5248                                 r = drop_capability(CAP_SYS_ADMIN);
5249                                 if (r < 0) {
5250                                         *exit_status = EXIT_USER;
5251                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5252                                 }
5253                         }
5254
5255                         /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5256                          * applications that use it. */
5257                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5258                                 r = drop_capability(CAP_SETPCAP);
5259                                 if (r < 0) {
5260                                         *exit_status = EXIT_USER;
5261                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5262                                 }
5263                         }
5264
5265                         if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5266                                 *exit_status = EXIT_USER;
5267                                 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5268                         }
5269                 }
5270 #endif
5271
5272         }
5273
5274         if (!strv_isempty(context->unset_environment)) {
5275                 char **ee = NULL;
5276
5277                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5278                 if (!ee) {
5279                         *exit_status = EXIT_MEMORY;
5280                         return log_oom();
5281                 }
5282
5283                 strv_free_and_replace(accum_env, ee);
5284         }
5285
5286         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5287                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5288
5289                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5290                 if (r < 0) {
5291                         *exit_status = EXIT_MEMORY;
5292                         return log_exec_error_errno(context,
5293                                                     params,
5294                                                     r,
5295                                                     "Failed to replace environment variables: %m");
5296                 }
5297                 final_argv = replaced_argv;
5298
5299                 if (!strv_isempty(unset_variables)) {
5300                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5301                         log_exec_warning(context,
5302                                          params,
5303                                          "Referenced but unset environment variable evaluates to an empty string: %s",
5304                                          strna(ju));
5305                 }
5306
5307                 if (!strv_isempty(bad_variables)) {
5308                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5309                         log_exec_warning(context,
5310                                          params,
5311                                          "Invalid environment variable name evaluates to an empty string: %s",
5312                                          strna(jb));
5313                 }
5314         } else
5315                 final_argv = command->argv;
5316
5317         log_command_line(context, params, "Executing", executable, final_argv);
5318
5319         /* We have finished with all our initializations. Let's now let the manager know that. From this
5320          * point on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5321
5322         r = exec_fd_mark_hot(context, params, /* hot= */ true, exit_status);
5323         if (r < 0)
5324                 return r;
5325
5326         /* As last thing before the execve(), let's send the handoff timestamp */
5327         r = send_handoff_timestamp(context, params, exit_status);
5328         if (r < 0) {
5329                 /* If this handoff timestamp failed, let's undo the marking as hot */
5330                 (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
5331                 return r;
5332         }
5333
5334         /* NB: we leave executable_fd, exec_fd, handoff_timestamp_fd open here. This is safe, because they
5335          * have O_CLOEXEC set, and the execve() below will thus automatically close them. In fact, for
5336          * exec_fd this is pretty much the whole raison d'etre. */
5337
5338         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5339
5340         /* The execve() failed, let's undo the marking as hot */
5341         (void) exec_fd_mark_hot(context, params, /* hot= */ false, /* reterr_exit_status= */ NULL);
5342
5343         *exit_status = EXIT_EXEC;
5344         return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5345 }