src/core/exec-invoke.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/eventfd.h>
   4 #include <sys/ioctl.h>
   5 #include <sys/mount.h>
   6 #include <sys/prctl.h>
   7
   8 #if HAVE_PAM
   9 #include <security/pam_appl.h>
  10 #include <security/pam_misc.h>
  11 #endif
  12
  13 #if HAVE_APPARMOR
  14 #include <sys/apparmor.h>
  15 #endif
  16
  17 #include "sd-messages.h"
  18
  19 #if HAVE_APPARMOR
  20 #include "apparmor-util.h"
  21 #endif
  22 #include "argv-util.h"
  23 #include "barrier.h"
  24 #include "bpf-dlopen.h"
  25 #include "bpf-lsm.h"
  26 #include "btrfs-util.h"
  27 #include "capability-util.h"
  28 #include "cgroup-setup.h"
  29 #include "chase.h"
  30 #include "chattr-util.h"
  31 #include "chown-recursive.h"
  32 #include "copy.h"
  33 #include "data-fd-util.h"
  34 #include "env-util.h"
  35 #include "escape.h"
  36 #include "exec-credential.h"
  37 #include "exec-invoke.h"
  38 #include "execute.h"
  39 #include "exit-status.h"
  40 #include "fd-util.h"
  41 #include "hexdecoct.h"
  42 #include "io-util.h"
  43 #include "iovec-util.h"
  44 #include "missing_ioprio.h"
  45 #include "missing_prctl.h"
  46 #include "missing_securebits.h"
  47 #include "missing_syscall.h"
  48 #include "mkdir-label.h"
  49 #include "proc-cmdline.h"
  50 #include "process-util.h"
  51 #include "psi-util.h"
  52 #include "rlimit-util.h"
  53 #include "seccomp-util.h"
  54 #include "selinux-util.h"
  55 #include "signal-util.h"
  56 #include "smack-util.h"
  57 #include "socket-util.h"
  58 #include "string-table.h"
  59 #include "strv.h"
  60 #include "terminal-util.h"
  61 #include "utmp-wtmp.h"
  62
/* Two timeouts expressed as multiples of USEC_PER_SEC (5s and 1s).
 * NOTE(review): the code consuming these is outside the visible part of this file; presumably they
 * bound some idle-wait logic — confirm against their users. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) passed to fd_inc_sndbuf() for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
  67
  68 static int shift_fds(int fds[], size_t n_fds) {
  69         if (n_fds <= 0)
  70                 return 0;
  71
  72         /* Modifies the fds array! (sorts it) */
  73
  74         assert(fds);
  75
  76         for (int start = 0;;) {
  77                 int restart_from = -1;
  78
  79                 for (int i = start; i < (int) n_fds; i++) {
  80                         int nfd;
  81
  82                         /* Already at right index? */
  83                         if (fds[i] == i+3)
  84                                 continue;
  85
  86                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
  87                         if (nfd < 0)
  88                                 return -errno;
  89
  90                         safe_close(fds[i]);
  91                         fds[i] = nfd;
  92
  93                         /* Hmm, the fd we wanted isn't free? Then
  94                          * let's remember that and try again from here */
  95                         if (nfd != i+3 && restart_from < 0)
  96                                 restart_from = i;
  97                 }
  98
  99                 if (restart_from < 0)
 100                         break;
 101
 102                 start = restart_from;
 103         }
 104
 105         return 0;
 106 }
 107
 108 static int flag_fds(
 109                 const int fds[],
 110                 size_t n_socket_fds,
 111                 size_t n_fds,
 112                 bool nonblock) {
 113
 114         int r;
 115
 116         assert(fds || n_fds == 0);
 117
 118         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 119          * O_NONBLOCK only applies to socket activation though. */
 120
 121         for (size_t i = 0; i < n_fds; i++) {
 122
 123                 if (i < n_socket_fds) {
 124                         r = fd_nonblock(fds[i], nonblock);
 125                         if (r < 0)
 126                                 return r;
 127                 }
 128
 129                 /* We unconditionally drop FD_CLOEXEC from the fds,
 130                  * since after all we want to pass these fds to our
 131                  * children */
 132
 133                 r = fd_cloexec(fds[i], false);
 134                 if (r < 0)
 135                         return r;
 136         }
 137
 138         return 0;
 139 }
 140
 141 static bool is_terminal_input(ExecInput i) {
 142         return IN_SET(i,
 143                       EXEC_INPUT_TTY,
 144                       EXEC_INPUT_TTY_FORCE,
 145                       EXEC_INPUT_TTY_FAIL);
 146 }
 147
 148 static bool is_terminal_output(ExecOutput o) {
 149         return IN_SET(o,
 150                       EXEC_OUTPUT_TTY,
 151                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 152                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 153 }
 154
 155 static bool is_kmsg_output(ExecOutput o) {
 156         return IN_SET(o,
 157                       EXEC_OUTPUT_KMSG,
 158                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 159 }
 160
 161 static bool exec_context_needs_term(const ExecContext *c) {
 162         assert(c);
 163
 164         /* Return true if the execution context suggests we should set $TERM to something useful. */
 165
 166         if (is_terminal_input(c->std_input))
 167                 return true;
 168
 169         if (is_terminal_output(c->std_output))
 170                 return true;
 171
 172         if (is_terminal_output(c->std_error))
 173                 return true;
 174
 175         return !!c->tty_path;
 176 }
 177
 178 static int open_null_as(int flags, int nfd) {
 179         int fd;
 180
 181         assert(nfd >= 0);
 182
 183         fd = open("/dev/null", flags|O_NOCTTY);
 184         if (fd < 0)
 185                 return -errno;
 186
 187         return move_fd(fd, nfd, false);
 188 }
 189
 190 static int connect_journal_socket(
 191                 int fd,
 192                 const char *log_namespace,
 193                 uid_t uid,
 194                 gid_t gid) {
 195
 196         uid_t olduid = UID_INVALID;
 197         gid_t oldgid = GID_INVALID;
 198         const char *j;
 199         int r;
 200
 201         j = log_namespace ?
 202                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 203                 "/run/systemd/journal/stdout";
 204
 205         if (gid_is_valid(gid)) {
 206                 oldgid = getgid();
 207
 208                 if (setegid(gid) < 0)
 209                         return -errno;
 210         }
 211
 212         if (uid_is_valid(uid)) {
 213                 olduid = getuid();
 214
 215                 if (seteuid(uid) < 0) {
 216                         r = -errno;
 217                         goto restore_gid;
 218                 }
 219         }
 220
 221         r = connect_unix_path(fd, AT_FDCWD, j);
 222
 223         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 224            an LSM interferes. */
 225
 226         if (uid_is_valid(uid))
 227                 (void) seteuid(olduid);
 228
 229  restore_gid:
 230         if (gid_is_valid(gid))
 231                 (void) setegid(oldgid);
 232
 233         return r;
 234 }
 235
 236 static int connect_logger_as(
 237                 const ExecContext *context,
 238                 const ExecParameters *params,
 239                 ExecOutput output,
 240                 const char *ident,
 241                 int nfd,
 242                 uid_t uid,
 243                 gid_t gid) {
 244
 245         _cleanup_close_ int fd = -EBADF;
 246         int r;
 247
 248         assert(context);
 249         assert(params);
 250         assert(output < _EXEC_OUTPUT_MAX);
 251         assert(ident);
 252         assert(nfd >= 0);
 253
 254         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 255         if (fd < 0)
 256                 return -errno;
 257
 258         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 259         if (r < 0)
 260                 return r;
 261
 262         if (shutdown(fd, SHUT_RD) < 0)
 263                 return -errno;
 264
 265         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 266
 267         if (dprintf(fd,
 268                 "%s\n"
 269                 "%s\n"
 270                 "%i\n"
 271                 "%i\n"
 272                 "%i\n"
 273                 "%i\n"
 274                 "%i\n",
 275                 context->syslog_identifier ?: ident,
 276                 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
 277                 context->syslog_priority,
 278                 !!context->syslog_level_prefix,
 279                 false,
 280                 is_kmsg_output(output),
 281                 is_terminal_output(output)) < 0)
 282                 return -errno;
 283
 284         return move_fd(TAKE_FD(fd), nfd, false);
 285 }
 286
 287 static int open_terminal_as(const char *path, int flags, int nfd) {
 288         int fd;
 289
 290         assert(path);
 291         assert(nfd >= 0);
 292
 293         fd = open_terminal(path, flags | O_NOCTTY);
 294         if (fd < 0)
 295                 return fd;
 296
 297         return move_fd(fd, nfd, false);
 298 }
 299
 300 static int acquire_path(const char *path, int flags, mode_t mode) {
 301         _cleanup_close_ int fd = -EBADF;
 302         int r;
 303
 304         assert(path);
 305
 306         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 307                 flags |= O_CREAT;
 308
 309         fd = open(path, flags|O_NOCTTY, mode);
 310         if (fd >= 0)
 311                 return TAKE_FD(fd);
 312
 313         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 314                 return -errno;
 315
 316         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 317
 318         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 319         if (fd < 0)
 320                 return -errno;
 321
 322         r = connect_unix_path(fd, AT_FDCWD, path);
 323         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 324                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 325                  * wasn't an AF_UNIX socket after all */
 326                 return -ENXIO;
 327         if (r < 0)
 328                 return r;
 329
 330         if ((flags & O_ACCMODE) == O_RDONLY)
 331                 r = shutdown(fd, SHUT_WR);
 332         else if ((flags & O_ACCMODE) == O_WRONLY)
 333                 r = shutdown(fd, SHUT_RD);
 334         else
 335                 r = 0;
 336         if (r < 0)
 337                 return -errno;
 338
 339         return TAKE_FD(fd);
 340 }
 341
 342 static int fixup_input(
 343                 const ExecContext *context,
 344                 int socket_fd,
 345                 bool apply_tty_stdin) {
 346
 347         ExecInput std_input;
 348
 349         assert(context);
 350
 351         std_input = context->std_input;
 352
 353         if (is_terminal_input(std_input) && !apply_tty_stdin)
 354                 return EXEC_INPUT_NULL;
 355
 356         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 357                 return EXEC_INPUT_NULL;
 358
 359         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 360                 return EXEC_INPUT_NULL;
 361
 362         return std_input;
 363 }
 364
 365 static int fixup_output(ExecOutput output, int socket_fd) {
 366
 367         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 368                 return EXEC_OUTPUT_INHERIT;
 369
 370         return output;
 371 }
 372
 373 static int setup_input(
 374                 const ExecContext *context,
 375                 const ExecParameters *params,
 376                 int socket_fd,
 377                 const int named_iofds[static 3]) {
 378
 379         ExecInput i;
 380         int r;
 381
 382         assert(context);
 383         assert(params);
 384         assert(named_iofds);
 385
 386         if (params->stdin_fd >= 0) {
 387                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 388                         return -errno;
 389
 390                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 391                 if (isatty(STDIN_FILENO)) {
 392                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 393
 394                         if (context->tty_reset)
 395                                 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
 396
 397                         (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
 398                 }
 399
 400                 return STDIN_FILENO;
 401         }
 402
 403         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 404
 405         switch (i) {
 406
 407         case EXEC_INPUT_NULL:
 408                 return open_null_as(O_RDONLY, STDIN_FILENO);
 409
 410         case EXEC_INPUT_TTY:
 411         case EXEC_INPUT_TTY_FORCE:
 412         case EXEC_INPUT_TTY_FAIL: {
 413                 _cleanup_close_ int tty_fd = -EBADF;
 414                 const char *tty_path;
 415
 416                 tty_path = ASSERT_PTR(exec_context_tty_path(context));
 417
 418                 tty_fd = acquire_terminal(tty_path,
 419                                           i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 420                                           i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 421                                                                       ACQUIRE_TERMINAL_WAIT,
 422                                           USEC_INFINITY);
 423                 if (tty_fd < 0)
 424                         return tty_fd;
 425
 426                 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
 427                 if (r < 0)
 428                         return r;
 429
 430                 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
 431                 if (r < 0)
 432                         return r;
 433
 434                 TAKE_FD(tty_fd);
 435                 return r;
 436         }
 437
 438         case EXEC_INPUT_SOCKET:
 439                 assert(socket_fd >= 0);
 440
 441                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 442
 443         case EXEC_INPUT_NAMED_FD:
 444                 assert(named_iofds[STDIN_FILENO] >= 0);
 445
 446                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 447                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 448
 449         case EXEC_INPUT_DATA: {
 450                 int fd;
 451
 452                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 453                 if (fd < 0)
 454                         return fd;
 455
 456                 return move_fd(fd, STDIN_FILENO, false);
 457         }
 458
 459         case EXEC_INPUT_FILE: {
 460                 bool rw;
 461                 int fd;
 462
 463                 assert(context->stdio_file[STDIN_FILENO]);
 464
 465                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 466                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 467
 468                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 469                 if (fd < 0)
 470                         return fd;
 471
 472                 return move_fd(fd, STDIN_FILENO, false);
 473         }
 474
 475         default:
 476                 assert_not_reached();
 477         }
 478 }
 479
 480 static bool can_inherit_stderr_from_stdout(
 481                 const ExecContext *context,
 482                 ExecOutput o,
 483                 ExecOutput e) {
 484
 485         assert(context);
 486
 487         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 488          * stderr fd */
 489
 490         if (e == EXEC_OUTPUT_INHERIT)
 491                 return true;
 492         if (e != o)
 493                 return false;
 494
 495         if (e == EXEC_OUTPUT_NAMED_FD)
 496                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 497
 498         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 499                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 500
 501         return true;
 502 }
 503
 504 static int setup_output(
 505                 const ExecContext *context,
 506                 const ExecParameters *params,
 507                 int fileno,
 508                 int socket_fd,
 509                 const int named_iofds[static 3],
 510                 const char *ident,
 511                 uid_t uid,
 512                 gid_t gid,
 513                 dev_t *journal_stream_dev,
 514                 ino_t *journal_stream_ino) {
 515
 516         ExecOutput o;
 517         ExecInput i;
 518         int r;
 519
 520         assert(context);
 521         assert(params);
 522         assert(ident);
 523         assert(journal_stream_dev);
 524         assert(journal_stream_ino);
 525
 526         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 527
 528                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 529                         return -errno;
 530
 531                 return STDOUT_FILENO;
 532         }
 533
 534         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 535                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 536                         return -errno;
 537
 538                 return STDERR_FILENO;
 539         }
 540
 541         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 542         o = fixup_output(context->std_output, socket_fd);
 543
 544         if (fileno == STDERR_FILENO) {
 545                 ExecOutput e;
 546                 e = fixup_output(context->std_error, socket_fd);
 547
 548                 /* This expects the input and output are already set up */
 549
 550                 /* Don't change the stderr file descriptor if we inherit all
 551                  * the way and are not on a tty */
 552                 if (e == EXEC_OUTPUT_INHERIT &&
 553                     o == EXEC_OUTPUT_INHERIT &&
 554                     i == EXEC_INPUT_NULL &&
 555                     !is_terminal_input(context->std_input) &&
 556                     getppid() != 1)
 557                         return fileno;
 558
 559                 /* Duplicate from stdout if possible */
 560                 if (can_inherit_stderr_from_stdout(context, o, e))
 561                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 562
 563                 o = e;
 564
 565         } else if (o == EXEC_OUTPUT_INHERIT) {
 566                 /* If input got downgraded, inherit the original value */
 567                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 568                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 569
 570                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 571                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 572                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 573
 574                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 575                 if (getppid() != 1)
 576                         return fileno;
 577
 578                 /* We need to open /dev/null here anew, to get the right access mode. */
 579                 return open_null_as(O_WRONLY, fileno);
 580         }
 581
 582         switch (o) {
 583
 584         case EXEC_OUTPUT_NULL:
 585                 return open_null_as(O_WRONLY, fileno);
 586
 587         case EXEC_OUTPUT_TTY:
 588                 if (is_terminal_input(i))
 589                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 590
 591                 /* We don't reset the terminal if this is just about output */
 592                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 593
 594         case EXEC_OUTPUT_KMSG:
 595         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 596         case EXEC_OUTPUT_JOURNAL:
 597         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 598                 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
 599                 if (r < 0) {
 600                         log_exec_warning_errno(context,
 601                                                params,
 602                                                r,
 603                                                "Failed to connect %s to the journal socket, ignoring: %m",
 604                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 605                         r = open_null_as(O_WRONLY, fileno);
 606                 } else {
 607                         struct stat st;
 608
 609                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 610                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 611                          * services to detect whether they are connected to the journal or not.
 612                          *
 613                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 614                          * about STDERR as that's usually the best way to do logging. */
 615
 616                         if (fstat(fileno, &st) >= 0 &&
 617                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 618                                 *journal_stream_dev = st.st_dev;
 619                                 *journal_stream_ino = st.st_ino;
 620                         }
 621                 }
 622                 return r;
 623
 624         case EXEC_OUTPUT_SOCKET:
 625                 assert(socket_fd >= 0);
 626
 627                 return RET_NERRNO(dup2(socket_fd, fileno));
 628
 629         case EXEC_OUTPUT_NAMED_FD:
 630                 assert(named_iofds[fileno] >= 0);
 631
 632                 (void) fd_nonblock(named_iofds[fileno], false);
 633                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 634
 635         case EXEC_OUTPUT_FILE:
 636         case EXEC_OUTPUT_FILE_APPEND:
 637         case EXEC_OUTPUT_FILE_TRUNCATE: {
 638                 bool rw;
 639                 int fd, flags;
 640
 641                 assert(context->stdio_file[fileno]);
 642
 643                 rw = context->std_input == EXEC_INPUT_FILE &&
 644                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 645
 646                 if (rw)
 647                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 648
 649                 flags = O_WRONLY;
 650                 if (o == EXEC_OUTPUT_FILE_APPEND)
 651                         flags |= O_APPEND;
 652                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 653                         flags |= O_TRUNC;
 654
 655                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 656                 if (fd < 0)
 657                         return fd;
 658
 659                 return move_fd(fd, fileno, 0);
 660         }
 661
 662         default:
 663                 assert_not_reached();
 664         }
 665 }
 666
 667 static int chown_terminal(int fd, uid_t uid) {
 668         int r;
 669
 670         assert(fd >= 0);
 671
 672         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 673         if (!isatty_safe(fd))
 674                 return 0;
 675
 676         /* This might fail. What matters are the results. */
 677         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 678         if (r < 0)
 679                 return r;
 680
 681         return 1;
 682 }
 683
 684 static int setup_confirm_stdio(
 685                 const ExecContext *context,
 686                 const char *vc,
 687                 int *ret_saved_stdin,
 688                 int *ret_saved_stdout) {
 689
 690         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 691         int r;
 692
 693         assert(ret_saved_stdin);
 694         assert(ret_saved_stdout);
 695
 696         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 697         if (saved_stdin < 0)
 698                 return -errno;
 699
 700         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 701         if (saved_stdout < 0)
 702                 return -errno;
 703
 704         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 705         if (fd < 0)
 706                 return fd;
 707
 708         r = chown_terminal(fd, getuid());
 709         if (r < 0)
 710                 return r;
 711
 712         r = reset_terminal_fd(fd, /* switch_to_text= */ true);
 713         if (r < 0)
 714                 return r;
 715
 716         r = exec_context_apply_tty_size(context, fd, vc);
 717         if (r < 0)
 718                 return r;
 719
 720         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 721         TAKE_FD(fd);
 722         if (r < 0)
 723                 return r;
 724
 725         *ret_saved_stdin = TAKE_FD(saved_stdin);
 726         *ret_saved_stdout = TAKE_FD(saved_stdout);
 727         return 0;
 728 }
 729
 730 static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
 731         assert(err < 0);
 732         assert(unit_id);
 733
 734         if (err == -ETIMEDOUT)
 735                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
 736         else {
 737                 errno = -err;
 738                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
 739         }
 740 }
 741
 742 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
 743         _cleanup_close_ int fd = -EBADF;
 744
 745         assert(vc);
 746
 747         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 748         if (fd < 0)
 749                 return;
 750
 751         write_confirm_error_fd(err, fd, unit_id);
 752 }
 753
 754 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 755         int r = 0;
 756
 757         assert(saved_stdin);
 758         assert(saved_stdout);
 759
 760         release_terminal();
 761
 762         if (*saved_stdin >= 0)
 763                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 764                         r = -errno;
 765
 766         if (*saved_stdout >= 0)
 767                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 768                         r = -errno;
 769
 770         *saved_stdin = safe_close(*saved_stdin);
 771         *saved_stdout = safe_close(*saved_stdout);
 772
 773         return r;
 774 }
 775
 776 enum {
 777         CONFIRM_PRETEND_FAILURE = -1,
 778         CONFIRM_PRETEND_SUCCESS =  0,
 779         CONFIRM_EXECUTE = 1,
 780 };
 781
 782 static bool confirm_spawn_disabled(void) {
 783         return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
 784 }
 785
 786 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
 787         int saved_stdout = -1, saved_stdin = -1, r;
 788         _cleanup_free_ char *e = NULL;
 789         char c;
 790
 791         assert(context);
 792         assert(params);
 793
 794         /* For any internal errors, assume a positive response. */
 795         r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
 796         if (r < 0) {
 797                 write_confirm_error(r, params->confirm_spawn, params->unit_id);
 798                 return CONFIRM_EXECUTE;
 799         }
 800
 801         /* confirm_spawn might have been disabled while we were sleeping. */
 802         if (!params->confirm_spawn || confirm_spawn_disabled()) {
 803                 r = 1;
 804                 goto restore_stdio;
 805         }
 806
 807         e = ellipsize(cmdline, 60, 100);
 808         if (!e) {
 809                 log_oom();
 810                 r = CONFIRM_EXECUTE;
 811                 goto restore_stdio;
 812         }
 813
 814         for (;;) {
 815                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 816                 if (r < 0) {
 817                         write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
 818                         r = CONFIRM_EXECUTE;
 819                         goto restore_stdio;
 820                 }
 821
 822                 switch (c) {
 823                 case 'c':
 824                         printf("Resuming normal execution.\n");
 825                         manager_disable_confirm_spawn();
 826                         r = 1;
 827                         break;
 828                 case 'D':
 829                         printf("  Unit: %s\n",
 830                                params->unit_id);
 831                         exec_context_dump(context, stdout, "  ");
 832                         exec_params_dump(params, stdout, "  ");
 833                         continue; /* ask again */
 834                 case 'f':
 835                         printf("Failing execution.\n");
 836                         r = CONFIRM_PRETEND_FAILURE;
 837                         break;
 838                 case 'h':
 839                         printf("  c - continue, proceed without asking anymore\n"
 840                                "  D - dump, show the state of the unit\n"
 841                                "  f - fail, don't execute the command and pretend it failed\n"
 842                                "  h - help\n"
 843                                "  i - info, show a short summary of the unit\n"
 844                                "  j - jobs, show jobs that are in progress\n"
 845                                "  s - skip, don't execute the command and pretend it succeeded\n"
 846                                "  y - yes, execute the command\n");
 847                         continue; /* ask again */
 848                 case 'i':
 849                         printf("  Unit:        %s\n"
 850                                "  Command:     %s\n",
 851                                params->unit_id, cmdline);
 852                         continue; /* ask again */
 853                 case 'j':
 854                         if (sigqueue(getppid(),
 855                                      SIGRTMIN+18,
 856                                      (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
 857                                 return -errno;
 858
 859                         continue; /* ask again */
 860                 case 'n':
 861                         /* 'n' was removed in favor of 'f'. */
 862                         printf("Didn't understand 'n', did you mean 'f'?\n");
 863                         continue; /* ask again */
 864                 case 's':
 865                         printf("Skipping execution.\n");
 866                         r = CONFIRM_PRETEND_SUCCESS;
 867                         break;
 868                 case 'y':
 869                         r = CONFIRM_EXECUTE;
 870                         break;
 871                 default:
 872                         assert_not_reached();
 873                 }
 874                 break;
 875         }
 876
 877 restore_stdio:
 878         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 879         return r;
 880 }
 881
 882 static int get_fixed_user(
 883                 const char *user_or_uid,
 884                 const char **ret_username,
 885                 uid_t *ret_uid,
 886                 gid_t *ret_gid,
 887                 const char **ret_home,
 888                 const char **ret_shell) {
 889
 890         int r;
 891
 892         assert(user_or_uid);
 893         assert(ret_username);
 894
 895         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 896          * (i.e. are "/" or "/bin/nologin"). */
 897
 898         r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
 899         if (r < 0)
 900                 return r;
 901
 902         /* user_or_uid is normalized by get_user_creds to username */
 903         *ret_username = user_or_uid;
 904
 905         return 0;
 906 }
 907
 908 static int get_fixed_group(
 909                 const char *group_or_gid,
 910                 const char **ret_groupname,
 911                 gid_t *ret_gid) {
 912
 913         int r;
 914
 915         assert(group_or_gid);
 916         assert(ret_groupname);
 917
 918         r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
 919         if (r < 0)
 920                 return r;
 921
 922         /* group_or_gid is normalized by get_group_creds to groupname */
 923         *ret_groupname = group_or_gid;
 924
 925         return 0;
 926 }
 927
 928 static int get_supplementary_groups(const ExecContext *c, const char *user,
 929                                     const char *group, gid_t gid,
 930                                     gid_t **supplementary_gids, int *ngids) {
 931         int r, k = 0;
 932         int ngroups_max;
 933         bool keep_groups = false;
 934         gid_t *groups = NULL;
 935         _cleanup_free_ gid_t *l_gids = NULL;
 936
 937         assert(c);
 938
 939         /*
 940          * If user is given, then lookup GID and supplementary groups list.
 941          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 942          * here and as early as possible so we keep the list of supplementary
 943          * groups of the caller.
 944          */
 945         if (user && gid_is_valid(gid) && gid != 0) {
 946                 /* First step, initialize groups from /etc/groups */
 947                 if (initgroups(user, gid) < 0)
 948                         return -errno;
 949
 950                 keep_groups = true;
 951         }
 952
 953         if (strv_isempty(c->supplementary_groups))
 954                 return 0;
 955
 956         /*
 957          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 958          * be positive, otherwise fail.
 959          */
 960         errno = 0;
 961         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 962         if (ngroups_max <= 0)
 963                 return errno_or_else(EOPNOTSUPP);
 964
 965         l_gids = new(gid_t, ngroups_max);
 966         if (!l_gids)
 967                 return -ENOMEM;
 968
 969         if (keep_groups) {
 970                 /*
 971                  * Lookup the list of groups that the user belongs to, we
 972                  * avoid NSS lookups here too for gid=0.
 973                  */
 974                 k = ngroups_max;
 975                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 976                         return -EINVAL;
 977         } else
 978                 k = 0;
 979
 980         STRV_FOREACH(i, c->supplementary_groups) {
 981                 const char *g;
 982
 983                 if (k >= ngroups_max)
 984                         return -E2BIG;
 985
 986                 g = *i;
 987                 r = get_group_creds(&g, l_gids+k, 0);
 988                 if (r < 0)
 989                         return r;
 990
 991                 k++;
 992         }
 993
 994         /*
 995          * Sets ngids to zero to drop all supplementary groups, happens
 996          * when we are under root and SupplementaryGroups= is empty.
 997          */
 998         if (k == 0) {
 999                 *ngids = 0;
1000                 return 0;
1001         }
1002
1003         /* Otherwise get the final list of supplementary groups */
1004         groups = memdup(l_gids, sizeof(gid_t) * k);
1005         if (!groups)
1006                 return -ENOMEM;
1007
1008         *supplementary_gids = groups;
1009         *ngids = k;
1010
1011         groups = NULL;
1012
1013         return 0;
1014 }
1015
/* Applies the group identity: first the supplementary group list (only if ngids > 0), then the
 * real/effective/saved GID (only if gid is valid). Returns 0 on success, negative errno-style on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* An empty list means there is nothing to install for SupplementaryGroups= */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Switch all three GIDs at once */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1034
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then every bit in 'bits'
 * is set. Returns 1 if the securebits were changed, 0 if they already had the requested value, and a
 * negative errno if either prctl() call failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int r = prctl(PR_GET_SECUREBITS);
        if (r < 0)
                return -errno;

        unsigned before = (unsigned) r;
        unsigned after = (before & ~mask) | bits;

        /* Skip the second prctl() if it would be a no-op */
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1053
1054 static int enforce_user(
1055                 const ExecContext *context,
1056                 uid_t uid,
1057                 uint64_t capability_ambient_set) {
1058         assert(context);
1059         int r;
1060
1061         if (!uid_is_valid(uid))
1062                 return 0;
1063
1064         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1065          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1066          * case. */
1067
1068         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1069
1070                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1071                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1072                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1073                 if (r < 0)
1074                         return r;
1075         }
1076
1077         /* Second step: actually set the uids */
1078         if (setresuid(uid, uid, uid) < 0)
1079                 return -errno;
1080
1081         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1082          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1083          * outside of this call. */
1084         return 0;
1085 }
1086
1087 #if HAVE_PAM
1088
1089 static int null_conv(
1090                 int num_msg,
1091                 const struct pam_message **msg,
1092                 struct pam_response **resp,
1093                 void *appdata_ptr) {
1094
1095         /* We don't support conversations */
1096
1097         return PAM_CONV_ERR;
1098 }
1099
1100 #endif
1101
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that closes the session
 * again once this process goes away.
 *
 * name        - PAM service name passed to pam_start()
 * user        - user name passed to pam_start()
 * uid, gid    - credentials the helper process drops to
 * tty         - TTY path to register as PAM_TTY; if NULL, the TTY of stdin is used when there is one
 * env         - environment block; its entries are exported to PAM, and on success it is replaced by the
 *               environment list returned by PAM
 * fds, n_fds  - file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or another negative
 * errno-style value for non-PAM failures. Without PAM support compiled in this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child exists, hence nobody depends on SIGTERM being blocked. Restore the original
                 * signal mask so that the failure path leaves the process the way we found it. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the privilege drop above succeeded,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;
                        int sig;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        assert_se(sigwait(&ss, &sig) == 0);
                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Only reached in the parent: either a PAM call failed (pam_code set) or a non-PAM step did (r set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1306
1307 static void rename_process_from_path(const char *path) {
1308         _cleanup_free_ char *buf = NULL;
1309         const char *p;
1310
1311         assert(path);
1312
1313         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1314          * /bin/ps */
1315
1316         if (path_extract_filename(path, &buf) < 0) {
1317                 rename_process("(...)");
1318                 return;
1319         }
1320
1321         size_t l = strlen(buf);
1322         if (l > 8) {
1323                 /* The end of the process name is usually more interesting, since the first bit might just be
1324                  * "systemd-" */
1325                 p = buf + l - 8;
1326                 l = 8;
1327         } else
1328                 p = buf;
1329
1330         char process_name[11];
1331         process_name[0] = '(';
1332         memcpy(process_name+1, p, l);
1333         process_name[1+l] = ')';
1334         process_name[1+l+1] = 0;
1335
1336         (void) rename_process(process_name);
1337 }
1338
1339 static bool context_has_address_families(const ExecContext *c) {
1340         assert(c);
1341
1342         return c->address_families_allow_list ||
1343                 !set_isempty(c->address_families);
1344 }
1345
1346 static bool context_has_syscall_filters(const ExecContext *c) {
1347         assert(c);
1348
1349         return c->syscall_allow_list ||
1350                 !hashmap_isempty(c->syscall_filter);
1351 }
1352
1353 static bool context_has_syscall_logs(const ExecContext *c) {
1354         assert(c);
1355
1356         return c->syscall_log_allow_list ||
1357                 !hashmap_isempty(c->syscall_log);
1358 }
1359
1360 static bool context_has_seccomp(const ExecContext *c) {
1361         /* We need NNP if we have any form of seccomp and are unprivileged */
1362         return c->lock_personality ||
1363                 c->memory_deny_write_execute ||
1364                 c->private_devices ||
1365                 c->protect_clock ||
1366                 c->protect_hostname ||
1367                 c->protect_kernel_tunables ||
1368                 c->protect_kernel_modules ||
1369                 c->protect_kernel_logs ||
1370                 context_has_address_families(c) ||
1371                 exec_context_restrict_namespaces_set(c) ||
1372                 c->restrict_realtime ||
1373                 c->restrict_suid_sgid ||
1374                 !set_isempty(c->syscall_archs) ||
1375                 context_has_syscall_filters(c) ||
1376                 context_has_syscall_logs(c);
1377 }
1378
1379 static bool context_has_no_new_privileges(const ExecContext *c) {
1380         assert(c);
1381
1382         if (c->no_new_privileges)
1383                 return true;
1384
1385         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1386                 return false;
1387
1388         return context_has_seccomp(c);
1389 }
1390
1391 #if HAVE_SECCOMP
1392
1393 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1394         void *id, *val;
1395         bool has_capget = false, has_capset = false, has_prctl = false;
1396
1397         assert(c);
1398
1399         /* No syscall filter, we are allowed to drop privileges */
1400         if (hashmap_isempty(c->syscall_filter))
1401                 return true;
1402
1403         HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1404                 _cleanup_free_ char *name = NULL;
1405
1406                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1407
1408                 if (streq(name, "capget"))
1409                         has_capget = true;
1410                 else if (streq(name, "capset"))
1411                         has_capset = true;
1412                 else if (streq(name, "prctl"))
1413                         has_prctl = true;
1414         }
1415
1416         if (c->syscall_allow_list)
1417                 return has_capget && has_capset && has_prctl;
1418         else
1419                 return !(has_capget || has_capset || has_prctl);
1420 }
1421
1422 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1423
1424         if (is_seccomp_available())
1425                 return false;
1426
1427         log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1428         return true;
1429 }
1430
1431 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1432         uint32_t negative_action, default_action, action;
1433         int r;
1434
1435         assert(c);
1436         assert(p);
1437
1438         if (!context_has_syscall_filters(c))
1439                 return 0;
1440
1441         if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1442                 return 0;
1443
1444         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1445
1446         if (c->syscall_allow_list) {
1447                 default_action = negative_action;
1448                 action = SCMP_ACT_ALLOW;
1449         } else {
1450                 default_action = SCMP_ACT_ALLOW;
1451                 action = negative_action;
1452         }
1453
1454         if (needs_ambient_hack) {
1455                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1456                 if (r < 0)
1457                         return r;
1458         }
1459
1460         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1461 }
1462
1463 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1464 #ifdef SCMP_ACT_LOG
1465         uint32_t default_action, action;
1466 #endif
1467
1468         assert(c);
1469         assert(p);
1470
1471         if (!context_has_syscall_logs(c))
1472                 return 0;
1473
1474 #ifdef SCMP_ACT_LOG
1475         if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1476                 return 0;
1477
1478         if (c->syscall_log_allow_list) {
1479                 /* Log nothing but the ones listed */
1480                 default_action = SCMP_ACT_ALLOW;
1481                 action = SCMP_ACT_LOG;
1482         } else {
1483                 /* Log everything but the ones listed */
1484                 default_action = SCMP_ACT_LOG;
1485                 action = SCMP_ACT_ALLOW;
1486         }
1487
1488         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1489 #else
1490         /* old libseccomp */
1491         log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1492         return 0;
1493 #endif
1494 }
1495
1496 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1497         assert(c);
1498         assert(p);
1499
1500         if (set_isempty(c->syscall_archs))
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1504                 return 0;
1505
1506         return seccomp_restrict_archs(c->syscall_archs);
1507 }
1508
1509 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1510         assert(c);
1511         assert(p);
1512
1513         if (!context_has_address_families(c))
1514                 return 0;
1515
1516         if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1517                 return 0;
1518
1519         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1520 }
1521
1522 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1523         int r;
1524
1525         assert(c);
1526         assert(p);
1527
1528         if (!c->memory_deny_write_execute)
1529                 return 0;
1530
1531         /* use prctl() if kernel supports it (6.3) */
1532         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1533         if (r == 0) {
1534                 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1535                 return 0;
1536         }
1537         if (r < 0 && errno != EINVAL)
1538                 return log_exec_debug_errno(c,
1539                                             p,
1540                                             errno,
1541                                             "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1542         /* else use seccomp */
1543         log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1544
1545         if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1546                 return 0;
1547
1548         return seccomp_memory_deny_write_execute();
1549 }
1550
1551 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1552         assert(c);
1553         assert(p);
1554
1555         if (!c->restrict_realtime)
1556                 return 0;
1557
1558         if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1559                 return 0;
1560
1561         return seccomp_restrict_realtime();
1562 }
1563
1564 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1565         assert(c);
1566         assert(p);
1567
1568         if (!c->restrict_suid_sgid)
1569                 return 0;
1570
1571         if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1572                 return 0;
1573
1574         return seccomp_restrict_suid_sgid();
1575 }
1576
1577 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1578         assert(c);
1579         assert(p);
1580
1581         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1582          * let's protect even those systems where this is left on in the kernel. */
1583
1584         if (!c->protect_kernel_tunables)
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1588                 return 0;
1589
1590         return seccomp_protect_sysctl();
1591 }
1592
1593 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1594         assert(c);
1595         assert(p);
1596
1597         /* Turn off module syscalls on ProtectKernelModules=yes */
1598
1599         if (!c->protect_kernel_modules)
1600                 return 0;
1601
1602         if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1603                 return 0;
1604
1605         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1609         assert(c);
1610         assert(p);
1611
1612         if (!c->protect_kernel_logs)
1613                 return 0;
1614
1615         if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1616                 return 0;
1617
1618         return seccomp_protect_syslog();
1619 }
1620
1621 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1622         assert(c);
1623         assert(p);
1624
1625         if (!c->protect_clock)
1626                 return 0;
1627
1628         if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1629                 return 0;
1630
1631         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1632 }
1633
1634 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1635         assert(c);
1636         assert(p);
1637
1638         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1639
1640         if (!c->private_devices)
1641                 return 0;
1642
1643         if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1644                 return 0;
1645
1646         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1647 }
1648
1649 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1650         assert(c);
1651         assert(p);
1652
1653         if (!exec_context_restrict_namespaces_set(c))
1654                 return 0;
1655
1656         if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1657                 return 0;
1658
1659         return seccomp_restrict_namespaces(c->restrict_namespaces);
1660 }
1661
1662 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1663         unsigned long personality;
1664         int r;
1665
1666         assert(c);
1667         assert(p);
1668
1669         if (!c->lock_personality)
1670                 return 0;
1671
1672         if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1673                 return 0;
1674
1675         personality = c->personality;
1676
1677         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1678         if (personality == PERSONALITY_INVALID) {
1679
1680                 r = opinionated_personality(&personality);
1681                 if (r < 0)
1682                         return r;
1683         }
1684
1685         return seccomp_lock_personality(personality);
1686 }
1687
1688 #endif
1689
1690 #if HAVE_LIBBPF
1691 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1692         int r;
1693
1694         assert(c);
1695         assert(p);
1696
1697         if (!exec_context_restrict_filesystems_set(c))
1698                 return 0;
1699
1700         if (p->bpf_outer_map_fd < 0) {
1701                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1702                 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1703                 return 0;
1704         }
1705
1706         /* We are in a new binary, so dl-open again */
1707         r = dlopen_bpf();
1708         if (r < 0)
1709                 return r;
1710
1711         return lsm_bpf_restrict_filesystems(c->restrict_filesystems, p->cgroup_id, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
1712 }
1713 #endif
1714
1715 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1716         assert(c);
1717         assert(p);
1718
1719         if (!c->protect_hostname)
1720                 return 0;
1721
1722         if (ns_type_supported(NAMESPACE_UTS)) {
1723                 if (unshare(CLONE_NEWUTS) < 0) {
1724                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1725                                 *ret_exit_status = EXIT_NAMESPACE;
1726                                 return log_exec_error_errno(c,
1727                                                             p,
1728                                                             errno,
1729                                                             "Failed to set up UTS namespacing: %m");
1730                         }
1731
1732                         log_exec_warning(c,
1733                                          p,
1734                                          "ProtectHostname=yes is configured, but UTS namespace setup is "
1735                                          "prohibited (container manager?), ignoring namespace setup.");
1736                 }
1737         } else
1738                 log_exec_warning(c,
1739                                  p,
1740                                  "ProtectHostname=yes is configured, but the kernel does not "
1741                                  "support UTS namespaces, ignoring namespace setup.");
1742
1743 #if HAVE_SECCOMP
1744         int r;
1745
1746         if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1747                 return 0;
1748
1749         r = seccomp_protect_hostname();
1750         if (r < 0) {
1751                 *ret_exit_status = EXIT_SECCOMP;
1752                 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1753         }
1754 #endif
1755
1756         return 0;
1757 }
1758
1759 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1760         assert(idle_pipe);
1761
1762         idle_pipe[1] = safe_close(idle_pipe[1]);
1763         idle_pipe[2] = safe_close(idle_pipe[2]);
1764
1765         if (idle_pipe[0] >= 0) {
1766                 int r;
1767
1768                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1769
1770                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1771                         ssize_t n;
1772
1773                         /* Signal systemd that we are bored and want to continue. */
1774                         n = write(idle_pipe[3], "x", 1);
1775                         if (n > 0)
1776                                 /* Wait for systemd to react to the signal above. */
1777                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1778                 }
1779
1780                 idle_pipe[0] = safe_close(idle_pipe[0]);
1781
1782         }
1783
1784         idle_pipe[3] = safe_close(idle_pipe[3]);
1785 }
1786
/* Forward declaration of the lookup helper; presumably its definition is emitted by the
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation below. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* Maps each ExecDirectoryType to the name of the environment variable in which the selected directory
 * is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1800
/* Assembles the environment variable assignments that are generated for the service payload (as opposed
 * to those configured by the user) and hands the resulting array back in *ret.
 *
 * Depending on the parameters, the following variables are emitted, in this order:
 *   - $LISTEN_PID, $LISTEN_FDS, $LISTEN_FDNAMES          if n_fds > 0
 *   - $WATCHDOG_PID, $WATCHDOG_USEC                      if EXEC_SET_WATCHDOG is set and watchdog_usec > 0
 *   - $SYSTEMD_NSS_DYNAMIC_BYPASS                        if EXEC_NSS_DYNAMIC_BYPASS is set
 *   - $USER, and $LOGNAME/$HOME/$SHELL                   the latter three only if
 *                                                        exec_context_get_set_login_environment() is true
 *   - $INVOCATION_ID                                     if the invocation ID is not null
 *   - $TERM                                              if exec_context_needs_term() is true
 *   - $JOURNAL_STREAM                                    if both journal_stream_dev and _ino are non-zero
 *   - $LOG_NAMESPACE                                     if c->log_namespace is set
 *   - one variable per ExecDirectoryType                 if a prefix and at least one item are configured
 *   - $CREDENTIALS_DIRECTORY                             if exec_context_get_credential_directory() > 0
 *   - $SYSTEMD_EXEC_PID                                  always
 *   - $MEMORY_PRESSURE_WATCH, $MEMORY_PRESSURE_WRITE     if memory_pressure_path is set (the latter only
 *                                                        with a cgroup context and a path != /dev/null)
 *
 * username/home/shell are replaced by the result of get_fixed_user("root", ...) when neither User= nor
 * DynamicUser= is configured and the runtime scope is the system one.
 *
 * Returns 0 on success. Returns -ENOMEM on allocation failure, or the result of the failing helper
 * (log_exec_debug_errno() for the root lookup, exec_context_get_credential_directory() otherwise). On
 * failure *ret is left untouched. */
static int build_environment(
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(c);
        assert(p);
        assert(ret);

        /* The array is sized up front: N_ENV_VARS slots for the fixed set of variables below plus one
         * slot per directory type. The assert() at the end verifies that at least one slot stayed
         * unused (presumably new0() zero-fills, so that slot serves as the NULL terminator). Whenever a
         * new variable is added below, N_ENV_VARS has to be bumped. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* Socket activation: announce the passed file descriptors and their names. */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us to look up dynamic users in PID 1. Break the deadlock between D-Bus
         * and PID 1 by disabling use of PID 1's NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
         * SetLoginEnvironment= switch. */
        if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
                if (r < 0)
                        return log_exec_debug_errno(c,
                                                    p,
                                                    r,
                                                    "Failed to determine user credentials for root: %m");
        }

        bool set_user_login_env = exec_context_get_set_login_environment(c);

        if (username) {
                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (set_user_login_env) {
                        x = strjoin("LOGNAME=", username);
                        if (!x)
                                return -ENOMEM;
                        our_env[n_env++] = x;
                }
        }

        if (home && set_user_login_env) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Skip the 5 characters of "HOME=" so that only the path part is simplified in place. */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (shell && set_user_login_env) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Likewise, skip the 6 characters of "SHELL=". */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(p->invocation_id)) {
                assert(p->invocation_id_string);

                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                /* 'cmdline' owns the string 'term' may point to, hence it has to stay alive until $TERM
                 * has been assembled below. */
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* Otherwise check the kernel command line for a per-TTY override of the form
                         * systemd.tty.term.<tty>=. A failure to read it is logged and ignored. */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_exec_debug_errno(c,
                                                     p,
                                                     r,
                                                     "Failed to read %s from kernel cmdline, ignoring: %m",
                                                     key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Nothing inherited and nothing configured: fall back to the default for this TTY. */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* For each directory type that has a prefix and at least one configured item, export the
         * colon-separated list of the prefixed paths in the variable named by
         * exec_directory_env_name_to_string(). */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        _cleanup_free_ char *creds_dir = NULL;
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
        if (r < 0)
                return r;
        if (r > 0) {
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        /* Format "<type> <threshold> <window>". If no threshold is configured
                         * (USEC_INFINITY) the default one is used, otherwise the configured value is
                         * clamped to the range 1...default window. */
                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* The "+ 1" makes the trailing NUL byte part of the Base64-encoded data. */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly smaller: at least one slot of the array must remain unused. */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2064
2065 static int build_pass_environment(const ExecContext *c, char ***ret) {
2066         _cleanup_strv_free_ char **pass_env = NULL;
2067         size_t n_env = 0;
2068
2069         STRV_FOREACH(i, c->pass_environment) {
2070                 _cleanup_free_ char *x = NULL;
2071                 char *v;
2072
2073                 v = getenv(*i);
2074                 if (!v)
2075                         continue;
2076                 x = strjoin(*i, "=", v);
2077                 if (!x)
2078                         return -ENOMEM;
2079
2080                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2081                         return -ENOMEM;
2082
2083                 pass_env[n_env++] = TAKE_PTR(x);
2084                 pass_env[n_env] = NULL;
2085         }
2086
2087         *ret = TAKE_PTR(pass_env);
2088
2089         return 0;
2090 }
2091
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked helper
 * child write the UID/GID maps for it.
 *
 * ouid/ogid are the original UID/GID (from before any user or group change), uid/gid the ones selected
 * for the service. The original IDs are always mapped to themselves; the selected ones are additionally
 * mapped to themselves if they are valid, differ from the original ones, and we have the effective
 * CAP_SETUID (resp. CAP_SETGID) capability.
 *
 * Returns 0 on success and a negative errno-style error otherwise. -EIO is returned if the child sent
 * a non-negative "error" code, sent a truncated message, or did not exit with EXIT_SUCCESS. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        /* NOTE(review): presumably sigkill_waitp kills and reaps the helper on early return; confirm. */
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        /* Counter value the parent writes to the eventfd; the child's read() of the eventfd stores
         * into its own copy of this variable. */
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                /* The parent's PID is what the procfs paths below are built for. */
                ppid = getppid();
                /* We only ever write to the error pipe, hence close the read end on our side. */
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative errno to the parent; it is read back as an int there. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below can return 0 bytes when
         * the child finishes without having reported an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() hands the PID over to the wait call, so that the cleanup handler on 'pid' no
         * longer sees it (assuming TAKE_PID() resets the variable — TODO confirm). */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2250
2251 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2252         _cleanup_free_ char *src_abs = NULL;
2253         int r;
2254
2255         assert(source);
2256
2257         src_abs = path_join(root, source);
2258         if (!src_abs)
2259                 return -ENOMEM;
2260
2261         STRV_FOREACH(dst, symlinks) {
2262                 _cleanup_free_ char *dst_abs = NULL;
2263
2264                 dst_abs = path_join(root, *dst);
2265                 if (!dst_abs)
2266                         return -ENOMEM;
2267
2268                 r = mkdir_parents_label(dst_abs, 0755);
2269                 if (r < 0)
2270                         return r;
2271
2272                 r = symlink_idempotent(src_abs, dst_abs, true);
2273                 if (r < 0)
2274                         return r;
2275         }
2276
2277         return 0;
2278 }
2279
2280 static int setup_exec_directory(
2281                 const ExecContext *context,
2282                 const ExecParameters *params,
2283                 uid_t uid,
2284                 gid_t gid,
2285                 ExecDirectoryType type,
2286                 bool needs_mount_namespace,
2287                 int *exit_status) {
2288
2289         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2290                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2291                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2292                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2293                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2294                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2295         };
2296         int r;
2297
2298         assert(context);
2299         assert(params);
2300         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2301         assert(exit_status);
2302
2303         if (!params->prefix[type])
2304                 return 0;
2305
2306         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2307                 if (!uid_is_valid(uid))
2308                         uid = 0;
2309                 if (!gid_is_valid(gid))
2310                         gid = 0;
2311         }
2312
2313         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2314                 _cleanup_free_ char *p = NULL, *pp = NULL;
2315
2316                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2317                 if (!p) {
2318                         r = -ENOMEM;
2319                         goto fail;
2320                 }
2321
2322                 r = mkdir_parents_label(p, 0755);
2323                 if (r < 0)
2324                         goto fail;
2325
2326                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2327
2328                         /* If we are in user mode, and a configuration directory exists but a state directory
2329                          * doesn't exist, then we likely are upgrading from an older systemd version that
2330                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2331                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2333                          * separated. If a service has both dirs configured but only the configuration dir
2334                          * exists and the state dir does not, we assume we are looking at an update
2335                          * situation. Hence, create a compatibility symlink, so that all expectations are
2336                          * met.
2337                          *
2338                          * (We also do something similar with the log directory, which still doesn't exist in
2339                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2340
2341                         /* this assumes the state dir is always created before the configuration dir */
2342                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2343                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2344
2345                         r = laccess(p, F_OK);
2346                         if (r == -ENOENT) {
2347                                 _cleanup_free_ char *q = NULL;
2348
2349                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2350                                  * under the configuration hierarchy. */
2351
2352                                 if (type == EXEC_DIRECTORY_STATE)
2353                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2354                                 else if (type == EXEC_DIRECTORY_LOGS)
2355                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2356                                 else
2357                                         assert_not_reached();
2358                                 if (!q) {
2359                                         r = -ENOMEM;
2360                                         goto fail;
2361                                 }
2362
2363                                 r = laccess(q, F_OK);
2364                                 if (r >= 0) {
2365                                         /* It does exist! This hence looks like an update. Symlink the
2366                                          * configuration directory into the state directory. */
2367
2368                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2369                                         if (r < 0)
2370                                                 goto fail;
2371
2372                                         log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2373                                         continue;
2374                                 } else if (r != -ENOENT)
2375                                         log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2376
2377                         } else if (r < 0)
2378                                 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2379                 }
2380
2381                 if (exec_directory_is_private(context, type)) {
2382                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2383                          * case we want to avoid leaving a directory around fully accessible that is owned by
2384                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2385                          * trick used by container managers to prohibit host users to get access to files of
2386                          * the same UID in containers: we place everything inside a directory that has an
2387                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2388                          * for unprivileged host code. We then use fs namespacing to make this directory
2389                          * permeable for the service itself.
2390                          *
2391                          * Specifically: for a service which wants a special directory "foo/" we first create
2392                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2393                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2394                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2395                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2396                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2397                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2398                          * for the service and making sure it only gets access to the dirs it needs but no
2399                          * others. Tricky? Yes, absolutely, but it works!
2400                          *
2401                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2402                          * to be owned by the service itself.
2403                          *
2404                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2405                          * for sharing files or sockets with other services. */
2406
2407                         pp = path_join(params->prefix[type], "private");
2408                         if (!pp) {
2409                                 r = -ENOMEM;
2410                                 goto fail;
2411                         }
2412
2413                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2414                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2415                         if (r < 0)
2416                                 goto fail;
2417
2418                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2419                                 r = -ENOMEM;
2420                                 goto fail;
2421                         }
2422
2423                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2424                         r = mkdir_parents_label(pp, 0755);
2425                         if (r < 0)
2426                                 goto fail;
2427
2428                         if (is_dir(p, false) > 0 &&
2429                             (laccess(pp, F_OK) == -ENOENT)) {
2430
2431                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2432                                  * it over. Most likely the service has been upgraded from one that didn't use
2433                                  * DynamicUser=1, to one that does. */
2434
2435                                 log_exec_info(context,
2436                                               params,
2437                                               "Found pre-existing public %s= directory %s, migrating to %s.\n"
2438                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2439                                               exec_directory_type_to_string(type), p, pp);
2440
2441                                 r = RET_NERRNO(rename(p, pp));
2442                                 if (r < 0)
2443                                         goto fail;
2444                         } else {
2445                                 /* Otherwise, create the actual directory for the service */
2446
2447                                 r = mkdir_label(pp, context->directories[type].mode);
2448                                 if (r < 0 && r != -EEXIST)
2449                                         goto fail;
2450                         }
2451
2452                         if (!context->directories[type].items[i].only_create) {
2453                                 /* And link it up from the original place.
2454                                  * Notes
2455                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2456                                  *    the host, and a new one for the child namespace will be created later.
2457                                  * 2) It is not necessary to create this symlink when one of its parent
2458                                  *    directories is specified and already created. E.g.
2459                                  *        StateDirectory=foo foo/bar
2460                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2461                                  *        pp = "/var/lib/private/foo/bar"
2462                                  *        p = "/var/lib/foo/bar"
2463                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2464                                  *    we do not need to create the symlink, but we cannot create the symlink.
2465                                  *    See issue #24783. */
2466                                 r = symlink_idempotent(pp, p, true);
2467                                 if (r < 0)
2468                                         goto fail;
2469                         }
2470
2471                 } else {
2472                         _cleanup_free_ char *target = NULL;
2473
2474                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2475                             readlink_and_make_absolute(p, &target) >= 0) {
2476                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2477
2478                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2479                                  * by DynamicUser=1 (see above)?
2480                                  *
2481                                  * We do this for all directory types except for ConfigurationDirectory=,
2482                                  * since they all support the private/ symlink logic at least in some
2483                                  * configurations, see above. */
2484
2485                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2486                                 if (r < 0)
2487                                         goto fail;
2488
2489                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2490                                 if (!q) {
2491                                         r = -ENOMEM;
2492                                         goto fail;
2493                                 }
2494
2495                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2496                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2497                                 if (r < 0)
2498                                         goto fail;
2499
2500                                 if (path_equal(q_resolved, target_resolved)) {
2501
2502                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2503                                          * but is no longer. Let's move the directory back up. */
2504
2505                                         log_exec_info(context,
2506                                                       params,
2507                                                       "Found pre-existing private %s= directory %s, migrating to %s.\n"
2508                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2509                                                       exec_directory_type_to_string(type), q, p);
2510
2511                                         r = RET_NERRNO(unlink(p));
2512                                         if (r < 0)
2513                                                 goto fail;
2514
2515                                         r = RET_NERRNO(rename(q, p));
2516                                         if (r < 0)
2517                                                 goto fail;
2518                                 }
2519                         }
2520
2521                         r = mkdir_label(p, context->directories[type].mode);
2522                         if (r < 0) {
2523                                 if (r != -EEXIST)
2524                                         goto fail;
2525
2526                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2527                                         struct stat st;
2528
2529                                         /* Don't change the owner/access mode of the configuration directory,
2530                                          * as in the common case it is not written to by a service, and shall
2531                                          * not be writable. */
2532
2533                                         r = RET_NERRNO(stat(p, &st));
2534                                         if (r < 0)
2535                                                 goto fail;
2536
2537                                         /* Still complain if the access mode doesn't match */
2538                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2539                                                 log_exec_warning(context,
2540                                                                  params,
2541                                                                  "%s \'%s\' already exists but the mode is different. "
2542                                                                  "(File system: %o %sMode: %o)",
2543                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2544                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2545
2546                                         continue;
2547                                 }
2548                         }
2549                 }
2550
2551                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2552                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2553                  * current UID/GID ownership.) */
2554                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2555                 if (r < 0)
2556                         goto fail;
2557
2558                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2559                  * available to user code anyway */
2560                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2561                         continue;
2562
2563                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2564                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2565                  * assignments to exist. */
2566                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2567                 if (r < 0)
2568                         goto fail;
2569         }
2570
2571         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2572          * they are set up later, to allow configuring empty var/run/etc. */
2573         if (!needs_mount_namespace)
2574                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2575                         r = create_many_symlinks(params->prefix[type],
2576                                                  context->directories[type].items[i].path,
2577                                                  context->directories[type].items[i].symlinks);
2578                         if (r < 0)
2579                                 goto fail;
2580                 }
2581
2582         return 0;
2583
2584 fail:
2585         *exit_status = exit_status_table[type];
2586         return r;
2587 }
2588
2589 #if ENABLE_SMACK
2590 static int setup_smack(
2591                 const ExecParameters *params,
2592                 const ExecContext *context,
2593                 int executable_fd) {
2594         int r;
2595
2596         assert(params);
2597         assert(executable_fd >= 0);
2598
2599         if (context->smack_process_label) {
2600                 r = mac_smack_apply_pid(0, context->smack_process_label);
2601                 if (r < 0)
2602                         return r;
2603         } else if (params->fallback_smack_process_label) {
2604                 _cleanup_free_ char *exec_label = NULL;
2605
2606                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2607                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2608                         return r;
2609
2610                 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
2611                 if (r < 0)
2612                         return r;
2613         }
2614
2615         return 0;
2616 }
2617 #endif
2618
2619 static int compile_bind_mounts(
2620                 const ExecContext *context,
2621                 const ExecParameters *params,
2622                 BindMount **ret_bind_mounts,
2623                 size_t *ret_n_bind_mounts,
2624                 char ***ret_empty_directories) {
2625
2626         _cleanup_strv_free_ char **empty_directories = NULL;
2627         BindMount *bind_mounts = NULL;
2628         size_t n, h = 0;
2629         int r;
2630
2631         assert(context);
2632         assert(params);
2633         assert(ret_bind_mounts);
2634         assert(ret_n_bind_mounts);
2635         assert(ret_empty_directories);
2636
2637         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2638
2639         n = context->n_bind_mounts;
2640         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2641                 if (!params->prefix[t])
2642                         continue;
2643
2644                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2645                         n += !context->directories[t].items[i].only_create;
2646         }
2647
2648         if (n <= 0) {
2649                 *ret_bind_mounts = NULL;
2650                 *ret_n_bind_mounts = 0;
2651                 *ret_empty_directories = NULL;
2652                 return 0;
2653         }
2654
2655         bind_mounts = new(BindMount, n);
2656         if (!bind_mounts)
2657                 return -ENOMEM;
2658
2659         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2660                 BindMount *item = context->bind_mounts + i;
2661                 _cleanup_free_ char *s = NULL, *d = NULL;
2662
2663                 s = strdup(item->source);
2664                 if (!s)
2665                         return -ENOMEM;
2666
2667                 d = strdup(item->destination);
2668                 if (!d)
2669                         return -ENOMEM;
2670
2671                 bind_mounts[h++] = (BindMount) {
2672                         .source = TAKE_PTR(s),
2673                         .destination = TAKE_PTR(d),
2674                         .read_only = item->read_only,
2675                         .recursive = item->recursive,
2676                         .ignore_enoent = item->ignore_enoent,
2677                 };
2678         }
2679
2680         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2681                 if (!params->prefix[t])
2682                         continue;
2683
2684                 if (context->directories[t].n_items == 0)
2685                         continue;
2686
2687                 if (exec_directory_is_private(context, t) &&
2688                     !exec_context_with_rootfs(context)) {
2689                         char *private_root;
2690
2691                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2692                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2693                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2694
2695                         private_root = path_join(params->prefix[t], "private");
2696                         if (!private_root)
2697                                 return -ENOMEM;
2698
2699                         r = strv_consume(&empty_directories, private_root);
2700                         if (r < 0)
2701                                 return r;
2702                 }
2703
2704                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2705                         _cleanup_free_ char *s = NULL, *d = NULL;
2706
2707                         /* When one of the parent directories is in the list, we cannot create the symlink
2708                          * for the child directory. See also the comments in setup_exec_directory(). */
2709                         if (context->directories[t].items[i].only_create)
2710                                 continue;
2711
2712                         if (exec_directory_is_private(context, t))
2713                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2714                         else
2715                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2716                         if (!s)
2717                                 return -ENOMEM;
2718
2719                         if (exec_directory_is_private(context, t) &&
2720                             exec_context_with_rootfs(context))
2721                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2722                                  * directory is not created on the root directory. So, let's bind-mount the directory
2723                                  * on the 'non-private' place. */
2724                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2725                         else
2726                                 d = strdup(s);
2727                         if (!d)
2728                                 return -ENOMEM;
2729
2730                         bind_mounts[h++] = (BindMount) {
2731                                 .source = TAKE_PTR(s),
2732                                 .destination = TAKE_PTR(d),
2733                                 .read_only = false,
2734                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2735                                 .recursive = true,
2736                                 .ignore_enoent = false,
2737                         };
2738                 }
2739         }
2740
2741         assert(h == n);
2742
2743         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2744         *ret_n_bind_mounts = n;
2745         *ret_empty_directories = TAKE_PTR(empty_directories);
2746
2747         return (int) n;
2748 }
2749
2750 /* ret_symlinks will contain a list of pairs src:dest that describes
2751  * the symlinks to create later on. For example, the symlinks needed
2752  * to safely give private directories to DynamicUser=1 users. */
2753 static int compile_symlinks(
2754                 const ExecContext *context,
2755                 const ExecParameters *params,
2756                 bool setup_os_release_symlink,
2757                 char ***ret_symlinks) {
2758
2759         _cleanup_strv_free_ char **symlinks = NULL;
2760         int r;
2761
2762         assert(context);
2763         assert(params);
2764         assert(ret_symlinks);
2765
2766         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2767                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2768                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2769
2770                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2771                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2772
2773                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2774                                 dst_abs = path_join(params->prefix[dt], *symlink);
2775                                 if (!src_abs || !dst_abs)
2776                                         return -ENOMEM;
2777
2778                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2779                                 if (r < 0)
2780                                         return r;
2781                         }
2782
2783                         if (!exec_directory_is_private(context, dt) ||
2784                             exec_context_with_rootfs(context) ||
2785                             context->directories[dt].items[i].only_create)
2786                                 continue;
2787
2788                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2789                         if (!private_path)
2790                                 return -ENOMEM;
2791
2792                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2793                         if (!path)
2794                                 return -ENOMEM;
2795
2796                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2797                         if (r < 0)
2798                                 return r;
2799                 }
2800         }
2801
2802         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2803          * and readers will never get a half-written version. Note that, while the paths specified here are
2804          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2805          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2806         if (setup_os_release_symlink) {
2807                 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2808                 if (r < 0)
2809                         return r;
2810
2811                 r = strv_extend(&symlinks, "/run/host/os-release");
2812                 if (r < 0)
2813                         return r;
2814         }
2815
2816         *ret_symlinks = TAKE_PTR(symlinks);
2817
2818         return 0;
2819 }
2820
2821 static bool insist_on_sandboxing(
2822                 const ExecContext *context,
2823                 const char *root_dir,
2824                 const char *root_image,
2825                 const BindMount *bind_mounts,
2826                 size_t n_bind_mounts) {
2827
2828         assert(context);
2829         assert(n_bind_mounts == 0 || bind_mounts);
2830
2831         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2832          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2833          * rearrange stuff in a way we cannot ignore gracefully. */
2834
2835         if (context->n_temporary_filesystems > 0)
2836                 return true;
2837
2838         if (root_dir || root_image)
2839                 return true;
2840
2841         if (context->n_mount_images > 0)
2842                 return true;
2843
2844         if (context->dynamic_user)
2845                 return true;
2846
2847         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2848                 return true;
2849
2850         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2851          * essential. */
2852         for (size_t i = 0; i < n_bind_mounts; i++)
2853                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2854                         return true;
2855
2856         if (context->log_namespace)
2857                 return true;
2858
2859         return false;
2860 }
2861
2862 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2863         _cleanup_close_ int fd = -EBADF;
2864         int r;
2865
2866         if (!runtime || !runtime->ephemeral_copy)
2867                 return 0;
2868
2869         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2870         if (r < 0)
2871                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2872
2873         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2874
2875         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2876         if (fd >= 0)
2877                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2878                 return 0;
2879
2880         if (fd != -EAGAIN)
2881                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2882
2883         log_debug("Making ephemeral snapshot of %s to %s",
2884                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2885
2886         if (context->root_image)
2887                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
2888                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
2889         else
2890                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
2891                                               AT_FDCWD, runtime->ephemeral_copy,
2892                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
2893                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2894                                               BTRFS_SNAPSHOT_RECURSIVE |
2895                                               BTRFS_SNAPSHOT_LOCK_BSD);
2896         if (fd < 0)
2897                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
2898                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
2899
2900         if (context->root_image) {
2901                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2902                  * which tends to not perform well in combination with lots of random writes.
2903                  *
2904                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2905                  * copy, but we at least want to make the intention clear.
2906                  */
2907                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2908                 if (r < 0)
2909                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
2910         }
2911
2912         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2913         if (r < 0)
2914                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2915
2916         return 1;
2917 }
2918
2919 static int verity_settings_prepare(
2920                 VeritySettings *verity,
2921                 const char *root_image,
2922                 const void *root_hash,
2923                 size_t root_hash_size,
2924                 const char *root_hash_path,
2925                 const void *root_hash_sig,
2926                 size_t root_hash_sig_size,
2927                 const char *root_hash_sig_path,
2928                 const char *verity_data_path) {
2929
2930         int r;
2931
2932         assert(verity);
2933
2934         if (root_hash) {
2935                 void *d;
2936
2937                 d = memdup(root_hash, root_hash_size);
2938                 if (!d)
2939                         return -ENOMEM;
2940
2941                 free_and_replace(verity->root_hash, d);
2942                 verity->root_hash_size = root_hash_size;
2943                 verity->designator = PARTITION_ROOT;
2944         }
2945
2946         if (root_hash_sig) {
2947                 void *d;
2948
2949                 d = memdup(root_hash_sig, root_hash_sig_size);
2950                 if (!d)
2951                         return -ENOMEM;
2952
2953                 free_and_replace(verity->root_hash_sig, d);
2954                 verity->root_hash_sig_size = root_hash_sig_size;
2955                 verity->designator = PARTITION_ROOT;
2956         }
2957
2958         if (verity_data_path) {
2959                 r = free_and_strdup(&verity->data_path, verity_data_path);
2960                 if (r < 0)
2961                         return r;
2962         }
2963
2964         r = verity_settings_load(
2965                         verity,
2966                         root_image,
2967                         root_hash_path,
2968                         root_hash_sig_path);
2969         if (r < 0)
2970                 return log_debug_errno(r, "Failed to load root hash: %m");
2971
2972         return 0;
2973 }
2974
2975 static int apply_mount_namespace(
2976                 ExecCommandFlags command_flags,
2977                 const ExecContext *context,
2978                 const ExecParameters *params,
2979                 ExecRuntime *runtime,
2980                 const char *memory_pressure_path,
2981                 char **error_path) {
2982
2983         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
2984         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
2985                         **read_write_paths_cleanup = NULL;
2986         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
2987                         *extension_dir = NULL, *host_os_release_stage = NULL;
2988         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
2989         char **read_write_paths;
2990         bool needs_sandboxing, setup_os_release_symlink;
2991         BindMount *bind_mounts = NULL;
2992         size_t n_bind_mounts = 0;
2993         int r;
2994
2995         assert(context);
2996
2997         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
2998
2999         if (params->flags & EXEC_APPLY_CHROOT) {
3000                 r = setup_ephemeral(context, runtime);
3001                 if (r < 0)
3002                         return r;
3003
3004                 if (context->root_image)
3005                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3006                 else
3007                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3008         }
3009
3010         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3011         if (r < 0)
3012                 return r;
3013
3014         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3015          * service will need to write to it in order to start the notifications. */
3016         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3017                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3018                 if (!read_write_paths_cleanup)
3019                         return -ENOMEM;
3020
3021                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3022                 if (r < 0)
3023                         return r;
3024
3025                 read_write_paths = read_write_paths_cleanup;
3026         } else
3027                 read_write_paths = context->read_write_paths;
3028
3029         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3030         if (needs_sandboxing) {
3031                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3032                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3033                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3034
3035                 if (context->private_tmp && runtime && runtime->shared) {
3036                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3037                                 tmp_dir = runtime->shared->tmp_dir;
3038                         else if (runtime->shared->tmp_dir)
3039                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3040
3041                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3042                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3043                         else if (runtime->shared->var_tmp_dir)
3044                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3045                 }
3046         }
3047
3048         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3049         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3050         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3051         if (r < 0)
3052                 return r;
3053
3054         if (context->mount_propagation_flag == MS_SHARED)
3055                 log_exec_debug(context,
3056                                params,
3057                                "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3058
3059         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3060                 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3061                 if (r < 0)
3062                         return r;
3063         }
3064
3065         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3066                 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3067                 if (!propagate_dir)
3068                         return -ENOMEM;
3069
3070                 incoming_dir = strdup("/run/systemd/incoming");
3071                 if (!incoming_dir)
3072                         return -ENOMEM;
3073
3074                 extension_dir = strdup("/run/systemd/unit-extensions");
3075                 if (!extension_dir)
3076                         return -ENOMEM;
3077
3078                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3079                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3080                 if (setup_os_release_symlink) {
3081                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3082                         if (!host_os_release_stage)
3083                                 return -ENOMEM;
3084                 }
3085         } else {
3086                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3087
3088                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3089                         return -ENOMEM;
3090
3091                 if (setup_os_release_symlink) {
3092                         if (asprintf(&host_os_release_stage,
3093                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3094                                      geteuid()) < 0)
3095                                 return -ENOMEM;
3096                 }
3097         }
3098
3099         if (root_image) {
3100                 r = verity_settings_prepare(
3101                         &verity,
3102                         root_image,
3103                         context->root_hash, context->root_hash_size, context->root_hash_path,
3104                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3105                         context->root_verity);
3106                 if (r < 0)
3107                         return r;
3108         }
3109
3110         NamespaceParameters parameters = {
3111                 .runtime_scope = params->runtime_scope,
3112
3113                 .root_directory = root_dir,
3114                 .root_image = root_image,
3115                 .root_image_options = context->root_image_options,
3116                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3117
3118                 .read_write_paths = read_write_paths,
3119                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3120                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3121
3122                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3123                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3124
3125                 .empty_directories = empty_directories,
3126                 .symlinks = symlinks,
3127
3128                 .bind_mounts = bind_mounts,
3129                 .n_bind_mounts = n_bind_mounts,
3130
3131                 .temporary_filesystems = context->temporary_filesystems,
3132                 .n_temporary_filesystems = context->n_temporary_filesystems,
3133
3134                 .mount_images = context->mount_images,
3135                 .n_mount_images = context->n_mount_images,
3136                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3137
3138                 .tmp_dir = tmp_dir,
3139                 .var_tmp_dir = var_tmp_dir,
3140
3141                 .creds_path = creds_path,
3142                 .log_namespace = context->log_namespace,
3143                 .mount_propagation_flag = context->mount_propagation_flag,
3144
3145                 .verity = &verity,
3146
3147                 .extension_images = context->extension_images,
3148                 .n_extension_images = context->n_extension_images,
3149                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3150                 .extension_directories = context->extension_directories,
3151
3152                 .propagate_dir = propagate_dir,
3153                 .incoming_dir = incoming_dir,
3154                 .extension_dir = extension_dir,
3155                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3156                 .host_os_release_stage = host_os_release_stage,
3157
                /* If DynamicUser=no and RootDirectory= is set then let's pass a relaxed sandbox info,
                 * otherwise enforce it, don't ignore protected paths and fail if we are unable to apply the
                 * sandbox inside the mount namespace. */
3161                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3162
3163                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3164                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3165                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3166                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3167                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3168
3169                 .private_dev = needs_sandboxing && context->private_devices,
3170                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3171                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3172
3173                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3174
3175                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3176                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3177
3178                 .protect_home = needs_sandboxing ? context->protect_home : false,
3179                 .protect_system = needs_sandboxing ? context->protect_system : false,
3180                 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3181                 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3182         };
3183
3184         r = setup_namespace(&parameters, error_path);
3185         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3186          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3187          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3188          * completely different execution environment. */
3189         if (r == -ENOANO) {
3190                 if (insist_on_sandboxing(
3191                                     context,
3192                                     root_dir, root_image,
3193                                     bind_mounts,
3194                                     n_bind_mounts))
3195                         return log_exec_debug_errno(context,
3196                                                     params,
3197                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3198                                                     "Failed to set up namespace, and refusing to continue since "
3199                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3200                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3201                                                     n_bind_mounts,
3202                                                     context->n_temporary_filesystems,
3203                                                     yes_no(root_dir),
3204                                                     yes_no(root_image),
3205                                                     yes_no(context->dynamic_user));
3206
3207                 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3208                 return 0;
3209         }
3210
3211         return r;
3212 }
3213
3214 static int apply_working_directory(
3215                 const ExecContext *context,
3216                 const ExecParameters *params,
3217                 ExecRuntime *runtime,
3218                 const char *home,
3219                 int *exit_status) {
3220
3221         const char *d, *wd;
3222
3223         assert(context);
3224         assert(exit_status);
3225
3226         if (context->working_directory_home) {
3227
3228                 if (!home) {
3229                         *exit_status = EXIT_CHDIR;
3230                         return -ENXIO;
3231                 }
3232
3233                 wd = home;
3234
3235         } else
3236                 wd = empty_to_root(context->working_directory);
3237
3238         if (params->flags & EXEC_APPLY_CHROOT)
3239                 d = wd;
3240         else
3241                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3242
3243         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3244                 *exit_status = EXIT_CHDIR;
3245                 return -errno;
3246         }
3247
3248         return 0;
3249 }
3250
3251 static int apply_root_directory(
3252                 const ExecContext *context,
3253                 const ExecParameters *params,
3254                 ExecRuntime *runtime,
3255                 const bool needs_mount_ns,
3256                 int *exit_status) {
3257
3258         assert(context);
3259         assert(exit_status);
3260
3261         if (params->flags & EXEC_APPLY_CHROOT)
3262                 if (!needs_mount_ns && context->root_directory)
3263                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3264                                 *exit_status = EXIT_CHROOT;
3265                                 return -errno;
3266                         }
3267
3268         return 0;
3269 }
3270
3271 static int setup_keyring(
3272                 const ExecContext *context,
3273                 const ExecParameters *p,
3274                 uid_t uid, gid_t gid) {
3275
3276         key_serial_t keyring;
3277         int r = 0;
3278         uid_t saved_uid;
3279         gid_t saved_gid;
3280
3281         assert(context);
3282         assert(p);
3283
3284         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3285          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3286          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3287          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3288          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3289          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3290
3291         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3292                 return 0;
3293
3294         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3295          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3296          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3297          * & group is just as nasty as acquiring a reference to the user keyring. */
3298
3299         saved_uid = getuid();
3300         saved_gid = getgid();
3301
3302         if (gid_is_valid(gid) && gid != saved_gid) {
3303                 if (setregid(gid, -1) < 0)
3304                         return log_exec_error_errno(context,
3305                                                     p,
3306                                                     errno,
3307                                                     "Failed to change GID for user keyring: %m");
3308         }
3309
3310         if (uid_is_valid(uid) && uid != saved_uid) {
3311                 if (setreuid(uid, -1) < 0) {
3312                         r = log_exec_error_errno(context,
3313                                                  p,
3314                                                  errno,
3315                                                  "Failed to change UID for user keyring: %m");
3316                         goto out;
3317                 }
3318         }
3319
3320         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3321         if (keyring == -1) {
3322                 if (errno == ENOSYS)
3323                         log_exec_debug_errno(context,
3324                                              p,
3325                                              errno,
3326                                              "Kernel keyring not supported, ignoring.");
3327                 else if (ERRNO_IS_PRIVILEGE(errno))
3328                         log_exec_debug_errno(context,
3329                                              p,
3330                                              errno,
3331                                              "Kernel keyring access prohibited, ignoring.");
3332                 else if (errno == EDQUOT)
3333                         log_exec_debug_errno(context,
3334                                              p,
3335                                              errno,
3336                                              "Out of kernel keyrings to allocate, ignoring.");
3337                 else
3338                         r = log_exec_error_errno(context,
3339                                                  p,
3340                                                  errno,
3341                                                  "Setting up kernel keyring failed: %m");
3342
3343                 goto out;
3344         }
3345
3346         /* When requested link the user keyring into the session keyring. */
3347         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3348
3349                 if (keyctl(KEYCTL_LINK,
3350                            KEY_SPEC_USER_KEYRING,
3351                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3352                         r = log_exec_error_errno(context,
3353                                                  p,
3354                                                  errno,
3355                                                  "Failed to link user keyring into session keyring: %m");
3356                         goto out;
3357                 }
3358         }
3359
3360         /* Restore uid/gid back */
3361         if (uid_is_valid(uid) && uid != saved_uid) {
3362                 if (setreuid(saved_uid, -1) < 0) {
3363                         r = log_exec_error_errno(context,
3364                                                  p,
3365                                                  errno,
3366                                                  "Failed to change UID back for user keyring: %m");
3367                         goto out;
3368                 }
3369         }
3370
3371         if (gid_is_valid(gid) && gid != saved_gid) {
3372                 if (setregid(saved_gid, -1) < 0)
3373                         return log_exec_error_errno(context,
3374                                                     p,
3375                                                     errno,
3376                                                     "Failed to change GID back for user keyring: %m");
3377         }
3378
3379         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3380         if (!sd_id128_is_null(p->invocation_id)) {
3381                 key_serial_t key;
3382
3383                 key = add_key("user",
3384                               "invocation_id",
3385                               &p->invocation_id,
3386                               sizeof(p->invocation_id),
3387                               KEY_SPEC_SESSION_KEYRING);
3388                 if (key == -1)
3389                         log_exec_debug_errno(context,
3390                                              p,
3391                                              errno,
3392                                              "Failed to add invocation ID to keyring, ignoring: %m");
3393                 else {
3394                         if (keyctl(KEYCTL_SETPERM, key,
3395                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3396                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3397                                 r = log_exec_error_errno(context,
3398                                                          p,
3399                                                          errno,
3400                                                          "Failed to restrict invocation ID permission: %m");
3401                 }
3402         }
3403
3404 out:
3405         /* Revert back uid & gid for the last time, and exit */
3406         /* no extra logging, as only the first already reported error matters */
3407         if (getuid() != saved_uid)
3408                 (void) setreuid(saved_uid, -1);
3409
3410         if (getgid() != saved_gid)
3411                 (void) setregid(saved_gid, -1);
3412
3413         return r;
3414 }
3415
/* Appends each non-negative fd of the two-element 'pair' to 'array', advancing the element count
 * *n for every fd added. The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3426
3427 static int close_remaining_fds(
3428                 const ExecParameters *params,
3429                 const ExecRuntime *runtime,
3430                 int socket_fd,
3431                 const int *fds, size_t n_fds) {
3432
3433         size_t n_dont_close = 0;
3434         int dont_close[n_fds + 14];
3435
3436         assert(params);
3437
3438         if (params->stdin_fd >= 0)
3439                 dont_close[n_dont_close++] = params->stdin_fd;
3440         if (params->stdout_fd >= 0)
3441                 dont_close[n_dont_close++] = params->stdout_fd;
3442         if (params->stderr_fd >= 0)
3443                 dont_close[n_dont_close++] = params->stderr_fd;
3444
3445         if (socket_fd >= 0)
3446                 dont_close[n_dont_close++] = socket_fd;
3447         if (n_fds > 0) {
3448                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3449                 n_dont_close += n_fds;
3450         }
3451
3452         if (runtime)
3453                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3454
3455         if (runtime && runtime->shared) {
3456                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3457                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3458         }
3459
3460         if (runtime && runtime->dynamic_creds) {
3461                 if (runtime->dynamic_creds->user)
3462                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3463                 if (runtime->dynamic_creds->group)
3464                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3465         }
3466
3467         if (params->user_lookup_fd >= 0)
3468                 dont_close[n_dont_close++] = params->user_lookup_fd;
3469
3470         return close_all_fds(dont_close, n_dont_close);
3471 }
3472
/* Writes the resolved uid, gid and the unit name to user_lookup_fd with a single writev() call.
 *
 * Nothing is sent (and 0 is returned) if user_lookup_fd is negative or if neither uid nor gid is
 * valid. Returns 0 on success and -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing
         * the UID/GID data as well as the unit name. Note that we suppress sending this if no user/group
         * to resolve was specified. */

        if (user_lookup_fd < 0)
                return 0;

        if (!(uid_is_valid(uid) || gid_is_valid(gid)))
                return 0;

        const struct iovec payload[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, payload, 3) < 0)
                return -errno;

        return 0;
}
3500
3501 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3502         int r;
3503
3504         assert(c);
3505         assert(home);
3506         assert(buf);
3507
3508         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3509
3510         if (*home)
3511                 return 0;
3512
3513         if (!c->working_directory_home)
3514                 return 0;
3515
3516         r = get_home_dir(buf);
3517         if (r < 0)
3518                 return r;
3519
3520         *home = *buf;
3521         return 1;
3522 }
3523
3524 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3525         _cleanup_strv_free_ char ** list = NULL;
3526         int r;
3527
3528         assert(c);
3529         assert(p);
3530         assert(ret);
3531
3532         assert(c->dynamic_user);
3533
3534         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3535          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3536          * directories. */
3537
3538         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3539                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3540                         continue;
3541
3542                 if (!p->prefix[t])
3543                         continue;
3544
3545                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3546                         char *e;
3547
3548                         if (exec_directory_is_private(c, t))
3549                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3550                         else
3551                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3552                         if (!e)
3553                                 return -ENOMEM;
3554
3555                         r = strv_consume(&list, e);
3556                         if (r < 0)
3557                                 return r;
3558                 }
3559         }
3560
3561         *ret = TAKE_PTR(list);
3562
3563         return 0;
3564 }
3565
3566 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3567         _cleanup_(cpu_set_reset) CPUSet s = {};
3568         int r;
3569
3570         assert(c);
3571         assert(ret);
3572
3573         if (!c->numa_policy.nodes.set) {
3574                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3575                 return 0;
3576         }
3577
3578         r = numa_to_cpu_set(&c->numa_policy, &s);
3579         if (r < 0)
3580                 return r;
3581
3582         cpu_set_reset(ret);
3583
3584         return cpu_set_add_all(ret, &s);
3585 }
3586
3587 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3588         int r;
3589
3590         assert(fds);
3591         assert(n_fds);
3592         assert(*n_fds < fds_size);
3593         assert(fd);
3594
3595         if (*fd < 0)
3596                return 0;
3597
3598         if (*fd < 3 + (int) *n_fds) {
3599                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3600                  * the fds we pass to the process (or which are closed only during execve). */
3601
3602                 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3603                 if (r < 0)
3604                         return -errno;
3605
3606                 close_and_replace(*fd, r);
3607         }
3608
3609         fds[(*n_fds)++] = *fd;
3610         return 1;
3611 }
3612
3613 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3614         union sockaddr_union addr = {
3615                 .un.sun_family = AF_UNIX,
3616         };
3617         socklen_t sa_len;
3618         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3619         int r;
3620
3621         assert(c);
3622         assert(p);
3623         assert(of);
3624         assert(ofd >= 0);
3625
3626         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3627         if (r < 0)
3628                 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3629
3630         sa_len = r;
3631
3632         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3633                 _cleanup_close_ int fd = -EBADF;
3634
3635                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3636                 if (fd < 0)
3637                         return log_exec_error_errno(c,
3638                                                     p,
3639                                                     errno,
3640                                                     "Failed to create socket for %s: %m",
3641                                                     of->path);
3642
3643                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3644                 if (r == -EPROTOTYPE)
3645                         continue;
3646                 if (r < 0)
3647                         return log_exec_error_errno(c,
3648                                                     p,
3649                                                     r,
3650                                                     "Failed to connect socket for %s: %m",
3651                                                     of->path);
3652
3653                 return TAKE_FD(fd);
3654         }
3655
3656         return log_exec_error_errno(c,
3657                                     p,
3658                                     SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3659                                     of->path);
3660 }
3661
3662 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3663         struct stat st;
3664         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3665
3666         assert(c);
3667         assert(p);
3668         assert(of);
3669
3670         ofd = open(of->path, O_PATH | O_CLOEXEC);
3671         if (ofd < 0)
3672                 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3673
3674         if (fstat(ofd, &st) < 0)
3675                 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3676
3677         if (S_ISSOCK(st.st_mode)) {
3678                 fd = connect_unix_harder(c, p, of, ofd);
3679                 if (fd < 0)
3680                         return fd;
3681
3682                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3683                         return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3684                                                     of->path);
3685
3686                 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3687         } else {
3688                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3689                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3690                         flags |= O_APPEND;
3691                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3692                         flags |= O_TRUNC;
3693
3694                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3695                 if (fd < 0)
3696                         return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3697
3698                 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3699         }
3700
3701         return TAKE_FD(fd);
3702 }
3703
3704 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3705         int r;
3706
3707         assert(c);
3708         assert(p);
3709         assert(n_fds);
3710
3711         LIST_FOREACH(open_files, of, p->open_files) {
3712                 _cleanup_close_ int fd = -EBADF;
3713
3714                 fd = get_open_file_fd(c, p, of);
3715                 if (fd < 0) {
3716                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3717                                 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3718                                 continue;
3719                         }
3720
3721                         return fd;
3722                 }
3723
3724                 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3725                         return -ENOMEM;
3726
3727                 r = strv_extend(&p->fd_names, of->fdname);
3728                 if (r < 0)
3729                         return r;
3730
3731                 p->fds[*n_fds] = TAKE_FD(fd);
3732
3733                 (*n_fds)++;
3734         }
3735
3736         return 0;
3737 }
3738
3739 static void log_command_line(
3740                 const ExecContext *context,
3741                 const ExecParameters *params,
3742                 const char *msg,
3743                 const char *executable,
3744                 char **argv) {
3745
3746         assert(context);
3747         assert(params);
3748         assert(msg);
3749         assert(executable);
3750
3751         if (!DEBUG_LOGGING)
3752                 return;
3753
3754         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3755
3756         log_exec_struct(context, params, LOG_DEBUG,
3757                         "EXECUTABLE=%s", executable,
3758                         LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3759                         LOG_EXEC_INVOCATION_ID(params));
3760 }
3761
3762 static bool exec_context_need_unprivileged_private_users(
3763                 const ExecContext *context,
3764                 const ExecParameters *params) {
3765
3766         assert(context);
3767         assert(params);
3768
3769         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3770          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3771          * (system manager) then we have privileges and don't need this. */
3772         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3773                 return false;
3774
3775         return context->private_users ||
3776                context->private_tmp ||
3777                context->private_devices ||
3778                context->private_network ||
3779                context->network_namespace_path ||
3780                context->private_ipc ||
3781                context->ipc_namespace_path ||
3782                context->private_mounts > 0 ||
3783                context->mount_apivfs ||
3784                context->n_bind_mounts > 0 ||
3785                context->n_temporary_filesystems > 0 ||
3786                context->root_directory ||
3787                !strv_isempty(context->extension_directories) ||
3788                context->protect_system != PROTECT_SYSTEM_NO ||
3789                context->protect_home != PROTECT_HOME_NO ||
3790                context->protect_kernel_tunables ||
3791                context->protect_kernel_modules ||
3792                context->protect_kernel_logs ||
3793                context->protect_control_groups ||
3794                context->protect_clock ||
3795                context->protect_hostname ||
3796                !strv_isempty(context->read_write_paths) ||
3797                !strv_isempty(context->read_only_paths) ||
3798                !strv_isempty(context->inaccessible_paths) ||
3799                !strv_isempty(context->exec_paths) ||
3800                !strv_isempty(context->no_exec_paths);
3801 }
3802
3803 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3804         assert(context);
3805
3806         if (confirm_spawn_disabled())
3807                 return false;
3808
3809         /* For some reasons units remaining in the same process group
3810          * as PID 1 fail to acquire the console even if it's not used
3811          * by any process. So skip the confirmation question for them. */
3812         return !context->same_pgrp;
3813 }
3814
3815 static int exec_context_named_iofds(
3816                 const ExecContext *c,
3817                 const ExecParameters *p,
3818                 int named_iofds[static 3]) {
3819
3820         size_t targets;
3821         const char* stdio_fdname[3];
3822         size_t n_fds;
3823
3824         assert(c);
3825         assert(p);
3826         assert(named_iofds);
3827
3828         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3829                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3830                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3831
3832         for (size_t i = 0; i < 3; i++)
3833                 stdio_fdname[i] = exec_context_fdname(c, i);
3834
3835         n_fds = p->n_storage_fds + p->n_socket_fds;
3836
3837         for (size_t i = 0; i < n_fds  && targets > 0; i++)
3838                 if (named_iofds[STDIN_FILENO] < 0 &&
3839                     c->std_input == EXEC_INPUT_NAMED_FD &&
3840                     stdio_fdname[STDIN_FILENO] &&
3841                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3842
3843                         named_iofds[STDIN_FILENO] = p->fds[i];
3844                         targets--;
3845
3846                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3847                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3848                            stdio_fdname[STDOUT_FILENO] &&
3849                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3850
3851                         named_iofds[STDOUT_FILENO] = p->fds[i];
3852                         targets--;
3853
3854                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3855                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3856                            stdio_fdname[STDERR_FILENO] &&
3857                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3858
3859                         named_iofds[STDERR_FILENO] = p->fds[i];
3860                         targets--;
3861                 }
3862
3863         return targets == 0 ? 0 : -ENOENT;
3864 }
3865
3866 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3867         if (!shared)
3868                 return;
3869
3870         safe_close_pair(shared->netns_storage_socket);
3871         safe_close_pair(shared->ipcns_storage_socket);
3872 }
3873
3874 static void exec_runtime_close(ExecRuntime *rt) {
3875         if (!rt)
3876                 return;
3877
3878         safe_close_pair(rt->ephemeral_storage_socket);
3879
3880         exec_shared_runtime_close(rt->shared);
3881         dynamic_creds_close(rt->dynamic_creds);
3882 }
3883
3884 static void exec_params_close(ExecParameters *p) {
3885         if (!p)
3886                 return;
3887
3888         p->stdin_fd = safe_close(p->stdin_fd);
3889         p->stdout_fd = safe_close(p->stdout_fd);
3890         p->stderr_fd = safe_close(p->stderr_fd);
3891 }
3892
3893 int exec_invoke(
3894                 const ExecCommand *command,
3895                 const ExecContext *context,
3896                 ExecParameters *params,
3897                 ExecRuntime *runtime,
3898                 const CGroupContext *cgroup_context,
3899                 int *exit_status) {
3900
3901         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3902         int r, ngids = 0;
3903         _cleanup_free_ gid_t *supplementary_gids = NULL;
3904         const char *username = NULL, *groupname = NULL;
3905         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3906         const char *home = NULL, *shell = NULL;
3907         char **final_argv = NULL;
3908         dev_t journal_stream_dev = 0;
3909         ino_t journal_stream_ino = 0;
3910         bool userns_set_up = false;
3911         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3912                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3913                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3914                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3915         bool keep_seccomp_privileges = false;
3916 #if HAVE_SELINUX
3917         _cleanup_free_ char *mac_selinux_context_net = NULL;
3918         bool use_selinux = false;
3919 #endif
3920 #if ENABLE_SMACK
3921         bool use_smack = false;
3922 #endif
3923 #if HAVE_APPARMOR
3924         bool use_apparmor = false;
3925 #endif
3926 #if HAVE_SECCOMP
3927         uint64_t saved_bset = 0;
3928 #endif
3929         uid_t saved_uid = getuid();
3930         gid_t saved_gid = getgid();
3931         uid_t uid = UID_INVALID;
3932         gid_t gid = GID_INVALID;
3933         size_t n_fds, /* fds to pass to the child */
3934                n_keep_fds; /* total number of fds not to close */
3935         int secure_bits;
3936         _cleanup_free_ gid_t *gids_after_pam = NULL;
3937         int ngids_after_pam = 0;
3938
3939         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
3940         size_t n_storage_fds, n_socket_fds;
3941
3942         assert(command);
3943         assert(context);
3944         assert(params);
3945         assert(exit_status);
3946
3947         /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
3948          * and is already applied earlier. Just for safety. */
3949         if (context->log_level_max >= 0)
3950                 log_set_max_level(context->log_level_max);
3951
3952         /* Explicitly test for CVE-2021-4034 inspired invocations */
3953         if (!command->path || strv_isempty(command->argv)) {
3954                 *exit_status = EXIT_EXEC;
3955                 return log_exec_error_errno(
3956                                 context,
3957                                 params,
3958                                 SYNTHETIC_ERRNO(EINVAL),
3959                                 "Invalid command line arguments.");
3960         }
3961
3962         LOG_CONTEXT_PUSH_EXEC(context, params);
3963
3964         if (context->std_input == EXEC_INPUT_SOCKET ||
3965             context->std_output == EXEC_OUTPUT_SOCKET ||
3966             context->std_error == EXEC_OUTPUT_SOCKET) {
3967
3968                 if (params->n_socket_fds > 1)
3969                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
3970
3971                 if (params->n_socket_fds == 0)
3972                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
3973
3974                 socket_fd = params->fds[0];
3975                 n_storage_fds = n_socket_fds = 0;
3976         } else {
3977                 n_socket_fds = params->n_socket_fds;
3978                 n_storage_fds = params->n_storage_fds;
3979         }
3980         n_fds = n_socket_fds + n_storage_fds;
3981
3982         r = exec_context_named_iofds(context, params, named_iofds);
3983         if (r < 0)
3984                 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
3985
3986         rename_process_from_path(command->path);
3987
3988         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3989          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3990          * both of which will be demoted to SIG_DFL. */
3991         (void) default_signals(SIGNALS_CRASH_HANDLER,
3992                                SIGNALS_IGNORE);
3993
3994         if (context->ignore_sigpipe)
3995                 (void) ignore_signals(SIGPIPE);
3996
3997         r = reset_signal_mask();
3998         if (r < 0) {
3999                 *exit_status = EXIT_SIGNAL_MASK;
4000                 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4001         }
4002
4003         if (params->idle_pipe)
4004                 do_idle_pipe_dance(params->idle_pipe);
4005
4006         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4007          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4008          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4009          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4010
4011         log_forget_fds();
4012         log_set_open_when_needed(true);
4013         log_settle_target();
4014
4015         /* In case anything used libc syslog(), close this here, too */
4016         closelog();
4017
4018         r = collect_open_file_fds(context, params, &n_fds);
4019         if (r < 0) {
4020                 *exit_status = EXIT_FDS;
4021                 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4022         }
4023
4024         int keep_fds[n_fds + 3];
4025         memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4026         n_keep_fds = n_fds;
4027
4028         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4029         if (r < 0) {
4030                 *exit_status = EXIT_FDS;
4031                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4032         }
4033
4034 #if HAVE_LIBBPF
4035         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_outer_map_fd);
4036         if (r < 0) {
4037                 *exit_status = EXIT_FDS;
4038                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4039         }
4040 #endif
4041
4042         r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4043         if (r < 0) {
4044                 *exit_status = EXIT_FDS;
4045                 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4046         }
4047
4048         if (!context->same_pgrp &&
4049             setsid() < 0) {
4050                 *exit_status = EXIT_SETSID;
4051                 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4052         }
4053
4054         exec_context_tty_reset(context, params);
4055
4056         if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4057                 _cleanup_free_ char *cmdline = NULL;
4058
4059                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4060                 if (!cmdline) {
4061                         *exit_status = EXIT_MEMORY;
4062                         return log_oom();
4063                 }
4064
4065                 r = ask_for_confirmation(context, params, cmdline);
4066                 if (r != CONFIRM_EXECUTE) {
4067                         if (r == CONFIRM_PRETEND_SUCCESS) {
4068                                 *exit_status = EXIT_SUCCESS;
4069                                 return 0;
4070                         }
4071
4072                         *exit_status = EXIT_CONFIRM;
4073                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4074                                                     "Execution cancelled by the user");
4075                 }
4076         }
4077
4078         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4079          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4080          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4081          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4082          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4083         if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4084             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4085                 *exit_status = EXIT_MEMORY;
4086                 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4087         }
4088
4089         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4090                 _cleanup_strv_free_ char **suggested_paths = NULL;
4091
4092                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4093                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4094                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4095                         *exit_status = EXIT_USER;
4096                         return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4097                 }
4098
4099                 r = compile_suggested_paths(context, params, &suggested_paths);
4100                 if (r < 0) {
4101                         *exit_status = EXIT_MEMORY;
4102                         return log_oom();
4103                 }
4104
4105                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4106                 if (r < 0) {
4107                         *exit_status = EXIT_USER;
4108                         if (r == -EILSEQ)
4109                                 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4110                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4111                         return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4112                 }
4113
4114                 if (!uid_is_valid(uid)) {
4115                         *exit_status = EXIT_USER;
4116                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4117                 }
4118
4119                 if (!gid_is_valid(gid)) {
4120                         *exit_status = EXIT_USER;
4121                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4122                 }
4123
4124                 if (runtime->dynamic_creds->user)
4125                         username = runtime->dynamic_creds->user->name;
4126
4127         } else {
4128                 if (context->user) {
4129                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4130                         if (r < 0) {
4131                                 *exit_status = EXIT_USER;
4132                                 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4133                         }
4134                 }
4135
4136                 if (context->group) {
4137                         r = get_fixed_group(context->group, &groupname, &gid);
4138                         if (r < 0) {
4139                                 *exit_status = EXIT_GROUP;
4140                                 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4141                         }
4142                 }
4143         }
4144
4145         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4146         r = get_supplementary_groups(context, username, groupname, gid,
4147                                      &supplementary_gids, &ngids);
4148         if (r < 0) {
4149                 *exit_status = EXIT_GROUP;
4150                 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4151         }
4152
4153         r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4154         if (r < 0) {
4155                 *exit_status = EXIT_USER;
4156                 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4157         }
4158
4159         params->user_lookup_fd = safe_close(params->user_lookup_fd);
4160
4161         r = acquire_home(context, uid, &home, &home_buffer);
4162         if (r < 0) {
4163                 *exit_status = EXIT_CHDIR;
4164                 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4165         }
4166
4167         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4168         if (socket_fd >= 0)
4169                 (void) fd_nonblock(socket_fd, false);
4170
4171         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4172          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4173         if (params->cgroup_path) {
4174                 _cleanup_free_ char *p = NULL;
4175
4176                 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4177                 if (r < 0) {
4178                         *exit_status = EXIT_CGROUP;
4179                         return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4180                 }
4181
4182                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4183                 if (r == -EUCLEAN) {
4184                         *exit_status = EXIT_CGROUP;
4185                         return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4186                                                     "because the cgroup or one of its parents or "
4187                                                     "siblings is in the threaded mode: %m", p);
4188                 }
4189                 if (r < 0) {
4190                         *exit_status = EXIT_CGROUP;
4191                         return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4192                 }
4193         }
4194
4195         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4196                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4197                 if (r < 0) {
4198                         *exit_status = EXIT_NETWORK;
4199                         return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4200                 }
4201         }
4202
4203         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4204                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4205                 if (r < 0) {
4206                         *exit_status = EXIT_NAMESPACE;
4207                         return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4208                 }
4209         }
4210
4211         r = setup_input(context, params, socket_fd, named_iofds);
4212         if (r < 0) {
4213                 *exit_status = EXIT_STDIN;
4214                 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4215         }
4216
4217         r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4218         if (r < 0) {
4219                 *exit_status = EXIT_STDOUT;
4220                 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4221         }
4222
4223         r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4224         if (r < 0) {
4225                 *exit_status = EXIT_STDERR;
4226                 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4227         }
4228
4229         if (context->oom_score_adjust_set) {
4230                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4231                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4232                 r = set_oom_score_adjust(context->oom_score_adjust);
4233                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4234                         log_exec_debug_errno(context, params, r,
4235                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4236                 else if (r < 0) {
4237                         *exit_status = EXIT_OOM_ADJUST;
4238                         return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4239                 }
4240         }
4241
4242         if (context->coredump_filter_set) {
4243                 r = set_coredump_filter(context->coredump_filter);
4244                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4245                         log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4246                 else if (r < 0) {
4247                         *exit_status = EXIT_LIMITS;
4248                         return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4249                 }
4250         }
4251
4252         if (context->nice_set) {
4253                 r = setpriority_closest(context->nice);
4254                 if (r < 0) {
4255                         *exit_status = EXIT_NICE;
4256                         return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4257                 }
4258         }
4259
4260         if (context->cpu_sched_set) {
4261                 struct sched_param param = {
4262                         .sched_priority = context->cpu_sched_priority,
4263                 };
4264
4265                 r = sched_setscheduler(0,
4266                                        context->cpu_sched_policy |
4267                                        (context->cpu_sched_reset_on_fork ?
4268                                         SCHED_RESET_ON_FORK : 0),
4269                                        &param);
4270                 if (r < 0) {
4271                         *exit_status = EXIT_SETSCHEDULER;
4272                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4273                 }
4274         }
4275
4276         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4277                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4278                 const CPUSet *cpu_set;
4279
4280                 if (context->cpu_affinity_from_numa) {
4281                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4282                         if (r < 0) {
4283                                 *exit_status = EXIT_CPUAFFINITY;
4284                                 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4285                         }
4286
4287                         cpu_set = &converted_cpu_set;
4288                 } else
4289                         cpu_set = &context->cpu_set;
4290
4291                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4292                         *exit_status = EXIT_CPUAFFINITY;
4293                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4294                 }
4295         }
4296
4297         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4298                 r = apply_numa_policy(&context->numa_policy);
4299                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4300                         log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4301                 else if (r < 0) {
4302                         *exit_status = EXIT_NUMA_POLICY;
4303                         return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4304                 }
4305         }
4306
4307         if (context->ioprio_set)
4308                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4309                         *exit_status = EXIT_IOPRIO;
4310                         return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4311                 }
4312
4313         if (context->timer_slack_nsec != NSEC_INFINITY)
4314                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4315                         *exit_status = EXIT_TIMERSLACK;
4316                         return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4317                 }
4318
4319         if (context->personality != PERSONALITY_INVALID) {
4320                 r = safe_personality(context->personality);
4321                 if (r < 0) {
4322                         *exit_status = EXIT_PERSONALITY;
4323                         return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4324                 }
4325         }
4326
4327 #if ENABLE_UTMP
4328         if (context->utmp_id) {
4329                 const char *line = context->tty_path ?
4330                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4331                         NULL;
4332                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4333                                       line,
4334                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4335                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4336                                       USER_PROCESS,
4337                                       username);
4338         }
4339 #endif
4340
4341         if (uid_is_valid(uid)) {
4342                 r = chown_terminal(STDIN_FILENO, uid);
4343                 if (r < 0) {
4344                         *exit_status = EXIT_STDIN;
4345                         return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4346                 }
4347         }
4348
4349         if (params->cgroup_path) {
4350                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4351                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4352                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4353                  * touch a single hierarchy too. */
4354
4355                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4356                         _cleanup_free_ char *p = NULL;
4357
4358                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4359                         if (r < 0) {
4360                                 *exit_status = EXIT_CGROUP;
4361                                 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4362                         }
4363
4364                         r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4365                         if (r < 0) {
4366                                 *exit_status = EXIT_CGROUP;
4367                                 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4368                         }
4369                         if (r > 0) {
4370                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4371                                 if (r < 0) {
4372                                         *exit_status = EXIT_CGROUP;
4373                                         return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4374                                 }
4375                         }
4376                 }
4377
4378                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4379                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4380                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4381                                 if (r < 0) {
4382                                         *exit_status = EXIT_MEMORY;
4383                                         return log_oom();
4384                                 }
4385
4386                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4387                                 if (r < 0) {
4388                                         log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4389                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4390                                         memory_pressure_path = mfree(memory_pressure_path);
4391                                 }
4392                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4393                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4394                                 if (!memory_pressure_path) {
4395                                         *exit_status = EXIT_MEMORY;
4396                                         return log_oom();
4397                                 }
4398                         }
4399                 }
4400         }
4401
4402         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4403
4404         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4405                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4406                 if (r < 0)
4407                         return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4408         }
4409
4410         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4411                 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4412                 if (r < 0) {
4413                         *exit_status = EXIT_CREDENTIALS;
4414                         return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4415                 }
4416         }
4417
4418         r = build_environment(
4419                         context,
4420                         params,
4421                         cgroup_context,
4422                         n_fds,
4423                         home,
4424                         username,
4425                         shell,
4426                         journal_stream_dev,
4427                         journal_stream_ino,
4428                         memory_pressure_path,
4429                         &our_env);
4430         if (r < 0) {
4431                 *exit_status = EXIT_MEMORY;
4432                 return log_oom();
4433         }
4434
4435         r = build_pass_environment(context, &pass_env);
4436         if (r < 0) {
4437                 *exit_status = EXIT_MEMORY;
4438                 return log_oom();
4439         }
4440
4441         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4442          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4443          * not specify PATH but the unit has ExecSearchPath. */
4444         if (!strv_isempty(context->exec_search_path)) {
4445                 _cleanup_free_ char *joined = NULL;
4446
4447                 joined = strv_join(context->exec_search_path, ":");
4448                 if (!joined) {
4449                         *exit_status = EXIT_MEMORY;
4450                         return log_oom();
4451                 }
4452
4453                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4454                 if (r < 0) {
4455                         *exit_status = EXIT_MEMORY;
4456                         return log_oom();
4457                 }
4458         }
4459
4460         accum_env = strv_env_merge(params->environment,
4461                                    our_env,
4462                                    joined_exec_search_path,
4463                                    pass_env,
4464                                    context->environment,
4465                                    params->files_env);
4466         if (!accum_env) {
4467                 *exit_status = EXIT_MEMORY;
4468                 return log_oom();
4469         }
4470         accum_env = strv_env_clean(accum_env);
4471
4472         (void) umask(context->umask);
4473
4474         r = setup_keyring(context, params, uid, gid);
4475         if (r < 0) {
4476                 *exit_status = EXIT_KEYRING;
4477                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4478         }
4479
4480         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4481          * from it. */
4482         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4483
4484         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4485          * for it, and the kernel doesn't actually support ambient caps. */
4486         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4487
4488         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4489          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4490          * desired. */
4491         if (needs_ambient_hack)
4492                 needs_setuid = false;
4493         else
4494                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4495
4496         uint64_t capability_ambient_set = context->capability_ambient_set;
4497
4498         if (needs_sandboxing) {
4499                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4500                  * /sys being present. The actual MAC context application will happen later, as late as
4501                  * possible, to avoid impacting our own code paths. */
4502
4503 #if HAVE_SELINUX
4504                 use_selinux = mac_selinux_use();
4505 #endif
4506 #if ENABLE_SMACK
4507                 use_smack = mac_smack_use();
4508 #endif
4509 #if HAVE_APPARMOR
4510                 use_apparmor = mac_apparmor_use();
4511 #endif
4512         }
4513
4514         if (needs_sandboxing) {
4515                 int which_failed;
4516
4517                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4518                  * is set here. (See below.) */
4519
4520                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4521                 if (r < 0) {
4522                         *exit_status = EXIT_LIMITS;
4523                         return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4524                 }
4525         }
4526
4527         if (needs_setuid && context->pam_name && username) {
4528                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4529                  * wins here. (See above.) */
4530
4531                 /* All fds passed in the fds array will be closed in the pam child process. */
4532                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds);
4533                 if (r < 0) {
4534                         *exit_status = EXIT_PAM;
4535                         return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4536                 }
4537
4538                 if (ambient_capabilities_supported()) {
4539                         uint64_t ambient_after_pam;
4540
4541                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4542                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4543                         r = capability_get_ambient(&ambient_after_pam);
4544                         if (r < 0) {
4545                                 *exit_status = EXIT_CAPABILITIES;
4546                                 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4547                         }
4548
4549                         capability_ambient_set |= ambient_after_pam;
4550                 }
4551
4552                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4553                 if (ngids_after_pam < 0) {
4554                         *exit_status = EXIT_GROUP;
4555                         return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4556                 }
4557         }
4558
4559         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4560                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4561                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4562                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4563
4564                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4565                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4566                  * the actual requested operations fail (or silently continue). */
4567                 if (r < 0 && context->private_users) {
4568                         *exit_status = EXIT_USER;
4569                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4570                 }
4571                 if (r < 0)
4572                         log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4573                 else
4574                         userns_set_up = true;
4575         }
4576
4577         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4578
4579                 /* Try to enable network namespacing if network namespacing is available and we have
4580                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4581                  * new network namespace. And if we don't have that, then we could only create a network
4582                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4583                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4584                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4585                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4586                                 log_exec_notice_errno(context, params, r,
4587                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4588                         else if (r < 0) {
4589                                 *exit_status = EXIT_NETWORK;
4590                                 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4591                         }
4592                 } else if (context->network_namespace_path) {
4593                         *exit_status = EXIT_NETWORK;
4594                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4595                                                     "NetworkNamespacePath= is not supported, refusing.");
4596                 } else
4597                         log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4598         }
4599
4600         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4601
4602                 if (ns_type_supported(NAMESPACE_IPC)) {
4603                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4604                         if (r == -EPERM)
4605                                 log_exec_warning_errno(context, params, r,
4606                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4607                         else if (r < 0) {
4608                                 *exit_status = EXIT_NAMESPACE;
4609                                 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4610                         }
4611                 } else if (context->ipc_namespace_path) {
4612                         *exit_status = EXIT_NAMESPACE;
4613                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4614                                                     "IPCNamespacePath= is not supported, refusing.");
4615                 } else
4616                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4617         }
4618
4619         if (needs_mount_namespace) {
4620                 _cleanup_free_ char *error_path = NULL;
4621
4622                 r = apply_mount_namespace(command->flags, context, params, runtime, memory_pressure_path, &error_path);
4623                 if (r < 0) {
4624                         *exit_status = EXIT_NAMESPACE;
4625                         return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4626                                                     error_path ? ": " : "", strempty(error_path));
4627                 }
4628         }
4629
4630         if (needs_sandboxing) {
4631                 r = apply_protect_hostname(context, params, exit_status);
4632                 if (r < 0)
4633                         return r;
4634         }
4635
4636         if (context->memory_ksm >= 0)
4637                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4638                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4639                                 log_exec_debug_errno(context,
4640                                                      params,
4641                                                      errno,
4642                                                      "KSM support not available, ignoring.");
4643                         else {
4644                                 *exit_status = EXIT_KSM;
4645                                 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4646                         }
4647                 }
4648
4649         /* Drop groups as early as possible.
4650          * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4651          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4652         if (needs_setuid) {
4653                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4654                 int ngids_to_enforce = 0;
4655
4656                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4657                                                    ngids,
4658                                                    gids_after_pam,
4659                                                    ngids_after_pam,
4660                                                    &gids_to_enforce);
4661                 if (ngids_to_enforce < 0) {
4662                         *exit_status = EXIT_GROUP;
4663                         return log_exec_error_errno(context, params,
4664                                                     ngids_to_enforce,
4665                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4666                 }
4667
4668                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4669                 if (r < 0) {
4670                         *exit_status = EXIT_GROUP;
4671                         return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4672                 }
4673         }
4674
4675         /* If the user namespace was not set up above, try to do it now.
4676          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4677          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4678          * case of mount namespaces being less privileged when the mount point list is copied from a
4679          * different user namespace). */
4680
4681         if (needs_sandboxing && context->private_users && !userns_set_up) {
4682                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4683                 if (r < 0) {
4684                         *exit_status = EXIT_USER;
4685                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4686                 }
4687         }
4688
4689         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4690          * shall execute. */
4691
4692         _cleanup_free_ char *executable = NULL;
4693         _cleanup_close_ int executable_fd = -EBADF;
4694         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4695         if (r < 0) {
4696                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4697                         log_exec_struct_errno(context, params, LOG_INFO, r,
4698                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4699                                               LOG_EXEC_INVOCATION_ID(params),
4700                                               LOG_EXEC_MESSAGE(params,
4701                                                                "Executable %s missing, skipping: %m",
4702                                                                command->path),
4703                                               "EXECUTABLE=%s", command->path);
4704                         *exit_status = EXIT_SUCCESS;
4705                         return 0;
4706                 }
4707
4708                 *exit_status = EXIT_EXEC;
4709                 return log_exec_struct_errno(context, params, LOG_INFO, r,
4710                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4711                                              LOG_EXEC_INVOCATION_ID(params),
4712                                              LOG_EXEC_MESSAGE(params,
4713                                                               "Failed to locate executable %s: %m",
4714                                                               command->path),
4715                                              "EXECUTABLE=%s", command->path);
4716         }
4717
4718         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4719         if (r < 0) {
4720                 *exit_status = EXIT_FDS;
4721                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4722         }
4723
4724 #if HAVE_SELINUX
4725         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4726                 int fd = -EBADF;
4727
4728                 if (socket_fd >= 0)
4729                         fd = socket_fd;
4730                 else if (params->n_socket_fds == 1)
4731                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4732                          * use context from that fd to compute the label. */
4733                         fd = params->fds[0];
4734
4735                 if (fd >= 0) {
4736                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4737                         if (r < 0) {
4738                                 if (!context->selinux_context_ignore) {
4739                                         *exit_status = EXIT_SELINUX_CONTEXT;
4740                                         return log_exec_error_errno(context,
4741                                                                     params,
4742                                                                     r,
4743                                                                     "Failed to determine SELinux context: %m");
4744                                 }
4745                                 log_exec_debug_errno(context,
4746                                                      params,
4747                                                      r,
4748                                                      "Failed to determine SELinux context, ignoring: %m");
4749                         }
4750                 }
4751         }
4752 #endif
4753
4754         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4755          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4756          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4757          * execve(). But first, close the remaining sockets in the context objects. */
4758
4759         exec_runtime_close(runtime);
4760         exec_params_close(params);
4761
4762         r = close_all_fds(keep_fds, n_keep_fds);
4763         if (r >= 0)
4764                 r = shift_fds(params->fds, n_fds);
4765         if (r >= 0)
4766                 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4767         if (r < 0) {
4768                 *exit_status = EXIT_FDS;
4769                 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4770         }
4771
4772         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4773          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4774          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4775          * came this far. */
4776
4777         secure_bits = context->secure_bits;
4778
4779         if (needs_sandboxing) {
4780                 uint64_t bset;
4781
4782                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4783                  * (Note this is placed after the general resource limit initialization, see above, in order
4784                  * to take precedence.) */
4785                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4786                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4787                                 *exit_status = EXIT_LIMITS;
4788                                 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4789                         }
4790                 }
4791
4792 #if ENABLE_SMACK
4793                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4794                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4795                 if (use_smack && context->smack_process_label) {
4796                         r = setup_smack(params, context, executable_fd);
4797                         if (r < 0 && !context->smack_process_label_ignore) {
4798                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4799                                 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4800                         }
4801                 }
4802 #endif
4803
4804                 bset = context->capability_bounding_set;
4805                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4806                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4807                  * instead of us doing that */
4808                 if (needs_ambient_hack)
4809                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4810                                 (UINT64_C(1) << CAP_SETUID) |
4811                                 (UINT64_C(1) << CAP_SETGID);
4812
4813 #if HAVE_SECCOMP
4814                 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4815                  * keep the needed privileges to apply it even if we're not root. */
4816                 if (needs_setuid &&
4817                     uid_is_valid(uid) &&
4818                     context_has_seccomp(context) &&
4819                     seccomp_allows_drop_privileges(context)) {
4820                         keep_seccomp_privileges = true;
4821
4822                         if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4823                                 *exit_status = EXIT_USER;
4824                                 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4825                         }
4826
4827                         /* Save the current bounding set so we can restore it after applying the seccomp
4828                          * filter */
4829                         saved_bset = bset;
4830                         bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4831                                 (UINT64_C(1) << CAP_SETPCAP);
4832                 }
4833 #endif
4834
4835                 if (!cap_test_all(bset)) {
4836                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4837                         if (r < 0) {
4838                                 *exit_status = EXIT_CAPABILITIES;
4839                                 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4840                         }
4841                 }
4842
4843                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4844                  * keep-caps set.
4845                  *
4846                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4847                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4848                  * the ambient capabilities can be raised as they are present in the permitted and
4849                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4850                  * without changing the user, so we also set the ambient capabilities here.
4851                  *
4852                  * The requested ambient capabilities are raised in the inheritable set if the second
4853                  * argument is true. */
4854                 if (!needs_ambient_hack) {
4855                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4856                         if (r < 0) {
4857                                 *exit_status = EXIT_CAPABILITIES;
4858                                 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4859                         }
4860                 }
4861         }
4862
4863         /* chroot to root directory first, before we lose the ability to chroot */
4864         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4865         if (r < 0)
4866                 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4867
4868         if (needs_setuid) {
4869                 if (uid_is_valid(uid)) {
4870                         r = enforce_user(context, uid, capability_ambient_set);
4871                         if (r < 0) {
4872                                 *exit_status = EXIT_USER;
4873                                 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4874                         }
4875
4876                         if (keep_seccomp_privileges) {
4877                                 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4878                                         r = drop_capability(CAP_SETUID);
4879                                         if (r < 0) {
4880                                                 *exit_status = EXIT_USER;
4881                                                 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4882                                         }
4883                                 }
4884
4885                                 r = keep_capability(CAP_SYS_ADMIN);
4886                                 if (r < 0) {
4887                                         *exit_status = EXIT_USER;
4888                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4889                                 }
4890
4891                                 r = keep_capability(CAP_SETPCAP);
4892                                 if (r < 0) {
4893                                         *exit_status = EXIT_USER;
4894                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4895                                 }
4896                         }
4897
4898                         if (!needs_ambient_hack && capability_ambient_set != 0) {
4899
4900                                 /* Raise the ambient capabilities after user change. */
4901                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4902                                 if (r < 0) {
4903                                         *exit_status = EXIT_CAPABILITIES;
4904                                         return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4905                                 }
4906                         }
4907                 }
4908         }
4909
4910         /* Apply working directory here, because the working directory might be on NFS and only the user running
4911          * this service might have the correct privilege to change to the working directory */
4912         r = apply_working_directory(context, params, runtime, home, exit_status);
4913         if (r < 0)
4914                 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
4915
4916         if (needs_sandboxing) {
4917                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4918                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4919                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4920                  * are restricted. */
4921
4922 #if HAVE_SELINUX
4923                 if (use_selinux) {
4924                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4925
4926                         if (exec_context) {
4927                                 r = setexeccon(exec_context);
4928                                 if (r < 0) {
4929                                         if (!context->selinux_context_ignore) {
4930                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4931                                                 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
4932                                         }
4933                                         log_exec_debug_errno(context,
4934                                                              params,
4935                                                              r,
4936                                                              "Failed to change SELinux context to %s, ignoring: %m",
4937                                                              exec_context);
4938                                 }
4939                         }
4940                 }
4941 #endif
4942
4943 #if HAVE_APPARMOR
4944                 if (use_apparmor && context->apparmor_profile) {
4945                         r = aa_change_onexec(context->apparmor_profile);
4946                         if (r < 0 && !context->apparmor_profile_ignore) {
4947                                 *exit_status = EXIT_APPARMOR_PROFILE;
4948                                 return log_exec_error_errno(context,
4949                                                             params,
4950                                                             errno,
4951                                                             "Failed to prepare AppArmor profile change to %s: %m",
4952                                                             context->apparmor_profile);
4953                         }
4954                 }
4955 #endif
4956
4957                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4958                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4959                  * requires CAP_SETPCAP. */
4960                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4961                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4962                          * effective set here.
4963                          *
4964                          * The effective set is overwritten during execve() with the following values:
4965                          *
4966                          * - ambient set (for non-root processes)
4967                          *
4968                          * - (inheritable | bounding) set (for root processes)
4969                          *
4970                          * Hence there is no security impact to raise it in the effective set before execve
4971                          */
4972                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4973                         if (r < 0) {
4974                                 *exit_status = EXIT_CAPABILITIES;
4975                                 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4976                         }
4977                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4978                                 *exit_status = EXIT_SECUREBITS;
4979                                 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
4980                         }
4981                 }
4982
4983                 if (context_has_no_new_privileges(context))
4984                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4985                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4986                                 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
4987                         }
4988
4989 #if HAVE_SECCOMP
4990                 r = apply_address_families(context, params);
4991                 if (r < 0) {
4992                         *exit_status = EXIT_ADDRESS_FAMILIES;
4993                         return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
4994                 }
4995
4996                 r = apply_memory_deny_write_execute(context, params);
4997                 if (r < 0) {
4998                         *exit_status = EXIT_SECCOMP;
4999                         return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5000                 }
5001
5002                 r = apply_restrict_realtime(context, params);
5003                 if (r < 0) {
5004                         *exit_status = EXIT_SECCOMP;
5005                         return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5006                 }
5007
5008                 r = apply_restrict_suid_sgid(context, params);
5009                 if (r < 0) {
5010                         *exit_status = EXIT_SECCOMP;
5011                         return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5012                 }
5013
5014                 r = apply_restrict_namespaces(context, params);
5015                 if (r < 0) {
5016                         *exit_status = EXIT_SECCOMP;
5017                         return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5018                 }
5019
5020                 r = apply_protect_sysctl(context, params);
5021                 if (r < 0) {
5022                         *exit_status = EXIT_SECCOMP;
5023                         return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5024                 }
5025
5026                 r = apply_protect_kernel_modules(context, params);
5027                 if (r < 0) {
5028                         *exit_status = EXIT_SECCOMP;
5029                         return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5030                 }
5031
5032                 r = apply_protect_kernel_logs(context, params);
5033                 if (r < 0) {
5034                         *exit_status = EXIT_SECCOMP;
5035                         return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5036                 }
5037
5038                 r = apply_protect_clock(context, params);
5039                 if (r < 0) {
5040                         *exit_status = EXIT_SECCOMP;
5041                         return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5042                 }
5043
5044                 r = apply_private_devices(context, params);
5045                 if (r < 0) {
5046                         *exit_status = EXIT_SECCOMP;
5047                         return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5048                 }
5049
5050                 r = apply_syscall_archs(context, params);
5051                 if (r < 0) {
5052                         *exit_status = EXIT_SECCOMP;
5053                         return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5054                 }
5055
5056                 r = apply_lock_personality(context, params);
5057                 if (r < 0) {
5058                         *exit_status = EXIT_SECCOMP;
5059                         return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5060                 }
5061
5062                 r = apply_syscall_log(context, params);
5063                 if (r < 0) {
5064                         *exit_status = EXIT_SECCOMP;
5065                         return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5066                 }
5067 #endif
5068
5069 #if HAVE_LIBBPF
5070                 r = apply_restrict_filesystems(context, params);
5071                 if (r < 0) {
5072                         *exit_status = EXIT_BPF;
5073                         return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5074                 }
5075 #endif
5076
5077 #if HAVE_SECCOMP
5078                 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5079                  * by the filter as little as possible. */
5080                 r = apply_syscall_filter(context, params, needs_ambient_hack);
5081                 if (r < 0) {
5082                         *exit_status = EXIT_SECCOMP;
5083                         return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5084                 }
5085
5086                 if (keep_seccomp_privileges) {
5087                         /* Restore the capability bounding set with what's expected from the service + the
5088                          * ambient capabilities hack */
5089                         if (!cap_test_all(saved_bset)) {
5090                                 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5091                                 if (r < 0) {
5092                                         *exit_status = EXIT_CAPABILITIES;
5093                                         return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5094                                 }
5095                         }
5096
5097                         /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5098                          * applications that use it. */
5099                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5100                                 r = drop_capability(CAP_SYS_ADMIN);
5101                                 if (r < 0) {
5102                                         *exit_status = EXIT_USER;
5103                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5104                                 }
5105                         }
5106
5107                         /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5108                          * applications that use it. */
5109                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5110                                 r = drop_capability(CAP_SETPCAP);
5111                                 if (r < 0) {
5112                                         *exit_status = EXIT_USER;
5113                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5114                                 }
5115                         }
5116
5117                         if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5118                                 *exit_status = EXIT_USER;
5119                                 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5120                         }
5121                 }
5122 #endif
5123
5124         }
5125
5126         if (!strv_isempty(context->unset_environment)) {
5127                 char **ee = NULL;
5128
5129                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5130                 if (!ee) {
5131                         *exit_status = EXIT_MEMORY;
5132                         return log_oom();
5133                 }
5134
5135                 strv_free_and_replace(accum_env, ee);
5136         }
5137
5138         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5139                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5140
5141                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5142                 if (r < 0) {
5143                         *exit_status = EXIT_MEMORY;
5144                         return log_exec_error_errno(context,
5145                                                     params,
5146                                                     r,
5147                                                     "Failed to replace environment variables: %m");
5148                 }
5149                 final_argv = replaced_argv;
5150
5151                 if (!strv_isempty(unset_variables)) {
5152                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5153                         log_exec_warning(context,
5154                                          params,
5155                                          "Referenced but unset environment variable evaluates to an empty string: %s",
5156                                          strna(ju));
5157                 }
5158
5159                 if (!strv_isempty(bad_variables)) {
5160                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5161                         log_exec_warning(context,
5162                                          params,
5163                                          "Invalid environment variable name evaluates to an empty string: %s",
5164                                          strna(jb));
5165                 }
5166         } else
5167                 final_argv = command->argv;
5168
5169         log_command_line(context, params, "Executing", executable, final_argv);
5170
5171         if (params->exec_fd >= 0) {
5172                 uint8_t hot = 1;
5173
5174                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5175                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5176
5177                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5178                         *exit_status = EXIT_EXEC;
5179                         return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5180                 }
5181         }
5182
5183         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5184
5185         if (params->exec_fd >= 0) {
5186                 uint8_t hot = 0;
5187
5188                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5189                  * that POLLHUP on it no longer means execve() succeeded. */
5190
5191                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5192                         *exit_status = EXIT_EXEC;
5193                         return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5194                 }
5195         }
5196
5197         *exit_status = EXIT_EXEC;
5198         return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5199 }