src/core/exec-invoke.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/eventfd.h>
   4 #include <sys/ioctl.h>
   5 #include <sys/mount.h>
   6 #include <sys/prctl.h>
   7
   8 #if HAVE_PAM
   9 #include <security/pam_appl.h>
  10 #include <security/pam_misc.h>
  11 #endif
  12
  13 #if HAVE_APPARMOR
  14 #include <sys/apparmor.h>
  15 #endif
  16
  17 #include "sd-messages.h"
  18
  19 #if HAVE_APPARMOR
  20 #include "apparmor-util.h"
  21 #endif
  22 #include "argv-util.h"
  23 #include "barrier.h"
  24 #include "bpf-dlopen.h"
  25 #include "bpf-restrict-fs.h"
  26 #include "btrfs-util.h"
  27 #include "capability-util.h"
  28 #include "cgroup-setup.h"
  29 #include "chase.h"
  30 #include "chattr-util.h"
  31 #include "chown-recursive.h"
  32 #include "copy.h"
  33 #include "data-fd-util.h"
  34 #include "env-util.h"
  35 #include "escape.h"
  36 #include "exec-credential.h"
  37 #include "exec-invoke.h"
  38 #include "execute.h"
  39 #include "exit-status.h"
  40 #include "fd-util.h"
  41 #include "hexdecoct.h"
  42 #include "io-util.h"
  43 #include "iovec-util.h"
  44 #include "missing_ioprio.h"
  45 #include "missing_prctl.h"
  46 #include "missing_securebits.h"
  47 #include "missing_syscall.h"
  48 #include "mkdir-label.h"
  49 #include "proc-cmdline.h"
  50 #include "process-util.h"
  51 #include "psi-util.h"
  52 #include "rlimit-util.h"
  53 #include "seccomp-util.h"
  54 #include "selinux-util.h"
  55 #include "signal-util.h"
  56 #include "smack-util.h"
  57 #include "socket-util.h"
  58 #include "string-table.h"
  59 #include "strv.h"
  60 #include "terminal-util.h"
  61 #include "utmp-wtmp.h"
  62 #include "vpick.h"
  63
/* Idle timeouts of 5s and 1s, expressed in microseconds.
 * NOTE(review): their consumers are not in this part of the file — presumably used when waiting on
 * the manager's idle pipe; confirm against the call sites. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested on a best-effort basis via fd_inc_sndbuf() for the stream socket
 * that connect_logger_as() connects to the journal. */
#define SNDBUF_SIZE (8*1024*1024)
  68
  69 static int flag_fds(
  70                 const int fds[],
  71                 size_t n_socket_fds,
  72                 size_t n_fds,
  73                 bool nonblock) {
  74
  75         int r;
  76
  77         assert(fds || n_fds == 0);
  78
  79         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
  80          * O_NONBLOCK only applies to socket activation though. */
  81
  82         for (size_t i = 0; i < n_fds; i++) {
  83
  84                 if (i < n_socket_fds) {
  85                         r = fd_nonblock(fds[i], nonblock);
  86                         if (r < 0)
  87                                 return r;
  88                 }
  89
  90                 /* We unconditionally drop FD_CLOEXEC from the fds,
  91                  * since after all we want to pass these fds to our
  92                  * children */
  93
  94                 r = fd_cloexec(fds[i], false);
  95                 if (r < 0)
  96                         return r;
  97         }
  98
  99         return 0;
 100 }
 101
 102 static bool is_terminal_input(ExecInput i) {
 103         return IN_SET(i,
 104                       EXEC_INPUT_TTY,
 105                       EXEC_INPUT_TTY_FORCE,
 106                       EXEC_INPUT_TTY_FAIL);
 107 }
 108
 109 static bool is_terminal_output(ExecOutput o) {
 110         return IN_SET(o,
 111                       EXEC_OUTPUT_TTY,
 112                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 113                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 114 }
 115
 116 static bool is_kmsg_output(ExecOutput o) {
 117         return IN_SET(o,
 118                       EXEC_OUTPUT_KMSG,
 119                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 120 }
 121
 122 static bool exec_context_needs_term(const ExecContext *c) {
 123         assert(c);
 124
 125         /* Return true if the execution context suggests we should set $TERM to something useful. */
 126
 127         if (is_terminal_input(c->std_input))
 128                 return true;
 129
 130         if (is_terminal_output(c->std_output))
 131                 return true;
 132
 133         if (is_terminal_output(c->std_error))
 134                 return true;
 135
 136         return !!c->tty_path;
 137 }
 138
 139 static int open_null_as(int flags, int nfd) {
 140         int fd;
 141
 142         assert(nfd >= 0);
 143
 144         fd = open("/dev/null", flags|O_NOCTTY);
 145         if (fd < 0)
 146                 return -errno;
 147
 148         return move_fd(fd, nfd, false);
 149 }
 150
 151 static int connect_journal_socket(
 152                 int fd,
 153                 const char *log_namespace,
 154                 uid_t uid,
 155                 gid_t gid) {
 156
 157         uid_t olduid = UID_INVALID;
 158         gid_t oldgid = GID_INVALID;
 159         const char *j;
 160         int r;
 161
 162         j = log_namespace ?
 163                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 164                 "/run/systemd/journal/stdout";
 165
 166         if (gid_is_valid(gid)) {
 167                 oldgid = getgid();
 168
 169                 if (setegid(gid) < 0)
 170                         return -errno;
 171         }
 172
 173         if (uid_is_valid(uid)) {
 174                 olduid = getuid();
 175
 176                 if (seteuid(uid) < 0) {
 177                         r = -errno;
 178                         goto restore_gid;
 179                 }
 180         }
 181
 182         r = connect_unix_path(fd, AT_FDCWD, j);
 183
 184         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 185            an LSM interferes. */
 186
 187         if (uid_is_valid(uid))
 188                 (void) seteuid(olduid);
 189
 190  restore_gid:
 191         if (gid_is_valid(gid))
 192                 (void) setegid(oldgid);
 193
 194         return r;
 195 }
 196
 197 static int connect_logger_as(
 198                 const ExecContext *context,
 199                 const ExecParameters *params,
 200                 ExecOutput output,
 201                 const char *ident,
 202                 int nfd,
 203                 uid_t uid,
 204                 gid_t gid) {
 205
 206         _cleanup_close_ int fd = -EBADF;
 207         int r;
 208
 209         assert(context);
 210         assert(params);
 211         assert(output < _EXEC_OUTPUT_MAX);
 212         assert(ident);
 213         assert(nfd >= 0);
 214
 215         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 216         if (fd < 0)
 217                 return -errno;
 218
 219         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 220         if (r < 0)
 221                 return r;
 222
 223         if (shutdown(fd, SHUT_RD) < 0)
 224                 return -errno;
 225
 226         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 227
 228         if (dprintf(fd,
 229                 "%s\n"
 230                 "%s\n"
 231                 "%i\n"
 232                 "%i\n"
 233                 "%i\n"
 234                 "%i\n"
 235                 "%i\n",
 236                 context->syslog_identifier ?: ident,
 237                 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
 238                 context->syslog_priority,
 239                 !!context->syslog_level_prefix,
 240                 false,
 241                 is_kmsg_output(output),
 242                 is_terminal_output(output)) < 0)
 243                 return -errno;
 244
 245         return move_fd(TAKE_FD(fd), nfd, false);
 246 }
 247
 248 static int open_terminal_as(const char *path, int flags, int nfd) {
 249         int fd;
 250
 251         assert(path);
 252         assert(nfd >= 0);
 253
 254         fd = open_terminal(path, flags | O_NOCTTY);
 255         if (fd < 0)
 256                 return fd;
 257
 258         return move_fd(fd, nfd, false);
 259 }
 260
 261 static int acquire_path(const char *path, int flags, mode_t mode) {
 262         _cleanup_close_ int fd = -EBADF;
 263         int r;
 264
 265         assert(path);
 266
 267         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 268                 flags |= O_CREAT;
 269
 270         fd = open(path, flags|O_NOCTTY, mode);
 271         if (fd >= 0)
 272                 return TAKE_FD(fd);
 273
 274         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 275                 return -errno;
 276
 277         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 278
 279         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 280         if (fd < 0)
 281                 return -errno;
 282
 283         r = connect_unix_path(fd, AT_FDCWD, path);
 284         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 285                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 286                  * wasn't an AF_UNIX socket after all */
 287                 return -ENXIO;
 288         if (r < 0)
 289                 return r;
 290
 291         if ((flags & O_ACCMODE) == O_RDONLY)
 292                 r = shutdown(fd, SHUT_WR);
 293         else if ((flags & O_ACCMODE) == O_WRONLY)
 294                 r = shutdown(fd, SHUT_RD);
 295         else
 296                 r = 0;
 297         if (r < 0)
 298                 return -errno;
 299
 300         return TAKE_FD(fd);
 301 }
 302
 303 static int fixup_input(
 304                 const ExecContext *context,
 305                 int socket_fd,
 306                 bool apply_tty_stdin) {
 307
 308         ExecInput std_input;
 309
 310         assert(context);
 311
 312         std_input = context->std_input;
 313
 314         if (is_terminal_input(std_input) && !apply_tty_stdin)
 315                 return EXEC_INPUT_NULL;
 316
 317         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 318                 return EXEC_INPUT_NULL;
 319
 320         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 321                 return EXEC_INPUT_NULL;
 322
 323         return std_input;
 324 }
 325
 326 static int fixup_output(ExecOutput output, int socket_fd) {
 327
 328         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 329                 return EXEC_OUTPUT_INHERIT;
 330
 331         return output;
 332 }
 333
 334 static int setup_input(
 335                 const ExecContext *context,
 336                 const ExecParameters *params,
 337                 int socket_fd,
 338                 const int named_iofds[static 3]) {
 339
 340         ExecInput i;
 341         int r;
 342
 343         assert(context);
 344         assert(params);
 345         assert(named_iofds);
 346
 347         if (params->stdin_fd >= 0) {
 348                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 349                         return -errno;
 350
 351                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 352                 if (isatty(STDIN_FILENO)) {
 353                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 354
 355                         if (context->tty_reset)
 356                                 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
 357
 358                         (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
 359                 }
 360
 361                 return STDIN_FILENO;
 362         }
 363
 364         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 365
 366         switch (i) {
 367
 368         case EXEC_INPUT_NULL:
 369                 return open_null_as(O_RDONLY, STDIN_FILENO);
 370
 371         case EXEC_INPUT_TTY:
 372         case EXEC_INPUT_TTY_FORCE:
 373         case EXEC_INPUT_TTY_FAIL: {
 374                 _cleanup_close_ int tty_fd = -EBADF;
 375                 const char *tty_path;
 376
 377                 tty_path = ASSERT_PTR(exec_context_tty_path(context));
 378
 379                 tty_fd = acquire_terminal(tty_path,
 380                                           i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 381                                           i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 382                                                                       ACQUIRE_TERMINAL_WAIT,
 383                                           USEC_INFINITY);
 384                 if (tty_fd < 0)
 385                         return tty_fd;
 386
 387                 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
 388                 if (r < 0)
 389                         return r;
 390
 391                 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
 392                 if (r < 0)
 393                         return r;
 394
 395                 TAKE_FD(tty_fd);
 396                 return r;
 397         }
 398
 399         case EXEC_INPUT_SOCKET:
 400                 assert(socket_fd >= 0);
 401
 402                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 403
 404         case EXEC_INPUT_NAMED_FD:
 405                 assert(named_iofds[STDIN_FILENO] >= 0);
 406
 407                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 408                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 409
 410         case EXEC_INPUT_DATA: {
 411                 int fd;
 412
 413                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 414                 if (fd < 0)
 415                         return fd;
 416
 417                 return move_fd(fd, STDIN_FILENO, false);
 418         }
 419
 420         case EXEC_INPUT_FILE: {
 421                 bool rw;
 422                 int fd;
 423
 424                 assert(context->stdio_file[STDIN_FILENO]);
 425
 426                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 427                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 428
 429                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 430                 if (fd < 0)
 431                         return fd;
 432
 433                 return move_fd(fd, STDIN_FILENO, false);
 434         }
 435
 436         default:
 437                 assert_not_reached();
 438         }
 439 }
 440
 441 static bool can_inherit_stderr_from_stdout(
 442                 const ExecContext *context,
 443                 ExecOutput o,
 444                 ExecOutput e) {
 445
 446         assert(context);
 447
 448         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 449          * stderr fd */
 450
 451         if (e == EXEC_OUTPUT_INHERIT)
 452                 return true;
 453         if (e != o)
 454                 return false;
 455
 456         if (e == EXEC_OUTPUT_NAMED_FD)
 457                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 458
 459         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 460                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 461
 462         return true;
 463 }
 464
/* Sets up the output fd 'fileno' (STDOUT_FILENO or STDERR_FILENO) for the process about to be
 * executed.
 *
 * This relies on the fds set up before it: several branches dup() STDIN_FILENO, and the stderr
 * branch may dup() STDOUT_FILENO, so stdin (and for stderr also stdout) must already be in place.
 *
 * context/params:     execution settings; params->stdout_fd/stderr_fd, if set, override the
 *                     configured setting for the respective fd
 * socket_fd:          activation socket, or negative if there is none
 * named_iofds:        fds used for the EXEC_OUTPUT_NAMED_FD setting, indexed by fd number
 * ident:              fallback log identifier passed to connect_logger_as()
 * uid/gid:            passed to connect_logger_as() for connecting to the journal
 * journal_stream_dev,
 * journal_stream_ino: updated with the device/inode of the fd if it got connected to the journal
 *
 * Returns a negative errno-style value on failure. */
static int setup_output(
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds win over whatever is configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* The effective settings, i.e. after downgrading whatever cannot be honoured */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid() != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return RET_NERRNO(dup2(STDOUT_FILENO, fileno));

                /* From here on stderr is set up on its own, by the generic switch below */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share its fd instead of opening the tty a second time */
                if (is_terminal_input(i))
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: warn, and fall back to /dev/null for this fd */
                        log_exec_warning_errno(context,
                                               params,
                                               r,
                                               "Failed to connect %s to the journal socket, ignoring: %m",
                                               fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return RET_NERRNO(dup2(socket_fd, fileno));

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Clearing O_NONBLOCK is best effort only */
                (void) fd_nonblock(named_iofds[fileno], false);
                return RET_NERRNO(dup2(named_iofds[fileno], fileno));

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND:
        case EXEC_OUTPUT_FILE_TRUNCATE: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* Same file as stdin? Then reuse the stdin fd rather than opening the file again.
                 *
                 * NOTE(review): setup_input() opens the file O_RDWR only if the output setting is
                 * EXEC_OUTPUT_FILE, while this branch is also taken for the append and truncate
                 * variants, which then end up with a dup of a read-only fd and without
                 * O_APPEND/O_TRUNC applied — confirm whether that is intended. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return RET_NERRNO(dup2(STDIN_FILENO, fileno));

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;
                else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
                        flags |= O_TRUNC;

                /* acquire_path() adds O_CREAT for write access; the mode honours the configured umask */
                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached();
        }
}
 627
 628 static int chown_terminal(int fd, uid_t uid) {
 629         int r;
 630
 631         assert(fd >= 0);
 632
 633         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 634         if (!isatty_safe(fd))
 635                 return 0;
 636
 637         /* This might fail. What matters are the results. */
 638         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 639         if (r < 0)
 640                 return r;
 641
 642         return 1;
 643 }
 644
 645 static int setup_confirm_stdio(
 646                 const ExecContext *context,
 647                 const char *vc,
 648                 int *ret_saved_stdin,
 649                 int *ret_saved_stdout) {
 650
 651         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 652         int r;
 653
 654         assert(ret_saved_stdin);
 655         assert(ret_saved_stdout);
 656
 657         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 658         if (saved_stdin < 0)
 659                 return -errno;
 660
 661         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 662         if (saved_stdout < 0)
 663                 return -errno;
 664
 665         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 666         if (fd < 0)
 667                 return fd;
 668
 669         r = chown_terminal(fd, getuid());
 670         if (r < 0)
 671                 return r;
 672
 673         r = reset_terminal_fd(fd, /* switch_to_text= */ true);
 674         if (r < 0)
 675                 return r;
 676
 677         r = exec_context_apply_tty_size(context, fd, vc);
 678         if (r < 0)
 679                 return r;
 680
 681         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 682         TAKE_FD(fd);
 683         if (r < 0)
 684                 return r;
 685
 686         *ret_saved_stdin = TAKE_FD(saved_stdin);
 687         *ret_saved_stdout = TAKE_FD(saved_stdout);
 688         return 0;
 689 }
 690
 691 static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
 692         assert(err < 0);
 693         assert(unit_id);
 694
 695         if (err == -ETIMEDOUT)
 696                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
 697         else {
 698                 errno = -err;
 699                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
 700         }
 701 }
 702
 703 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
 704         _cleanup_close_ int fd = -EBADF;
 705
 706         assert(vc);
 707
 708         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 709         if (fd < 0)
 710                 return;
 711
 712         write_confirm_error_fd(err, fd, unit_id);
 713 }
 714
 715 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 716         int r = 0;
 717
 718         assert(saved_stdin);
 719         assert(saved_stdout);
 720
 721         release_terminal();
 722
 723         if (*saved_stdin >= 0)
 724                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 725                         r = -errno;
 726
 727         if (*saved_stdout >= 0)
 728                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 729                         r = -errno;
 730
 731         *saved_stdin = safe_close(*saved_stdin);
 732         *saved_stdout = safe_close(*saved_stdout);
 733
 734         return r;
 735 }
 736
 737 enum {
 738         CONFIRM_PRETEND_FAILURE = -1,
 739         CONFIRM_PRETEND_SUCCESS =  0,
 740         CONFIRM_EXECUTE = 1,
 741 };
 742
 743 static bool confirm_spawn_disabled(void) {
 744         return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
 745 }
 746
 747 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
 748         int saved_stdout = -1, saved_stdin = -1, r;
 749         _cleanup_free_ char *e = NULL;
 750         char c;
 751
 752         assert(context);
 753         assert(params);
 754
 755         /* For any internal errors, assume a positive response. */
 756         r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
 757         if (r < 0) {
 758                 write_confirm_error(r, params->confirm_spawn, params->unit_id);
 759                 return CONFIRM_EXECUTE;
 760         }
 761
 762         /* confirm_spawn might have been disabled while we were sleeping. */
 763         if (!params->confirm_spawn || confirm_spawn_disabled()) {
 764                 r = 1;
 765                 goto restore_stdio;
 766         }
 767
 768         e = ellipsize(cmdline, 60, 100);
 769         if (!e) {
 770                 log_oom();
 771                 r = CONFIRM_EXECUTE;
 772                 goto restore_stdio;
 773         }
 774
 775         for (;;) {
 776                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 777                 if (r < 0) {
 778                         write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
 779                         r = CONFIRM_EXECUTE;
 780                         goto restore_stdio;
 781                 }
 782
 783                 switch (c) {
 784                 case 'c':
 785                         printf("Resuming normal execution.\n");
 786                         manager_disable_confirm_spawn();
 787                         r = 1;
 788                         break;
 789                 case 'D':
 790                         printf("  Unit: %s\n",
 791                                params->unit_id);
 792                         exec_context_dump(context, stdout, "  ");
 793                         exec_params_dump(params, stdout, "  ");
 794                         continue; /* ask again */
 795                 case 'f':
 796                         printf("Failing execution.\n");
 797                         r = CONFIRM_PRETEND_FAILURE;
 798                         break;
 799                 case 'h':
 800                         printf("  c - continue, proceed without asking anymore\n"
 801                                "  D - dump, show the state of the unit\n"
 802                                "  f - fail, don't execute the command and pretend it failed\n"
 803                                "  h - help\n"
 804                                "  i - info, show a short summary of the unit\n"
 805                                "  j - jobs, show jobs that are in progress\n"
 806                                "  s - skip, don't execute the command and pretend it succeeded\n"
 807                                "  y - yes, execute the command\n");
 808                         continue; /* ask again */
 809                 case 'i':
 810                         printf("  Unit:        %s\n"
 811                                "  Command:     %s\n",
 812                                params->unit_id, cmdline);
 813                         continue; /* ask again */
 814                 case 'j':
 815                         if (sigqueue(getppid(),
 816                                      SIGRTMIN+18,
 817                                      (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
 818                                 return -errno;
 819
 820                         continue; /* ask again */
 821                 case 'n':
 822                         /* 'n' was removed in favor of 'f'. */
 823                         printf("Didn't understand 'n', did you mean 'f'?\n");
 824                         continue; /* ask again */
 825                 case 's':
 826                         printf("Skipping execution.\n");
 827                         r = CONFIRM_PRETEND_SUCCESS;
 828                         break;
 829                 case 'y':
 830                         r = CONFIRM_EXECUTE;
 831                         break;
 832                 default:
 833                         assert_not_reached();
 834                 }
 835                 break;
 836         }
 837
 838 restore_stdio:
 839         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 840         return r;
 841 }
 842
 843 static int get_fixed_user(
 844                 const char *user_or_uid,
 845                 const char **ret_username,
 846                 uid_t *ret_uid,
 847                 gid_t *ret_gid,
 848                 const char **ret_home,
 849                 const char **ret_shell) {
 850
 851         int r;
 852
 853         assert(user_or_uid);
 854         assert(ret_username);
 855
 856         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 857          * (i.e. are "/" or "/bin/nologin"). */
 858
 859         r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
 860         if (r < 0)
 861                 return r;
 862
 863         /* user_or_uid is normalized by get_user_creds to username */
 864         *ret_username = user_or_uid;
 865
 866         return 0;
 867 }
 868
 869 static int get_fixed_group(
 870                 const char *group_or_gid,
 871                 const char **ret_groupname,
 872                 gid_t *ret_gid) {
 873
 874         int r;
 875
 876         assert(group_or_gid);
 877         assert(ret_groupname);
 878
 879         r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
 880         if (r < 0)
 881                 return r;
 882
 883         /* group_or_gid is normalized by get_group_creds to groupname */
 884         *ret_groupname = group_or_gid;
 885
 886         return 0;
 887 }
 888
 889 static int get_supplementary_groups(const ExecContext *c, const char *user,
 890                                     const char *group, gid_t gid,
 891                                     gid_t **supplementary_gids, int *ngids) {
 892         int r, k = 0;
 893         int ngroups_max;
 894         bool keep_groups = false;
 895         gid_t *groups = NULL;
 896         _cleanup_free_ gid_t *l_gids = NULL;
 897
 898         assert(c);
 899
 900         /*
 901          * If user is given, then lookup GID and supplementary groups list.
 902          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 903          * here and as early as possible so we keep the list of supplementary
 904          * groups of the caller.
 905          */
 906         if (user && gid_is_valid(gid) && gid != 0) {
 907                 /* First step, initialize groups from /etc/groups */
 908                 if (initgroups(user, gid) < 0)
 909                         return -errno;
 910
 911                 keep_groups = true;
 912         }
 913
 914         if (strv_isempty(c->supplementary_groups))
 915                 return 0;
 916
 917         /*
 918          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 919          * be positive, otherwise fail.
 920          */
 921         errno = 0;
 922         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 923         if (ngroups_max <= 0)
 924                 return errno_or_else(EOPNOTSUPP);
 925
 926         l_gids = new(gid_t, ngroups_max);
 927         if (!l_gids)
 928                 return -ENOMEM;
 929
 930         if (keep_groups) {
 931                 /*
 932                  * Lookup the list of groups that the user belongs to, we
 933                  * avoid NSS lookups here too for gid=0.
 934                  */
 935                 k = ngroups_max;
 936                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 937                         return -EINVAL;
 938         } else
 939                 k = 0;
 940
 941         STRV_FOREACH(i, c->supplementary_groups) {
 942                 const char *g;
 943
 944                 if (k >= ngroups_max)
 945                         return -E2BIG;
 946
 947                 g = *i;
 948                 r = get_group_creds(&g, l_gids+k, 0);
 949                 if (r < 0)
 950                         return r;
 951
 952                 k++;
 953         }
 954
 955         /*
 956          * Sets ngids to zero to drop all supplementary groups, happens
 957          * when we are under root and SupplementaryGroups= is empty.
 958          */
 959         if (k == 0) {
 960                 *ngids = 0;
 961                 return 0;
 962         }
 963
 964         /* Otherwise get the final list of supplementary groups */
 965         groups = memdup(l_gids, sizeof(gid_t) * k);
 966         if (!groups)
 967                 return -ENOMEM;
 968
 969         *supplementary_gids = groups;
 970         *ngids = k;
 971
 972         groups = NULL;
 973
 974         return 0;
 975 }
 976
 977 static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
 978         int r;
 979
 980         /* Handle SupplementaryGroups= if it is not empty */
 981         if (ngids > 0) {
 982                 r = maybe_setgroups(ngids, supplementary_gids);
 983                 if (r < 0)
 984                         return r;
 985         }
 986
 987         if (gid_is_valid(gid)) {
 988                 /* Then set our gids */
 989                 if (setresgid(gid, gid, gid) < 0)
 990                         return -errno;
 991         }
 992
 993         return 0;
 994 }
 995
 996 static int set_securebits(unsigned bits, unsigned mask) {
 997         unsigned applied;
 998         int current;
 999
1000         current = prctl(PR_GET_SECUREBITS);
1001         if (current < 0)
1002                 return -errno;
1003
1004         /* Clear all securebits defined in mask and set bits */
1005         applied = ((unsigned) current & ~mask) | bits;
1006         if ((unsigned) current == applied)
1007                 return 0;
1008
1009         if (prctl(PR_SET_SECUREBITS, applied) < 0)
1010                 return -errno;
1011
1012         return 1;
1013 }
1014
1015 static int enforce_user(
1016                 const ExecContext *context,
1017                 uid_t uid,
1018                 uint64_t capability_ambient_set) {
1019         assert(context);
1020         int r;
1021
1022         if (!uid_is_valid(uid))
1023                 return 0;
1024
1025         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1026          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1027          * case. */
1028
1029         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1030
1031                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1032                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1033                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1034                 if (r < 0)
1035                         return r;
1036         }
1037
1038         /* Second step: actually set the uids */
1039         if (setresuid(uid, uid, uid) < 0)
1040                 return -errno;
1041
1042         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1043          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1044          * outside of this call. */
1045         return 0;
1046 }
1047
1048 #if HAVE_PAM
1049
/* PAM conversation callback that refuses every conversation: all four arguments are ignored and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        /* We don't support conversations */

        return PAM_CONV_ERR;
}
1060
1061 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1062         int r, s;
1063
1064         assert(handle);
1065
1066         r = pam_close_session(handle, flags);
1067         if (r != PAM_SUCCESS)
1068                 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1069
1070         s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1071         if (s != PAM_SUCCESS)
1072                 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1073
1074         return r != PAM_SUCCESS ? r : s;
1075 }
1076
1077 #endif
1078
/* Opens a PAM session for PAM service 'name' and user 'user', and forks off a "(sd-pam)" helper
 * process whose job is to close the session again once the calling process has gone away.
 *
 *   name      PAM service name passed to pam_start()
 *   user      user name passed to pam_start()
 *   uid, gid  credentials the helper process switches to via fully_set_uid_gid()
 *   tty       TTY to report as PAM_TTY; if NULL, the TTY name of stdin is used when it can be determined
 *   env       every entry is exported to PAM with pam_putenv(); on success the list is replaced by the
 *             one returned from pam_getenvlist()
 *   fds       fds (n_fds of them) that get closed in the helper process
 *   exec_fd   fd that also gets closed in the helper process
 *
 * On success the result of strv_free_and_replace() is returned. On failure the error is logged, the
 * PAM handle is torn down (closing the session if it was already opened) and a negative errno-style
 * value is returned; PAM-level failures are reported as -EPERM. When built without PAM support this is
 * a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds,
                int exec_fd) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Ask PAM to be quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to tear down a handle we never got */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller-provided environment to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM) >= 0);

        /* Remember our PID so that the child can tell whether it has been reparented */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, NULL);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
                 * we'd never signal completion. */
                exec_fd = safe_close(exec_fd);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;
                        int sig;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives (it was blocked before the fork, see above) */
                        assert_se(sigwait(&ss, &sig) == 0);
                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM-provided environment over to the caller */
        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set) or a non-PAM step failed (r is set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1285
1286 static void rename_process_from_path(const char *path) {
1287         _cleanup_free_ char *buf = NULL;
1288         const char *p;
1289
1290         assert(path);
1291
1292         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1293          * /bin/ps */
1294
1295         if (path_extract_filename(path, &buf) < 0) {
1296                 rename_process("(...)");
1297                 return;
1298         }
1299
1300         size_t l = strlen(buf);
1301         if (l > 8) {
1302                 /* The end of the process name is usually more interesting, since the first bit might just be
1303                  * "systemd-" */
1304                 p = buf + l - 8;
1305                 l = 8;
1306         } else
1307                 p = buf;
1308
1309         char process_name[11];
1310         process_name[0] = '(';
1311         memcpy(process_name+1, p, l);
1312         process_name[1+l] = ')';
1313         process_name[1+l+1] = 0;
1314
1315         (void) rename_process(process_name);
1316 }
1317
1318 static bool context_has_address_families(const ExecContext *c) {
1319         assert(c);
1320
1321         return c->address_families_allow_list ||
1322                 !set_isempty(c->address_families);
1323 }
1324
1325 static bool context_has_syscall_filters(const ExecContext *c) {
1326         assert(c);
1327
1328         return c->syscall_allow_list ||
1329                 !hashmap_isempty(c->syscall_filter);
1330 }
1331
1332 static bool context_has_syscall_logs(const ExecContext *c) {
1333         assert(c);
1334
1335         return c->syscall_log_allow_list ||
1336                 !hashmap_isempty(c->syscall_log);
1337 }
1338
1339 static bool context_has_seccomp(const ExecContext *c) {
1340         /* We need NNP if we have any form of seccomp and are unprivileged */
1341         return c->lock_personality ||
1342                 c->memory_deny_write_execute ||
1343                 c->private_devices ||
1344                 c->protect_clock ||
1345                 c->protect_hostname ||
1346                 c->protect_kernel_tunables ||
1347                 c->protect_kernel_modules ||
1348                 c->protect_kernel_logs ||
1349                 context_has_address_families(c) ||
1350                 exec_context_restrict_namespaces_set(c) ||
1351                 c->restrict_realtime ||
1352                 c->restrict_suid_sgid ||
1353                 !set_isempty(c->syscall_archs) ||
1354                 context_has_syscall_filters(c) ||
1355                 context_has_syscall_logs(c);
1356 }
1357
1358 static bool context_has_no_new_privileges(const ExecContext *c) {
1359         assert(c);
1360
1361         if (c->no_new_privileges)
1362                 return true;
1363
1364         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1365                 return false;
1366
1367         return context_has_seccomp(c);
1368 }
1369
1370 #if HAVE_SECCOMP
1371
1372 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1373         void *id, *val;
1374         bool has_capget = false, has_capset = false, has_prctl = false;
1375
1376         assert(c);
1377
1378         /* No syscall filter, we are allowed to drop privileges */
1379         if (hashmap_isempty(c->syscall_filter))
1380                 return true;
1381
1382         HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1383                 _cleanup_free_ char *name = NULL;
1384
1385                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1386
1387                 if (streq(name, "capget"))
1388                         has_capget = true;
1389                 else if (streq(name, "capset"))
1390                         has_capset = true;
1391                 else if (streq(name, "prctl"))
1392                         has_prctl = true;
1393         }
1394
1395         if (c->syscall_allow_list)
1396                 return has_capget && has_capset && has_prctl;
1397         else
1398                 return !(has_capget || has_capset || has_prctl);
1399 }
1400
1401 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1402
1403         if (is_seccomp_available())
1404                 return false;
1405
1406         log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1407         return true;
1408 }
1409
1410 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1411         uint32_t negative_action, default_action, action;
1412         int r;
1413
1414         assert(c);
1415         assert(p);
1416
1417         if (!context_has_syscall_filters(c))
1418                 return 0;
1419
1420         if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1421                 return 0;
1422
1423         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1424
1425         if (c->syscall_allow_list) {
1426                 default_action = negative_action;
1427                 action = SCMP_ACT_ALLOW;
1428         } else {
1429                 default_action = SCMP_ACT_ALLOW;
1430                 action = negative_action;
1431         }
1432
1433         if (needs_ambient_hack) {
1434                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1435                 if (r < 0)
1436                         return r;
1437         }
1438
1439         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1440 }
1441
1442 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1443 #ifdef SCMP_ACT_LOG
1444         uint32_t default_action, action;
1445 #endif
1446
1447         assert(c);
1448         assert(p);
1449
1450         if (!context_has_syscall_logs(c))
1451                 return 0;
1452
1453 #ifdef SCMP_ACT_LOG
1454         if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1455                 return 0;
1456
1457         if (c->syscall_log_allow_list) {
1458                 /* Log nothing but the ones listed */
1459                 default_action = SCMP_ACT_ALLOW;
1460                 action = SCMP_ACT_LOG;
1461         } else {
1462                 /* Log everything but the ones listed */
1463                 default_action = SCMP_ACT_LOG;
1464                 action = SCMP_ACT_ALLOW;
1465         }
1466
1467         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1468 #else
1469         /* old libseccomp */
1470         log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1471         return 0;
1472 #endif
1473 }
1474
1475 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1476         assert(c);
1477         assert(p);
1478
1479         if (set_isempty(c->syscall_archs))
1480                 return 0;
1481
1482         if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1483                 return 0;
1484
1485         return seccomp_restrict_archs(c->syscall_archs);
1486 }
1487
1488 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1489         assert(c);
1490         assert(p);
1491
1492         if (!context_has_address_families(c))
1493                 return 0;
1494
1495         if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1496                 return 0;
1497
1498         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1499 }
1500
1501 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1502         int r;
1503
1504         assert(c);
1505         assert(p);
1506
1507         if (!c->memory_deny_write_execute)
1508                 return 0;
1509
1510         /* use prctl() if kernel supports it (6.3) */
1511         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1512         if (r == 0) {
1513                 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1514                 return 0;
1515         }
1516         if (r < 0 && errno != EINVAL)
1517                 return log_exec_debug_errno(c,
1518                                             p,
1519                                             errno,
1520                                             "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1521         /* else use seccomp */
1522         log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1523
1524         if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1525                 return 0;
1526
1527         return seccomp_memory_deny_write_execute();
1528 }
1529
1530 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1531         assert(c);
1532         assert(p);
1533
1534         if (!c->restrict_realtime)
1535                 return 0;
1536
1537         if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1538                 return 0;
1539
1540         return seccomp_restrict_realtime();
1541 }
1542
1543 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1544         assert(c);
1545         assert(p);
1546
1547         if (!c->restrict_suid_sgid)
1548                 return 0;
1549
1550         if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1551                 return 0;
1552
1553         return seccomp_restrict_suid_sgid();
1554 }
1555
1556 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1557         assert(c);
1558         assert(p);
1559
1560         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1561          * let's protect even those systems where this is left on in the kernel. */
1562
1563         if (!c->protect_kernel_tunables)
1564                 return 0;
1565
1566         if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1567                 return 0;
1568
1569         return seccomp_protect_sysctl();
1570 }
1571
1572 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1573         assert(c);
1574         assert(p);
1575
1576         /* Turn off module syscalls on ProtectKernelModules=yes */
1577
1578         if (!c->protect_kernel_modules)
1579                 return 0;
1580
1581         if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1582                 return 0;
1583
1584         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1585 }
1586
1587 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1588         assert(c);
1589         assert(p);
1590
1591         if (!c->protect_kernel_logs)
1592                 return 0;
1593
1594         if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1595                 return 0;
1596
1597         return seccomp_protect_syslog();
1598 }
1599
1600 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1601         assert(c);
1602         assert(p);
1603
1604         if (!c->protect_clock)
1605                 return 0;
1606
1607         if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1608                 return 0;
1609
1610         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1611 }
1612
1613 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1614         assert(c);
1615         assert(p);
1616
1617         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1618
1619         if (!c->private_devices)
1620                 return 0;
1621
1622         if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1623                 return 0;
1624
1625         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1626 }
1627
1628 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1629         assert(c);
1630         assert(p);
1631
1632         if (!exec_context_restrict_namespaces_set(c))
1633                 return 0;
1634
1635         if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1636                 return 0;
1637
1638         return seccomp_restrict_namespaces(c->restrict_namespaces);
1639 }
1640
1641 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1642         unsigned long personality;
1643         int r;
1644
1645         assert(c);
1646         assert(p);
1647
1648         if (!c->lock_personality)
1649                 return 0;
1650
1651         if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1652                 return 0;
1653
1654         personality = c->personality;
1655
1656         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1657         if (personality == PERSONALITY_INVALID) {
1658
1659                 r = opinionated_personality(&personality);
1660                 if (r < 0)
1661                         return r;
1662         }
1663
1664         return seccomp_lock_personality(personality);
1665 }
1666
1667 #endif
1668
1669 #if HAVE_LIBBPF
1670 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1671         int r;
1672
1673         assert(c);
1674         assert(p);
1675
1676         if (!exec_context_restrict_filesystems_set(c))
1677                 return 0;
1678
1679         if (p->bpf_restrict_fs_map_fd < 0) {
1680                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1681                 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1682                 return 0;
1683         }
1684
1685         /* We are in a new binary, so dl-open again */
1686         r = dlopen_bpf();
1687         if (r < 0)
1688                 return r;
1689
1690         return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1691 }
1692 #endif
1693
1694 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1695         assert(c);
1696         assert(p);
1697
1698         if (!c->protect_hostname)
1699                 return 0;
1700
1701         if (ns_type_supported(NAMESPACE_UTS)) {
1702                 if (unshare(CLONE_NEWUTS) < 0) {
1703                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1704                                 *ret_exit_status = EXIT_NAMESPACE;
1705                                 return log_exec_error_errno(c,
1706                                                             p,
1707                                                             errno,
1708                                                             "Failed to set up UTS namespacing: %m");
1709                         }
1710
1711                         log_exec_warning(c,
1712                                          p,
1713                                          "ProtectHostname=yes is configured, but UTS namespace setup is "
1714                                          "prohibited (container manager?), ignoring namespace setup.");
1715                 }
1716         } else
1717                 log_exec_warning(c,
1718                                  p,
1719                                  "ProtectHostname=yes is configured, but the kernel does not "
1720                                  "support UTS namespaces, ignoring namespace setup.");
1721
1722 #if HAVE_SECCOMP
1723         int r;
1724
1725         if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1726                 return 0;
1727
1728         r = seccomp_protect_hostname();
1729         if (r < 0) {
1730                 *ret_exit_status = EXIT_SECCOMP;
1731                 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1732         }
1733 #endif
1734
1735         return 0;
1736 }
1737
1738 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1739         assert(idle_pipe);
1740
1741         idle_pipe[1] = safe_close(idle_pipe[1]);
1742         idle_pipe[2] = safe_close(idle_pipe[2]);
1743
1744         if (idle_pipe[0] >= 0) {
1745                 int r;
1746
1747                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1748
1749                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1750                         ssize_t n;
1751
1752                         /* Signal systemd that we are bored and want to continue. */
1753                         n = write(idle_pipe[3], "x", 1);
1754                         if (n > 0)
1755                                 /* Wait for systemd to react to the signal above. */
1756                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1757                 }
1758
1759                 idle_pipe[0] = safe_close(idle_pipe[0]);
1760
1761         }
1762
1763         idle_pipe[3] = safe_close(idle_pipe[3]);
1764 }
1765
/* Forward declaration of the lookup function generated by the string table macro below. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* Maps each ExecDirectoryType to the name of the environment variable through which the selected
 * directory is passed to the service payload (see build_environment()). */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

/* Instantiates the file-private exec_directory_env_name_to_string() for the table above. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1779
/* Builds the set of environment variables the manager itself defines for the process to execute.
 *
 * Depending on the inputs this emits, in this order: LISTEN_PID/LISTEN_FDS/LISTEN_FDNAMES,
 * WATCHDOG_PID/WATCHDOG_USEC, SYSTEMD_NSS_DYNAMIC_BYPASS, USER, LOGNAME, HOME, SHELL, INVOCATION_ID,
 * TERM, JOURNAL_STREAM, LOG_NAMESPACE, one variable per configured exec directory type,
 * CREDENTIALS_DIRECTORY, SYSTEMD_EXEC_PID (always), MEMORY_PRESSURE_WATCH and MEMORY_PRESSURE_WRITE.
 *
 * c, p                  execution context and parameters (must not be NULL)
 * cgroup_context        may be NULL; only read for the memory pressure threshold
 * n_fds                 number of passed file descriptors; 0 suppresses the LISTEN_* variables
 * home/username/shell   may be NULL; replaced by the values for "root" for system units that
 *                       configure neither User= nor DynamicUser=
 * journal_stream_dev/   device and inode used for JOURNAL_STREAM; the variable is only emitted if
 * journal_stream_ino    both are non-zero
 * memory_pressure_path  may be NULL; used for MEMORY_PRESSURE_WATCH
 * ret                   receives the newly allocated string array on success
 *
 * Returns 0 on success, a negative errno-style value on failure (-ENOMEM on allocation failure). */
static int build_environment(
                const ExecContext *c,
                const ExecParameters *p,
                const CGroupContext *cgroup_context,
                size_t n_fds,
                const char *home,
                const char *username,
                const char *shell,
                dev_t journal_stream_dev,
                ino_t journal_stream_ino,
                const char *memory_pressure_path,
                char ***ret) {

        _cleanup_strv_free_ char **our_env = NULL;
        size_t n_env = 0;
        char *x;
        int r;

        assert(c);
        assert(p);
        assert(ret);

        /* Upper bound on the number of slots needed for everything except the per-directory-type
         * variables: 18 variables can be emitted outside of the directory loop, plus one slot that
         * is left untouched at the end (checked by the assert() at the bottom). Keep this in sync
         * when adding variables. */
#define N_ENV_VARS 19
        our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
        if (!our_env)
                return -ENOMEM;

        /* Socket activation style fd passing: PID, count and colon-separated names of the fds. */
        if (n_fds > 0) {
                _cleanup_free_ char *joined = NULL;

                if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                joined = strv_join(p->fd_names, ":");
                if (!joined)
                        return -ENOMEM;

                x = strjoin("LISTEN_FDNAMES=", joined);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* Watchdog settings are only exported if requested and a non-zero timeout is set. */
        if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
                if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
         * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
         * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
        if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
                x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
         * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
         * really make much sense since we're not logged in. Hence we conditionalize the three based on
         * SetLoginEnvironment= switch. */
        if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
                r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
                if (r < 0)
                        return log_exec_debug_errno(c,
                                                    p,
                                                    r,
                                                    "Failed to determine user credentials for root: %m");
        }

        bool set_user_login_env = exec_context_get_set_login_environment(c);

        if (username) {
                x = strjoin("USER=", username);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;

                if (set_user_login_env) {
                        x = strjoin("LOGNAME=", username);
                        if (!x)
                                return -ENOMEM;
                        our_env[n_env++] = x;
                }
        }

        if (home && set_user_login_env) {
                x = strjoin("HOME=", home);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the path part, i.e. skip the 5 characters of "HOME=". */
                path_simplify(x + 5);
                our_env[n_env++] = x;
        }

        if (shell && set_user_login_env) {
                x = strjoin("SHELL=", shell);
                if (!x)
                        return -ENOMEM;

                /* Simplify only the path part, i.e. skip the 6 characters of "SHELL=". */
                path_simplify(x + 6);
                our_env[n_env++] = x;
        }

        if (!sd_id128_is_null(p->invocation_id)) {
                assert(p->invocation_id_string);

                x = strjoin("INVOCATION_ID=", p->invocation_id_string);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (exec_context_needs_term(c)) {
                _cleanup_free_ char *cmdline = NULL;
                const char *tty_path, *term = NULL;

                tty_path = exec_context_tty_path(c);

                /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
                 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
                 * container manager passes to PID 1 ends up all the way in the console login shown. */

                if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
                        term = getenv("TERM");
                else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
                        _cleanup_free_ char *key = NULL;

                        /* For other TTYs with a purely alphanumerical name, look for a per-TTY
                         * "systemd.tty.term.<tty>" option on the kernel command line. */
                        key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
                        if (!key)
                                return -ENOMEM;

                        r = proc_cmdline_get_key(key, 0, &cmdline);
                        if (r < 0)
                                log_exec_debug_errno(c,
                                                     p,
                                                     r,
                                                     "Failed to read %s from kernel cmdline, ignoring: %m",
                                                     key);
                        else if (r > 0)
                                term = cmdline;
                }

                /* Nothing found so far: fall back to the default for this TTY. */
                if (!term)
                        term = default_term_for_tty(tty_path);

                x = strjoin("TERM=", term);
                if (!x)
                        return -ENOMEM;
                our_env[n_env++] = x;
        }

        if (journal_stream_dev != 0 && journal_stream_ino != 0) {
                if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        if (c->log_namespace) {
                x = strjoin("LOG_NAMESPACE=", c->log_namespace);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* For each directory type that has a prefix and at least one configured item, export one
         * variable listing all prefixed directory paths, separated by ":". */
        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
                _cleanup_free_ char *joined = NULL;
                const char *n;

                if (!p->prefix[t])
                        continue;

                if (c->directories[t].n_items == 0)
                        continue;

                n = exec_directory_env_name_to_string(t);
                if (!n)
                        continue;

                for (size_t i = 0; i < c->directories[t].n_items; i++) {
                        _cleanup_free_ char *prefixed = NULL;

                        prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
                        if (!prefixed)
                                return -ENOMEM;

                        if (!strextend_with_separator(&joined, ":", prefixed))
                                return -ENOMEM;
                }

                x = strjoin(n, "=", joined);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* A positive return value means there is a credential directory to export. */
        _cleanup_free_ char *creds_dir = NULL;
        r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
        if (r < 0)
                return r;
        if (r > 0) {
                x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;
        }

        /* This one is set unconditionally. */
        if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
                return -ENOMEM;

        our_env[n_env++] = x;

        if (memory_pressure_path) {
                x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
                if (!x)
                        return -ENOMEM;

                our_env[n_env++] = x;

                /* Unless the watch path is /dev/null, also export the string to write:
                 * "<type> <threshold> <window>", base64 encoded. An unset (infinite) threshold is
                 * replaced by the default; otherwise it is clamped to [1, default window]. */
                if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
                        _cleanup_free_ char *b = NULL, *e = NULL;

                        if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
                                     MEMORY_PRESSURE_DEFAULT_TYPE,
                                     cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
                                     CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
                                     MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
                                return -ENOMEM;

                        /* The + 1 makes the trailing NUL byte part of the encoded data. */
                        if (base64mem(b, strlen(b) + 1, &e) < 0)
                                return -ENOMEM;

                        x = strjoin("MEMORY_PRESSURE_WRITE=", e);
                        if (!x)
                                return -ENOMEM;

                        our_env[n_env++] = x;
                }
        }

        /* Strictly less than: at least one slot must remain unused at the end of the array. */
        assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
#undef N_ENV_VARS

        *ret = TAKE_PTR(our_env);

        return 0;
}
2043
2044 static int build_pass_environment(const ExecContext *c, char ***ret) {
2045         _cleanup_strv_free_ char **pass_env = NULL;
2046         size_t n_env = 0;
2047
2048         STRV_FOREACH(i, c->pass_environment) {
2049                 _cleanup_free_ char *x = NULL;
2050                 char *v;
2051
2052                 v = getenv(*i);
2053                 if (!v)
2054                         continue;
2055                 x = strjoin(*i, "=", v);
2056                 if (!x)
2057                         return -ENOMEM;
2058
2059                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2060                         return -ENOMEM;
2061
2062                 pass_env[n_env++] = TAKE_PTR(x);
2063                 pass_env[n_env] = NULL;
2064         }
2065
2066         *ret = TAKE_PTR(pass_env);
2067
2068         return 0;
2069 }
2070
/* Moves the calling process into a new user namespace and installs its UID/GID maps.
 *
 * ouid/ogid  the original UID/GID, always mapped to themselves
 * uid/gid    the selected UID/GID; additionally mapped to themselves if valid, different from the
 *            original ones and the respective capability (CAP_SETUID/CAP_SETGID) is effective
 *
 * Returns 0 on success. Returns -ENOMEM if a map string cannot be allocated, a negative errno if a
 * system call in the parent fails, the error reported by the helper child if it sent one, or -EIO
 * if the child reported something unexpected or did not exit with EXIT_SUCCESS. */
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
        _cleanup_close_ int unshare_ready_fd = -EBADF;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* value written to, and read back from, the eventfd */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -EBADF;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative errno to the parent, best effort. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that the read() below returns 0 bytes once
         * the child has exited without reporting an error. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        /* TAKE_PID() hands the PID over, so the sigkill_waitp cleanup no longer acts on the child. */
        r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2229
2230 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2231         _cleanup_free_ char *src_abs = NULL;
2232         int r;
2233
2234         assert(source);
2235
2236         src_abs = path_join(root, source);
2237         if (!src_abs)
2238                 return -ENOMEM;
2239
2240         STRV_FOREACH(dst, symlinks) {
2241                 _cleanup_free_ char *dst_abs = NULL;
2242
2243                 dst_abs = path_join(root, *dst);
2244                 if (!dst_abs)
2245                         return -ENOMEM;
2246
2247                 r = mkdir_parents_label(dst_abs, 0755);
2248                 if (r < 0)
2249                         return r;
2250
2251                 r = symlink_idempotent(src_abs, dst_abs, true);
2252                 if (r < 0)
2253                         return r;
2254         }
2255
2256         return 0;
2257 }
2258
2259 static int setup_exec_directory(
2260                 const ExecContext *context,
2261                 const ExecParameters *params,
2262                 uid_t uid,
2263                 gid_t gid,
2264                 ExecDirectoryType type,
2265                 bool needs_mount_namespace,
2266                 int *exit_status) {
2267
2268         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2269                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2270                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2271                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2272                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2273                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2274         };
2275         int r;
2276
2277         assert(context);
2278         assert(params);
2279         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2280         assert(exit_status);
2281
2282         if (!params->prefix[type])
2283                 return 0;
2284
2285         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2286                 if (!uid_is_valid(uid))
2287                         uid = 0;
2288                 if (!gid_is_valid(gid))
2289                         gid = 0;
2290         }
2291
2292         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2293                 _cleanup_free_ char *p = NULL, *pp = NULL;
2294
2295                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2296                 if (!p) {
2297                         r = -ENOMEM;
2298                         goto fail;
2299                 }
2300
2301                 r = mkdir_parents_label(p, 0755);
2302                 if (r < 0)
2303                         goto fail;
2304
2305                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2306
2307                         /* If we are in user mode, and a configuration directory exists but a state directory
2308                          * doesn't exist, then we likely are upgrading from an older systemd version that
2309                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2310                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2311                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2312                          * separated. If a service has both dirs configured but only the configuration dir
2313                          * exists and the state dir does not, we assume we are looking at an update
2314                          * situation. Hence, create a compatibility symlink, so that all expectations are
2315                          * met.
2316                          *
2317                          * (We also do something similar with the log directory, which still doesn't exist in
2318                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2319
2320                         /* this assumes the state dir is always created before the configuration dir */
2321                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2322                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2323
2324                         r = laccess(p, F_OK);
2325                         if (r == -ENOENT) {
2326                                 _cleanup_free_ char *q = NULL;
2327
2328                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2329                                  * under the configuration hierarchy. */
2330
2331                                 if (type == EXEC_DIRECTORY_STATE)
2332                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2333                                 else if (type == EXEC_DIRECTORY_LOGS)
2334                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2335                                 else
2336                                         assert_not_reached();
2337                                 if (!q) {
2338                                         r = -ENOMEM;
2339                                         goto fail;
2340                                 }
2341
2342                                 r = laccess(q, F_OK);
2343                                 if (r >= 0) {
2344                                         /* It does exist! This hence looks like an update. Symlink the
2345                                          * configuration directory into the state directory. */
2346
2347                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2348                                         if (r < 0)
2349                                                 goto fail;
2350
2351                                         log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2352                                         continue;
2353                                 } else if (r != -ENOENT)
2354                                         log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2355
2356                         } else if (r < 0)
2357                                 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2358                 }
2359
2360                 if (exec_directory_is_private(context, type)) {
2361                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2362                          * case we want to avoid leaving a directory around fully accessible that is owned by
2363                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2364                          * trick used by container managers to prohibit host users to get access to files of
2365                          * the same UID in containers: we place everything inside a directory that has an
2366                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2367                          * for unprivileged host code. We then use fs namespacing to make this directory
2368                          * permeable for the service itself.
2369                          *
2370                          * Specifically: for a service which wants a special directory "foo/" we first create
2371                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2372                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2373                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2374                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2375                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2376                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2377                          * for the service and making sure it only gets access to the dirs it needs but no
2378                          * others. Tricky? Yes, absolutely, but it works!
2379                          *
2380                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2381                          * to be owned by the service itself.
2382                          *
2383                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2384                          * for sharing files or sockets with other services. */
2385
2386                         pp = path_join(params->prefix[type], "private");
2387                         if (!pp) {
2388                                 r = -ENOMEM;
2389                                 goto fail;
2390                         }
2391
2392                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2393                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2394                         if (r < 0)
2395                                 goto fail;
2396
2397                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2398                                 r = -ENOMEM;
2399                                 goto fail;
2400                         }
2401
2402                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2403                         r = mkdir_parents_label(pp, 0755);
2404                         if (r < 0)
2405                                 goto fail;
2406
2407                         if (is_dir(p, false) > 0 &&
2408                             (laccess(pp, F_OK) == -ENOENT)) {
2409
2410                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2411                                  * it over. Most likely the service has been upgraded from one that didn't use
2412                                  * DynamicUser=1, to one that does. */
2413
2414                                 log_exec_info(context,
2415                                               params,
2416                                               "Found pre-existing public %s= directory %s, migrating to %s.\n"
2417                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2418                                               exec_directory_type_to_string(type), p, pp);
2419
2420                                 r = RET_NERRNO(rename(p, pp));
2421                                 if (r < 0)
2422                                         goto fail;
2423                         } else {
2424                                 /* Otherwise, create the actual directory for the service */
2425
2426                                 r = mkdir_label(pp, context->directories[type].mode);
2427                                 if (r < 0 && r != -EEXIST)
2428                                         goto fail;
2429                         }
2430
2431                         if (!context->directories[type].items[i].only_create) {
2432                                 /* And link it up from the original place.
2433                                  * Notes
2434                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2435                                  *    the host, and a new one for the child namespace will be created later.
2436                                  * 2) It is not necessary to create this symlink when one of its parent
2437                                  *    directories is specified and already created. E.g.
2438                                  *        StateDirectory=foo foo/bar
2439                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2440                                  *        pp = "/var/lib/private/foo/bar"
2441                                  *        p = "/var/lib/foo/bar"
2442                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2443                                  *    we do not need to create the symlink, but we cannot create the symlink.
2444                                  *    See issue #24783. */
2445                                 r = symlink_idempotent(pp, p, true);
2446                                 if (r < 0)
2447                                         goto fail;
2448                         }
2449
2450                 } else {
2451                         _cleanup_free_ char *target = NULL;
2452
2453                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2454                             readlink_and_make_absolute(p, &target) >= 0) {
2455                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2456
2457                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2458                                  * by DynamicUser=1 (see above)?
2459                                  *
2460                                  * We do this for all directory types except for ConfigurationDirectory=,
2461                                  * since they all support the private/ symlink logic at least in some
2462                                  * configurations, see above. */
2463
2464                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2465                                 if (r < 0)
2466                                         goto fail;
2467
2468                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2469                                 if (!q) {
2470                                         r = -ENOMEM;
2471                                         goto fail;
2472                                 }
2473
2474                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2475                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2476                                 if (r < 0)
2477                                         goto fail;
2478
2479                                 if (path_equal(q_resolved, target_resolved)) {
2480
2481                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2482                                          * but is no longer. Let's move the directory back up. */
2483
2484                                         log_exec_info(context,
2485                                                       params,
2486                                                       "Found pre-existing private %s= directory %s, migrating to %s.\n"
2487                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2488                                                       exec_directory_type_to_string(type), q, p);
2489
2490                                         r = RET_NERRNO(unlink(p));
2491                                         if (r < 0)
2492                                                 goto fail;
2493
2494                                         r = RET_NERRNO(rename(q, p));
2495                                         if (r < 0)
2496                                                 goto fail;
2497                                 }
2498                         }
2499
2500                         r = mkdir_label(p, context->directories[type].mode);
2501                         if (r < 0) {
2502                                 if (r != -EEXIST)
2503                                         goto fail;
2504
2505                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2506                                         struct stat st;
2507
2508                                         /* Don't change the owner/access mode of the configuration directory,
2509                                          * as in the common case it is not written to by a service, and shall
2510                                          * not be writable. */
2511
2512                                         r = RET_NERRNO(stat(p, &st));
2513                                         if (r < 0)
2514                                                 goto fail;
2515
2516                                         /* Still complain if the access mode doesn't match */
2517                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2518                                                 log_exec_warning(context,
2519                                                                  params,
2520                                                                  "%s \'%s\' already exists but the mode is different. "
2521                                                                  "(File system: %o %sMode: %o)",
2522                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2523                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2524
2525                                         continue;
2526                                 }
2527                         }
2528                 }
2529
2530                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2531                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2532                  * current UID/GID ownership.) */
2533                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2534                 if (r < 0)
2535                         goto fail;
2536
2537                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2538                  * available to user code anyway */
2539                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2540                         continue;
2541
2542                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2543                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2544                  * assignments to exist. */
2545                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2546                 if (r < 0)
2547                         goto fail;
2548         }
2549
2550         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2551          * they are set up later, to allow configuring empty var/run/etc. */
2552         if (!needs_mount_namespace)
2553                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2554                         r = create_many_symlinks(params->prefix[type],
2555                                                  context->directories[type].items[i].path,
2556                                                  context->directories[type].items[i].symlinks);
2557                         if (r < 0)
2558                                 goto fail;
2559                 }
2560
2561         return 0;
2562
2563 fail:
2564         *exit_status = exit_status_table[type];
2565         return r;
2566 }
2567
#if ENABLE_SMACK
/* Chooses the SMACK process label and passes it to mac_smack_apply_pid() with PID 0.
 *
 * A label configured in context->smack_process_label always takes precedence. Otherwise, if params
 * carries a fallback label, the SMACK_ATTR_EXEC label read from executable_fd is used when one could be
 * read, and the fallback label when not. If neither is configured nothing is applied.
 *
 * Returns 0 on success, or the negative error reported by the failing helper. */
static int setup_smack(
                const ExecParameters *params,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *label_from_binary = NULL;
        const char *label;
        int r;

        assert(params);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                label = context->smack_process_label;
        else if (params->fallback_smack_process_label) {
                /* A missing xattr is fine, we then simply stick to the fallback label. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label_from_binary);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = label_from_binary ?: params->fallback_smack_process_label;
        } else
                return 0;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2597
2598 static int compile_bind_mounts(
2599                 const ExecContext *context,
2600                 const ExecParameters *params,
2601                 BindMount **ret_bind_mounts,
2602                 size_t *ret_n_bind_mounts,
2603                 char ***ret_empty_directories) {
2604
2605         _cleanup_strv_free_ char **empty_directories = NULL;
2606         BindMount *bind_mounts = NULL;
2607         size_t n, h = 0;
2608         int r;
2609
2610         assert(context);
2611         assert(params);
2612         assert(ret_bind_mounts);
2613         assert(ret_n_bind_mounts);
2614         assert(ret_empty_directories);
2615
2616         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2617
2618         n = context->n_bind_mounts;
2619         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2620                 if (!params->prefix[t])
2621                         continue;
2622
2623                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2624                         n += !context->directories[t].items[i].only_create;
2625         }
2626
2627         if (n <= 0) {
2628                 *ret_bind_mounts = NULL;
2629                 *ret_n_bind_mounts = 0;
2630                 *ret_empty_directories = NULL;
2631                 return 0;
2632         }
2633
2634         bind_mounts = new(BindMount, n);
2635         if (!bind_mounts)
2636                 return -ENOMEM;
2637
2638         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2639                 BindMount *item = context->bind_mounts + i;
2640                 _cleanup_free_ char *s = NULL, *d = NULL;
2641
2642                 s = strdup(item->source);
2643                 if (!s)
2644                         return -ENOMEM;
2645
2646                 d = strdup(item->destination);
2647                 if (!d)
2648                         return -ENOMEM;
2649
2650                 bind_mounts[h++] = (BindMount) {
2651                         .source = TAKE_PTR(s),
2652                         .destination = TAKE_PTR(d),
2653                         .read_only = item->read_only,
2654                         .recursive = item->recursive,
2655                         .ignore_enoent = item->ignore_enoent,
2656                 };
2657         }
2658
2659         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2660                 if (!params->prefix[t])
2661                         continue;
2662
2663                 if (context->directories[t].n_items == 0)
2664                         continue;
2665
2666                 if (exec_directory_is_private(context, t) &&
2667                     !exec_context_with_rootfs(context)) {
2668                         char *private_root;
2669
2670                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2671                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2672                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2673
2674                         private_root = path_join(params->prefix[t], "private");
2675                         if (!private_root)
2676                                 return -ENOMEM;
2677
2678                         r = strv_consume(&empty_directories, private_root);
2679                         if (r < 0)
2680                                 return r;
2681                 }
2682
2683                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2684                         _cleanup_free_ char *s = NULL, *d = NULL;
2685
2686                         /* When one of the parent directories is in the list, we cannot create the symlink
2687                          * for the child directory. See also the comments in setup_exec_directory(). */
2688                         if (context->directories[t].items[i].only_create)
2689                                 continue;
2690
2691                         if (exec_directory_is_private(context, t))
2692                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2693                         else
2694                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2695                         if (!s)
2696                                 return -ENOMEM;
2697
2698                         if (exec_directory_is_private(context, t) &&
2699                             exec_context_with_rootfs(context))
2700                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2701                                  * directory is not created on the root directory. So, let's bind-mount the directory
2702                                  * on the 'non-private' place. */
2703                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2704                         else
2705                                 d = strdup(s);
2706                         if (!d)
2707                                 return -ENOMEM;
2708
2709                         bind_mounts[h++] = (BindMount) {
2710                                 .source = TAKE_PTR(s),
2711                                 .destination = TAKE_PTR(d),
2712                                 .read_only = false,
2713                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2714                                 .recursive = true,
2715                                 .ignore_enoent = false,
2716                         };
2717                 }
2718         }
2719
2720         assert(h == n);
2721
2722         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2723         *ret_n_bind_mounts = n;
2724         *ret_empty_directories = TAKE_PTR(empty_directories);
2725
2726         return (int) n;
2727 }
2728
2729 /* ret_symlinks will contain a list of pairs src:dest that describes
2730  * the symlinks to create later on. For example, the symlinks needed
2731  * to safely give private directories to DynamicUser=1 users. */
2732 static int compile_symlinks(
2733                 const ExecContext *context,
2734                 const ExecParameters *params,
2735                 bool setup_os_release_symlink,
2736                 char ***ret_symlinks) {
2737
2738         _cleanup_strv_free_ char **symlinks = NULL;
2739         int r;
2740
2741         assert(context);
2742         assert(params);
2743         assert(ret_symlinks);
2744
2745         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2746                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2747                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2748
2749                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2750                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2751
2752                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2753                                 dst_abs = path_join(params->prefix[dt], *symlink);
2754                                 if (!src_abs || !dst_abs)
2755                                         return -ENOMEM;
2756
2757                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2758                                 if (r < 0)
2759                                         return r;
2760                         }
2761
2762                         if (!exec_directory_is_private(context, dt) ||
2763                             exec_context_with_rootfs(context) ||
2764                             context->directories[dt].items[i].only_create)
2765                                 continue;
2766
2767                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2768                         if (!private_path)
2769                                 return -ENOMEM;
2770
2771                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2772                         if (!path)
2773                                 return -ENOMEM;
2774
2775                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2776                         if (r < 0)
2777                                 return r;
2778                 }
2779         }
2780
2781         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2782          * and readers will never get a half-written version. Note that, while the paths specified here are
2783          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2784          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2785         if (setup_os_release_symlink) {
2786                 r = strv_extend_many(
2787                                 &symlinks,
2788                                 "/run/host/.os-release-stage/os-release",
2789                                 "/run/host/os-release");
2790                 if (r < 0)
2791                         return r;
2792         }
2793
2794         *ret_symlinks = TAKE_PTR(symlinks);
2795
2796         return 0;
2797 }
2798
2799 static bool insist_on_sandboxing(
2800                 const ExecContext *context,
2801                 const char *root_dir,
2802                 const char *root_image,
2803                 const BindMount *bind_mounts,
2804                 size_t n_bind_mounts) {
2805
2806         assert(context);
2807         assert(n_bind_mounts == 0 || bind_mounts);
2808
2809         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2810          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2811          * rearrange stuff in a way we cannot ignore gracefully. */
2812
2813         if (context->n_temporary_filesystems > 0)
2814                 return true;
2815
2816         if (root_dir || root_image)
2817                 return true;
2818
2819         if (context->n_mount_images > 0)
2820                 return true;
2821
2822         if (context->dynamic_user)
2823                 return true;
2824
2825         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2826                 return true;
2827
2828         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2829          * essential. */
2830         for (size_t i = 0; i < n_bind_mounts; i++)
2831                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2832                         return true;
2833
2834         if (context->log_namespace)
2835                 return true;
2836
2837         return false;
2838 }
2839
2840 static int setup_ephemeral(
2841                 const ExecContext *context,
2842                 ExecRuntime *runtime,
2843                 char **root_image,            /* both input and output! modified if ephemeral logic enabled */
2844                 char **root_directory) {      /* ditto */
2845
2846         _cleanup_close_ int fd = -EBADF;
2847         _cleanup_free_ char *new_root = NULL;
2848         int r;
2849
2850         assert(context);
2851         assert(root_image);
2852         assert(root_directory);
2853
2854         if (!*root_image && !*root_directory)
2855                 return 0;
2856
2857         if (!runtime || !runtime->ephemeral_copy)
2858                 return 0;
2859
2860         assert(runtime->ephemeral_storage_socket[0] >= 0);
2861         assert(runtime->ephemeral_storage_socket[1] >= 0);
2862
2863         new_root = strdup(runtime->ephemeral_copy);
2864         if (!new_root)
2865                 return log_oom_debug();
2866
2867         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2868         if (r < 0)
2869                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2870
2871         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2872
2873         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2874         if (fd >= 0)
2875                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2876                 return 0;
2877         if (fd != -EAGAIN)
2878                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2879
2880         if (*root_image) {
2881                 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2882
2883                 fd = copy_file(*root_image,
2884                                new_root,
2885                                O_EXCL,
2886                                0600,
2887                                COPY_LOCK_BSD|
2888                                COPY_REFLINK|
2889                                COPY_CRTIME);
2890                 if (fd < 0)
2891                         return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2892                                                *root_image, new_root);
2893
2894                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2895                  * which tends to not perform well in combination with lots of random writes.
2896                  *
2897                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2898                  * copy, but we at least want to make the intention clear.
2899                  */
2900                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2901                 if (r < 0)
2902                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2903         } else {
2904                 assert(*root_directory);
2905
2906                 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2907
2908                 fd = btrfs_subvol_snapshot_at(
2909                                 AT_FDCWD, *root_directory,
2910                                 AT_FDCWD, new_root,
2911                                 BTRFS_SNAPSHOT_FALLBACK_COPY |
2912                                 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2913                                 BTRFS_SNAPSHOT_RECURSIVE |
2914                                 BTRFS_SNAPSHOT_LOCK_BSD);
2915                 if (fd < 0)
2916                         return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2917                                                *root_directory, new_root);
2918         }
2919
2920         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2921         if (r < 0)
2922                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2923
2924         if (*root_image)
2925                 free_and_replace(*root_image, new_root);
2926         else {
2927                 assert(*root_directory);
2928                 free_and_replace(*root_directory, new_root);
2929         }
2930
2931         return 1;
2932 }
2933
2934 static int verity_settings_prepare(
2935                 VeritySettings *verity,
2936                 const char *root_image,
2937                 const void *root_hash,
2938                 size_t root_hash_size,
2939                 const char *root_hash_path,
2940                 const void *root_hash_sig,
2941                 size_t root_hash_sig_size,
2942                 const char *root_hash_sig_path,
2943                 const char *verity_data_path) {
2944
2945         int r;
2946
2947         assert(verity);
2948
2949         if (root_hash) {
2950                 void *d;
2951
2952                 d = memdup(root_hash, root_hash_size);
2953                 if (!d)
2954                         return -ENOMEM;
2955
2956                 free_and_replace(verity->root_hash, d);
2957                 verity->root_hash_size = root_hash_size;
2958                 verity->designator = PARTITION_ROOT;
2959         }
2960
2961         if (root_hash_sig) {
2962                 void *d;
2963
2964                 d = memdup(root_hash_sig, root_hash_sig_size);
2965                 if (!d)
2966                         return -ENOMEM;
2967
2968                 free_and_replace(verity->root_hash_sig, d);
2969                 verity->root_hash_sig_size = root_hash_sig_size;
2970                 verity->designator = PARTITION_ROOT;
2971         }
2972
2973         if (verity_data_path) {
2974                 r = free_and_strdup(&verity->data_path, verity_data_path);
2975                 if (r < 0)
2976                         return r;
2977         }
2978
2979         r = verity_settings_load(
2980                         verity,
2981                         root_image,
2982                         root_hash_path,
2983                         root_hash_sig_path);
2984         if (r < 0)
2985                 return log_debug_errno(r, "Failed to load root hash: %m");
2986
2987         return 0;
2988 }
2989
2990 static int pick_versions(
2991                 const ExecContext *context,
2992                 const ExecParameters *params,
2993                 char **ret_root_image,
2994                 char **ret_root_directory) {
2995
2996         int r;
2997
2998         assert(context);
2999         assert(params);
3000         assert(ret_root_image);
3001         assert(ret_root_directory);
3002
3003         if (context->root_image) {
3004                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3005
3006                 r = path_pick(/* toplevel_path= */ NULL,
3007                               /* toplevel_fd= */ AT_FDCWD,
3008                               context->root_image,
3009                               &pick_filter_image_raw,
3010                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3011                               &result);
3012                 if (r < 0)
3013                         return r;
3014
3015                 if (!result.path)
3016                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3017
3018                 *ret_root_image = TAKE_PTR(result.path);
3019                 *ret_root_directory = NULL;
3020                 return r;
3021         }
3022
3023         if (context->root_directory) {
3024                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3025
3026                 r = path_pick(/* toplevel_path= */ NULL,
3027                               /* toplevel_fd= */ AT_FDCWD,
3028                               context->root_directory,
3029                               &pick_filter_image_dir,
3030                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3031                               &result);
3032                 if (r < 0)
3033                         return r;
3034
3035                 if (!result.path)
3036                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3037
3038                 *ret_root_image = NULL;
3039                 *ret_root_directory = TAKE_PTR(result.path);
3040                 return r;
3041         }
3042
3043         *ret_root_image = *ret_root_directory = NULL;
3044         return 0;
3045 }
3046
3047 static int apply_mount_namespace(
3048                 ExecCommandFlags command_flags,
3049                 const ExecContext *context,
3050                 const ExecParameters *params,
3051                 ExecRuntime *runtime,
3052                 const char *memory_pressure_path,
3053                 bool needs_sandboxing,
3054                 char **error_path) {
3055
3056         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3057         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3058                         **read_write_paths_cleanup = NULL;
3059         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3060                 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3061         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3062         char **read_write_paths;
3063         bool setup_os_release_symlink;
3064         BindMount *bind_mounts = NULL;
3065         size_t n_bind_mounts = 0;
3066         int r;
3067
3068         assert(context);
3069
3070         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3071
3072         if (params->flags & EXEC_APPLY_CHROOT) {
3073                 r = pick_versions(
3074                                 context,
3075                                 params,
3076                                 &root_image,
3077                                 &root_dir);
3078                 if (r < 0)
3079                         return r;
3080
3081                 r = setup_ephemeral(
3082                                 context,
3083                                 runtime,
3084                                 &root_image,
3085                                 &root_dir);
3086                 if (r < 0)
3087                         return r;
3088         }
3089
3090         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3091         if (r < 0)
3092                 return r;
3093
3094         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3095          * service will need to write to it in order to start the notifications. */
3096         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3097                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3098                 if (!read_write_paths_cleanup)
3099                         return -ENOMEM;
3100
3101                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3102                 if (r < 0)
3103                         return r;
3104
3105                 read_write_paths = read_write_paths_cleanup;
3106         } else
3107                 read_write_paths = context->read_write_paths;
3108
3109         if (needs_sandboxing) {
3110                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3111                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3112                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3113
3114                 if (context->private_tmp && runtime && runtime->shared) {
3115                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3116                                 tmp_dir = runtime->shared->tmp_dir;
3117                         else if (runtime->shared->tmp_dir)
3118                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3119
3120                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3121                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3122                         else if (runtime->shared->var_tmp_dir)
3123                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3124                 }
3125         }
3126
3127         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3128         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3129         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3130         if (r < 0)
3131                 return r;
3132
3133         if (context->mount_propagation_flag == MS_SHARED)
3134                 log_exec_debug(context,
3135                                params,
3136                                "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3137
3138         r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3139         if (r < 0)
3140                 return r;
3141
3142         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3143                 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3144                 if (!propagate_dir)
3145                         return -ENOMEM;
3146
3147                 incoming_dir = strdup("/run/systemd/incoming");
3148                 if (!incoming_dir)
3149                         return -ENOMEM;
3150
3151                 extension_dir = strdup("/run/systemd/unit-extensions");
3152                 if (!extension_dir)
3153                         return -ENOMEM;
3154
3155                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3156                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3157                 if (setup_os_release_symlink) {
3158                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3159                         if (!host_os_release_stage)
3160                                 return -ENOMEM;
3161                 }
3162         } else {
3163                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3164
3165                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3166                         return -ENOMEM;
3167
3168                 if (setup_os_release_symlink) {
3169                         if (asprintf(&host_os_release_stage,
3170                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3171                                      geteuid()) < 0)
3172                                 return -ENOMEM;
3173                 }
3174         }
3175
3176         if (root_image) {
3177                 r = verity_settings_prepare(
3178                         &verity,
3179                         root_image,
3180                         context->root_hash, context->root_hash_size, context->root_hash_path,
3181                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3182                         context->root_verity);
3183                 if (r < 0)
3184                         return r;
3185         }
3186
3187         NamespaceParameters parameters = {
3188                 .runtime_scope = params->runtime_scope,
3189
3190                 .root_directory = root_dir,
3191                 .root_image = root_image,
3192                 .root_image_options = context->root_image_options,
3193                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3194
3195                 .read_write_paths = read_write_paths,
3196                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3197                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3198
3199                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3200                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3201
3202                 .empty_directories = empty_directories,
3203                 .symlinks = symlinks,
3204
3205                 .bind_mounts = bind_mounts,
3206                 .n_bind_mounts = n_bind_mounts,
3207
3208                 .temporary_filesystems = context->temporary_filesystems,
3209                 .n_temporary_filesystems = context->n_temporary_filesystems,
3210
3211                 .mount_images = context->mount_images,
3212                 .n_mount_images = context->n_mount_images,
3213                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3214
3215                 .tmp_dir = tmp_dir,
3216                 .var_tmp_dir = var_tmp_dir,
3217
3218                 .creds_path = creds_path,
3219                 .log_namespace = context->log_namespace,
3220                 .mount_propagation_flag = context->mount_propagation_flag,
3221
3222                 .verity = &verity,
3223
3224                 .extension_images = context->extension_images,
3225                 .n_extension_images = context->n_extension_images,
3226                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3227                 .extension_directories = context->extension_directories,
3228
3229                 .propagate_dir = propagate_dir,
3230                 .incoming_dir = incoming_dir,
3231                 .extension_dir = extension_dir,
3232                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3233                 .host_os_release_stage = host_os_release_stage,
3234
3235                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3236                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3237                  * sandbox inside the mount namespace. */
3238                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3239
3240                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3241                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3242                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3243                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3244                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3245
3246                 .private_dev = needs_sandboxing && context->private_devices,
3247                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3248                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3249
3250                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3251
3252                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3253                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3254
3255                 .protect_home = needs_sandboxing ? context->protect_home : false,
3256                 .protect_system = needs_sandboxing ? context->protect_system : false,
3257                 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3258                 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3259         };
3260
3261         r = setup_namespace(&parameters, error_path);
3262         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3263          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3264          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3265          * completely different execution environment. */
3266         if (r == -ENOANO) {
3267                 if (insist_on_sandboxing(
3268                                     context,
3269                                     root_dir, root_image,
3270                                     bind_mounts,
3271                                     n_bind_mounts))
3272                         return log_exec_debug_errno(context,
3273                                                     params,
3274                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3275                                                     "Failed to set up namespace, and refusing to continue since "
3276                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3277                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3278                                                     n_bind_mounts,
3279                                                     context->n_temporary_filesystems,
3280                                                     yes_no(root_dir),
3281                                                     yes_no(root_image),
3282                                                     yes_no(context->dynamic_user));
3283
3284                 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3285                 return 0;
3286         }
3287
3288         return r;
3289 }
3290
3291 static int apply_working_directory(
3292                 const ExecContext *context,
3293                 const ExecParameters *params,
3294                 ExecRuntime *runtime,
3295                 const char *home,
3296                 int *exit_status) {
3297
3298         const char *wd;
3299         int r;
3300
3301         assert(context);
3302         assert(exit_status);
3303
3304         if (context->working_directory_home) {
3305                 if (!home) {
3306                         *exit_status = EXIT_CHDIR;
3307                         return -ENXIO;
3308                 }
3309
3310                 wd = home;
3311         } else
3312                 wd = empty_to_root(context->working_directory);
3313
3314         if (params->flags & EXEC_APPLY_CHROOT)
3315                 r = RET_NERRNO(chdir(wd));
3316         else {
3317                 _cleanup_close_ int dfd = -EBADF;
3318
3319                 r = chase(wd,
3320                           (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3321                           CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3322                           /* ret_path= */ NULL,
3323                           &dfd);
3324                 if (r >= 0)
3325                         r = RET_NERRNO(fchdir(dfd));
3326         }
3327
3328         if (r < 0 && !context->working_directory_missing_ok) {
3329                 *exit_status = EXIT_CHDIR;
3330                 return r;
3331         }
3332
3333         return 0;
3334 }
3335
3336 static int apply_root_directory(
3337                 const ExecContext *context,
3338                 const ExecParameters *params,
3339                 ExecRuntime *runtime,
3340                 const bool needs_mount_ns,
3341                 int *exit_status) {
3342
3343         assert(context);
3344         assert(exit_status);
3345
3346         if (params->flags & EXEC_APPLY_CHROOT)
3347                 if (!needs_mount_ns && context->root_directory)
3348                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3349                                 *exit_status = EXIT_CHROOT;
3350                                 return -errno;
3351                         }
3352
3353         return 0;
3354 }
3355
3356 static int setup_keyring(
3357                 const ExecContext *context,
3358                 const ExecParameters *p,
3359                 uid_t uid, gid_t gid) {
3360
3361         key_serial_t keyring;
3362         int r = 0;
3363         uid_t saved_uid;
3364         gid_t saved_gid;
3365
3366         assert(context);
3367         assert(p);
3368
3369         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3370          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3371          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3372          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3373          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3374          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3375
3376         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3377                 return 0;
3378
3379         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3380          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3381          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3382          * & group is just as nasty as acquiring a reference to the user keyring. */
3383
3384         saved_uid = getuid();
3385         saved_gid = getgid();
3386
3387         if (gid_is_valid(gid) && gid != saved_gid) {
3388                 if (setregid(gid, -1) < 0)
3389                         return log_exec_error_errno(context,
3390                                                     p,
3391                                                     errno,
3392                                                     "Failed to change GID for user keyring: %m");
3393         }
3394
3395         if (uid_is_valid(uid) && uid != saved_uid) {
3396                 if (setreuid(uid, -1) < 0) {
3397                         r = log_exec_error_errno(context,
3398                                                  p,
3399                                                  errno,
3400                                                  "Failed to change UID for user keyring: %m");
3401                         goto out;
3402                 }
3403         }
3404
3405         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3406         if (keyring == -1) {
3407                 if (errno == ENOSYS)
3408                         log_exec_debug_errno(context,
3409                                              p,
3410                                              errno,
3411                                              "Kernel keyring not supported, ignoring.");
3412                 else if (ERRNO_IS_PRIVILEGE(errno))
3413                         log_exec_debug_errno(context,
3414                                              p,
3415                                              errno,
3416                                              "Kernel keyring access prohibited, ignoring.");
3417                 else if (errno == EDQUOT)
3418                         log_exec_debug_errno(context,
3419                                              p,
3420                                              errno,
3421                                              "Out of kernel keyrings to allocate, ignoring.");
3422                 else
3423                         r = log_exec_error_errno(context,
3424                                                  p,
3425                                                  errno,
3426                                                  "Setting up kernel keyring failed: %m");
3427
3428                 goto out;
3429         }
3430
3431         /* When requested link the user keyring into the session keyring. */
3432         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3433
3434                 if (keyctl(KEYCTL_LINK,
3435                            KEY_SPEC_USER_KEYRING,
3436                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3437                         r = log_exec_error_errno(context,
3438                                                  p,
3439                                                  errno,
3440                                                  "Failed to link user keyring into session keyring: %m");
3441                         goto out;
3442                 }
3443         }
3444
3445         /* Restore uid/gid back */
3446         if (uid_is_valid(uid) && uid != saved_uid) {
3447                 if (setreuid(saved_uid, -1) < 0) {
3448                         r = log_exec_error_errno(context,
3449                                                  p,
3450                                                  errno,
3451                                                  "Failed to change UID back for user keyring: %m");
3452                         goto out;
3453                 }
3454         }
3455
3456         if (gid_is_valid(gid) && gid != saved_gid) {
3457                 if (setregid(saved_gid, -1) < 0)
3458                         return log_exec_error_errno(context,
3459                                                     p,
3460                                                     errno,
3461                                                     "Failed to change GID back for user keyring: %m");
3462         }
3463
3464         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3465         if (!sd_id128_is_null(p->invocation_id)) {
3466                 key_serial_t key;
3467
3468                 key = add_key("user",
3469                               "invocation_id",
3470                               &p->invocation_id,
3471                               sizeof(p->invocation_id),
3472                               KEY_SPEC_SESSION_KEYRING);
3473                 if (key == -1)
3474                         log_exec_debug_errno(context,
3475                                              p,
3476                                              errno,
3477                                              "Failed to add invocation ID to keyring, ignoring: %m");
3478                 else {
3479                         if (keyctl(KEYCTL_SETPERM, key,
3480                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3481                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3482                                 r = log_exec_error_errno(context,
3483                                                          p,
3484                                                          errno,
3485                                                          "Failed to restrict invocation ID permission: %m");
3486                 }
3487         }
3488
3489 out:
3490         /* Revert back uid & gid for the last time, and exit */
3491         /* no extra logging, as only the first already reported error matters */
3492         if (getuid() != saved_uid)
3493                 (void) setreuid(saved_uid, -1);
3494
3495         if (getgid() != saved_gid)
3496                 (void) setregid(saved_gid, -1);
3497
3498         return r;
3499 }
3500
/* Appends each valid (i.e. non-negative) fd of the given pair to 'array', in order, and advances the
 * element counter *n accordingly. The caller must make sure the array has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3511
3512 static int close_remaining_fds(
3513                 const ExecParameters *params,
3514                 const ExecRuntime *runtime,
3515                 int socket_fd,
3516                 const int *fds, size_t n_fds) {
3517
3518         size_t n_dont_close = 0;
3519         int dont_close[n_fds + 14];
3520
3521         assert(params);
3522
3523         if (params->stdin_fd >= 0)
3524                 dont_close[n_dont_close++] = params->stdin_fd;
3525         if (params->stdout_fd >= 0)
3526                 dont_close[n_dont_close++] = params->stdout_fd;
3527         if (params->stderr_fd >= 0)
3528                 dont_close[n_dont_close++] = params->stderr_fd;
3529
3530         if (socket_fd >= 0)
3531                 dont_close[n_dont_close++] = socket_fd;
3532         if (n_fds > 0) {
3533                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3534                 n_dont_close += n_fds;
3535         }
3536
3537         if (runtime)
3538                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3539
3540         if (runtime && runtime->shared) {
3541                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3542                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3543         }
3544
3545         if (runtime && runtime->dynamic_creds) {
3546                 if (runtime->dynamic_creds->user)
3547                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3548                 if (runtime->dynamic_creds->group)
3549                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3550         }
3551
3552         if (params->user_lookup_fd >= 0)
3553                 dont_close[n_dont_close++] = params->user_lookup_fd;
3554
3555         return close_all_fds(dont_close, n_dont_close);
3556 }
3557
/* Reports the resolved UID/GID of the unit over 'user_lookup_fd', as a single write consisting of the
 * UID, the GID and the unit name, in that order. Nothing is sent if there is no fd to write to, or if
 * neither a valid UID nor a valid GID is known. Returns 0 on success (or if there was nothing to do), a
 * negative errno if the write failed. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0)
                return 0;

        /* No user/group was resolved, hence there's nothing worth telling. */
        if (!uid_is_valid(uid) && !gid_is_valid(gid))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, 3) < 0)
                return -errno;

        return 0;
}
3585
3586 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3587         int r;
3588
3589         assert(c);
3590         assert(home);
3591         assert(buf);
3592
3593         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3594
3595         if (*home)
3596                 return 0;
3597
3598         if (!c->working_directory_home)
3599                 return 0;
3600
3601         r = get_home_dir(buf);
3602         if (r < 0)
3603                 return r;
3604
3605         *home = *buf;
3606         return 1;
3607 }
3608
3609 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3610         _cleanup_strv_free_ char ** list = NULL;
3611         int r;
3612
3613         assert(c);
3614         assert(p);
3615         assert(ret);
3616
3617         assert(c->dynamic_user);
3618
3619         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3620          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3621          * directories. */
3622
3623         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3624                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3625                         continue;
3626
3627                 if (!p->prefix[t])
3628                         continue;
3629
3630                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3631                         char *e;
3632
3633                         if (exec_directory_is_private(c, t))
3634                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3635                         else
3636                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3637                         if (!e)
3638                                 return -ENOMEM;
3639
3640                         r = strv_consume(&list, e);
3641                         if (r < 0)
3642                                 return r;
3643                 }
3644         }
3645
3646         *ret = TAKE_PTR(list);
3647
3648         return 0;
3649 }
3650
3651 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3652         _cleanup_(cpu_set_reset) CPUSet s = {};
3653         int r;
3654
3655         assert(c);
3656         assert(ret);
3657
3658         if (!c->numa_policy.nodes.set) {
3659                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3660                 return 0;
3661         }
3662
3663         r = numa_to_cpu_set(&c->numa_policy, &s);
3664         if (r < 0)
3665                 return r;
3666
3667         cpu_set_reset(ret);
3668
3669         return cpu_set_add_all(ret, &s);
3670 }
3671
3672 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3673         int r;
3674
3675         assert(fds);
3676         assert(n_fds);
3677         assert(*n_fds < fds_size);
3678         assert(fd);
3679
3680         if (*fd < 0)
3681                return 0;
3682
3683         if (*fd < 3 + (int) *n_fds) {
3684                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3685                  * the fds we pass to the process (or which are closed only during execve). */
3686
3687                 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3688                 if (r < 0)
3689                         return -errno;
3690
3691                 close_and_replace(*fd, r);
3692         }
3693
3694         fds[(*n_fds)++] = *fd;
3695         return 1;
3696 }
3697
3698 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3699         union sockaddr_union addr = {
3700                 .un.sun_family = AF_UNIX,
3701         };
3702         socklen_t sa_len;
3703         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3704         int r;
3705
3706         assert(c);
3707         assert(p);
3708         assert(of);
3709         assert(ofd >= 0);
3710
3711         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3712         if (r < 0)
3713                 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3714
3715         sa_len = r;
3716
3717         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3718                 _cleanup_close_ int fd = -EBADF;
3719
3720                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3721                 if (fd < 0)
3722                         return log_exec_error_errno(c,
3723                                                     p,
3724                                                     errno,
3725                                                     "Failed to create socket for %s: %m",
3726                                                     of->path);
3727
3728                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3729                 if (r == -EPROTOTYPE)
3730                         continue;
3731                 if (r < 0)
3732                         return log_exec_error_errno(c,
3733                                                     p,
3734                                                     r,
3735                                                     "Failed to connect socket for %s: %m",
3736                                                     of->path);
3737
3738                 return TAKE_FD(fd);
3739         }
3740
3741         return log_exec_error_errno(c,
3742                                     p,
3743                                     SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3744                                     of->path);
3745 }
3746
3747 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3748         struct stat st;
3749         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3750
3751         assert(c);
3752         assert(p);
3753         assert(of);
3754
3755         ofd = open(of->path, O_PATH | O_CLOEXEC);
3756         if (ofd < 0)
3757                 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3758
3759         if (fstat(ofd, &st) < 0)
3760                 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3761
3762         if (S_ISSOCK(st.st_mode)) {
3763                 fd = connect_unix_harder(c, p, of, ofd);
3764                 if (fd < 0)
3765                         return fd;
3766
3767                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3768                         return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3769                                                     of->path);
3770
3771                 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3772         } else {
3773                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3774                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3775                         flags |= O_APPEND;
3776                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3777                         flags |= O_TRUNC;
3778
3779                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3780                 if (fd < 0)
3781                         return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3782
3783                 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3784         }
3785
3786         return TAKE_FD(fd);
3787 }
3788
3789 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3790         int r;
3791
3792         assert(c);
3793         assert(p);
3794         assert(n_fds);
3795
3796         LIST_FOREACH(open_files, of, p->open_files) {
3797                 _cleanup_close_ int fd = -EBADF;
3798
3799                 fd = get_open_file_fd(c, p, of);
3800                 if (fd < 0) {
3801                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3802                                 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3803                                 continue;
3804                         }
3805
3806                         return fd;
3807                 }
3808
3809                 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3810                         return -ENOMEM;
3811
3812                 r = strv_extend(&p->fd_names, of->fdname);
3813                 if (r < 0)
3814                         return r;
3815
3816                 p->fds[*n_fds] = TAKE_FD(fd);
3817
3818                 (*n_fds)++;
3819         }
3820
3821         return 0;
3822 }
3823
3824 static void log_command_line(
3825                 const ExecContext *context,
3826                 const ExecParameters *params,
3827                 const char *msg,
3828                 const char *executable,
3829                 char **argv) {
3830
3831         assert(context);
3832         assert(params);
3833         assert(msg);
3834         assert(executable);
3835
3836         if (!DEBUG_LOGGING)
3837                 return;
3838
3839         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3840
3841         log_exec_struct(context, params, LOG_DEBUG,
3842                         "EXECUTABLE=%s", executable,
3843                         LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3844                         LOG_EXEC_INVOCATION_ID(params));
3845 }
3846
3847 static bool exec_context_need_unprivileged_private_users(
3848                 const ExecContext *context,
3849                 const ExecParameters *params) {
3850
3851         assert(context);
3852         assert(params);
3853
3854         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3855          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3856          * (system manager) then we have privileges and don't need this. */
3857         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3858                 return false;
3859
3860         return context->private_users ||
3861                context->private_tmp ||
3862                context->private_devices ||
3863                context->private_network ||
3864                context->network_namespace_path ||
3865                context->private_ipc ||
3866                context->ipc_namespace_path ||
3867                context->private_mounts > 0 ||
3868                context->mount_apivfs ||
3869                context->n_bind_mounts > 0 ||
3870                context->n_temporary_filesystems > 0 ||
3871                context->root_directory ||
3872                !strv_isempty(context->extension_directories) ||
3873                context->protect_system != PROTECT_SYSTEM_NO ||
3874                context->protect_home != PROTECT_HOME_NO ||
3875                context->protect_kernel_tunables ||
3876                context->protect_kernel_modules ||
3877                context->protect_kernel_logs ||
3878                context->protect_control_groups ||
3879                context->protect_clock ||
3880                context->protect_hostname ||
3881                !strv_isempty(context->read_write_paths) ||
3882                !strv_isempty(context->read_only_paths) ||
3883                !strv_isempty(context->inaccessible_paths) ||
3884                !strv_isempty(context->exec_paths) ||
3885                !strv_isempty(context->no_exec_paths);
3886 }
3887
3888 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3889         assert(context);
3890
3891         if (confirm_spawn_disabled())
3892                 return false;
3893
3894         /* For some reasons units remaining in the same process group
3895          * as PID 1 fail to acquire the console even if it's not used
3896          * by any process. So skip the confirmation question for them. */
3897         return !context->same_pgrp;
3898 }
3899
3900 static int exec_context_named_iofds(
3901                 const ExecContext *c,
3902                 const ExecParameters *p,
3903                 int named_iofds[static 3]) {
3904
3905         size_t targets;
3906         const char* stdio_fdname[3];
3907         size_t n_fds;
3908
3909         assert(c);
3910         assert(p);
3911         assert(named_iofds);
3912
3913         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3914                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3915                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3916
3917         for (size_t i = 0; i < 3; i++)
3918                 stdio_fdname[i] = exec_context_fdname(c, i);
3919
3920         n_fds = p->n_storage_fds + p->n_socket_fds;
3921
3922         for (size_t i = 0; i < n_fds  && targets > 0; i++)
3923                 if (named_iofds[STDIN_FILENO] < 0 &&
3924                     c->std_input == EXEC_INPUT_NAMED_FD &&
3925                     stdio_fdname[STDIN_FILENO] &&
3926                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3927
3928                         named_iofds[STDIN_FILENO] = p->fds[i];
3929                         targets--;
3930
3931                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3932                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3933                            stdio_fdname[STDOUT_FILENO] &&
3934                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3935
3936                         named_iofds[STDOUT_FILENO] = p->fds[i];
3937                         targets--;
3938
3939                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3940                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3941                            stdio_fdname[STDERR_FILENO] &&
3942                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3943
3944                         named_iofds[STDERR_FILENO] = p->fds[i];
3945                         targets--;
3946                 }
3947
3948         return targets == 0 ? 0 : -ENOENT;
3949 }
3950
3951 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3952         if (!shared)
3953                 return;
3954
3955         safe_close_pair(shared->netns_storage_socket);
3956         safe_close_pair(shared->ipcns_storage_socket);
3957 }
3958
3959 static void exec_runtime_close(ExecRuntime *rt) {
3960         if (!rt)
3961                 return;
3962
3963         safe_close_pair(rt->ephemeral_storage_socket);
3964
3965         exec_shared_runtime_close(rt->shared);
3966         dynamic_creds_close(rt->dynamic_creds);
3967 }
3968
3969 static void exec_params_close(ExecParameters *p) {
3970         if (!p)
3971                 return;
3972
3973         p->stdin_fd = safe_close(p->stdin_fd);
3974         p->stdout_fd = safe_close(p->stdout_fd);
3975         p->stderr_fd = safe_close(p->stderr_fd);
3976 }
3977
3978 int exec_invoke(
3979                 const ExecCommand *command,
3980                 const ExecContext *context,
3981                 ExecParameters *params,
3982                 ExecRuntime *runtime,
3983                 const CGroupContext *cgroup_context,
3984                 int *exit_status) {
3985
3986         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3987         int r, ngids = 0;
3988         _cleanup_free_ gid_t *supplementary_gids = NULL;
3989         const char *username = NULL, *groupname = NULL;
3990         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3991         const char *home = NULL, *shell = NULL;
3992         char **final_argv = NULL;
3993         dev_t journal_stream_dev = 0;
3994         ino_t journal_stream_ino = 0;
3995         bool userns_set_up = false;
3996         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3997                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3998                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3999                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4000         bool keep_seccomp_privileges = false;
4001 #if HAVE_SELINUX
4002         _cleanup_free_ char *mac_selinux_context_net = NULL;
4003         bool use_selinux = false;
4004 #endif
4005 #if ENABLE_SMACK
4006         bool use_smack = false;
4007 #endif
4008 #if HAVE_APPARMOR
4009         bool use_apparmor = false;
4010 #endif
4011 #if HAVE_SECCOMP
4012         uint64_t saved_bset = 0;
4013 #endif
4014         uid_t saved_uid = getuid();
4015         gid_t saved_gid = getgid();
4016         uid_t uid = UID_INVALID;
4017         gid_t gid = GID_INVALID;
4018         size_t n_fds, /* fds to pass to the child */
4019                n_keep_fds; /* total number of fds not to close */
4020         int secure_bits;
4021         _cleanup_free_ gid_t *gids_after_pam = NULL;
4022         int ngids_after_pam = 0;
4023
4024         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4025         size_t n_storage_fds, n_socket_fds;
4026
4027         assert(command);
4028         assert(context);
4029         assert(params);
4030         assert(exit_status);
4031
4032         /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4033          * and is already applied earlier. Just for safety. */
4034         if (context->log_level_max >= 0)
4035                 log_set_max_level(context->log_level_max);
4036
4037         /* Explicitly test for CVE-2021-4034 inspired invocations */
4038         if (!command->path || strv_isempty(command->argv)) {
4039                 *exit_status = EXIT_EXEC;
4040                 return log_exec_error_errno(
4041                                 context,
4042                                 params,
4043                                 SYNTHETIC_ERRNO(EINVAL),
4044                                 "Invalid command line arguments.");
4045         }
4046
4047         LOG_CONTEXT_PUSH_EXEC(context, params);
4048
4049         if (context->std_input == EXEC_INPUT_SOCKET ||
4050             context->std_output == EXEC_OUTPUT_SOCKET ||
4051             context->std_error == EXEC_OUTPUT_SOCKET) {
4052
4053                 if (params->n_socket_fds > 1)
4054                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4055
4056                 if (params->n_socket_fds == 0)
4057                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4058
4059                 socket_fd = params->fds[0];
4060                 n_storage_fds = n_socket_fds = 0;
4061         } else {
4062                 n_socket_fds = params->n_socket_fds;
4063                 n_storage_fds = params->n_storage_fds;
4064         }
4065         n_fds = n_socket_fds + n_storage_fds;
4066
4067         r = exec_context_named_iofds(context, params, named_iofds);
4068         if (r < 0)
4069                 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4070
4071         rename_process_from_path(command->path);
4072
4073         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4074          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4075          * both of which will be demoted to SIG_DFL. */
4076         (void) default_signals(SIGNALS_CRASH_HANDLER,
4077                                SIGNALS_IGNORE);
4078
4079         if (context->ignore_sigpipe)
4080                 (void) ignore_signals(SIGPIPE);
4081
4082         r = reset_signal_mask();
4083         if (r < 0) {
4084                 *exit_status = EXIT_SIGNAL_MASK;
4085                 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4086         }
4087
4088         if (params->idle_pipe)
4089                 do_idle_pipe_dance(params->idle_pipe);
4090
4091         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4092          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4093          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4094          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4095
4096         log_forget_fds();
4097         log_set_open_when_needed(true);
4098         log_settle_target();
4099
4100         /* In case anything used libc syslog(), close this here, too */
4101         closelog();
4102
4103         r = collect_open_file_fds(context, params, &n_fds);
4104         if (r < 0) {
4105                 *exit_status = EXIT_FDS;
4106                 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4107         }
4108
4109         int keep_fds[n_fds + 3];
4110         memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4111         n_keep_fds = n_fds;
4112
4113         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4114         if (r < 0) {
4115                 *exit_status = EXIT_FDS;
4116                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4117         }
4118
4119 #if HAVE_LIBBPF
4120         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4121         if (r < 0) {
4122                 *exit_status = EXIT_FDS;
4123                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4124         }
4125 #endif
4126
4127         r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4128         if (r < 0) {
4129                 *exit_status = EXIT_FDS;
4130                 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4131         }
4132
4133         if (!context->same_pgrp &&
4134             setsid() < 0) {
4135                 *exit_status = EXIT_SETSID;
4136                 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4137         }
4138
4139         exec_context_tty_reset(context, params);
4140
4141         if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4142                 _cleanup_free_ char *cmdline = NULL;
4143
4144                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4145                 if (!cmdline) {
4146                         *exit_status = EXIT_MEMORY;
4147                         return log_oom();
4148                 }
4149
4150                 r = ask_for_confirmation(context, params, cmdline);
4151                 if (r != CONFIRM_EXECUTE) {
4152                         if (r == CONFIRM_PRETEND_SUCCESS) {
4153                                 *exit_status = EXIT_SUCCESS;
4154                                 return 0;
4155                         }
4156
4157                         *exit_status = EXIT_CONFIRM;
4158                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4159                                                     "Execution cancelled by the user");
4160                 }
4161         }
4162
4163         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4164          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4165          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4166          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4167          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4168         if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4169             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4170                 *exit_status = EXIT_MEMORY;
4171                 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4172         }
4173
4174         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4175                 _cleanup_strv_free_ char **suggested_paths = NULL;
4176
4177                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4178                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4179                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4180                         *exit_status = EXIT_USER;
4181                         return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4182                 }
4183
4184                 r = compile_suggested_paths(context, params, &suggested_paths);
4185                 if (r < 0) {
4186                         *exit_status = EXIT_MEMORY;
4187                         return log_oom();
4188                 }
4189
4190                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4191                 if (r < 0) {
4192                         *exit_status = EXIT_USER;
4193                         if (r == -EILSEQ)
4194                                 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4195                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4196                         return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4197                 }
4198
4199                 if (!uid_is_valid(uid)) {
4200                         *exit_status = EXIT_USER;
4201                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4202                 }
4203
4204                 if (!gid_is_valid(gid)) {
4205                         *exit_status = EXIT_USER;
4206                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4207                 }
4208
4209                 if (runtime->dynamic_creds->user)
4210                         username = runtime->dynamic_creds->user->name;
4211
4212         } else {
4213                 if (context->user) {
4214                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4215                         if (r < 0) {
4216                                 *exit_status = EXIT_USER;
4217                                 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4218                         }
4219                 }
4220
4221                 if (context->group) {
4222                         r = get_fixed_group(context->group, &groupname, &gid);
4223                         if (r < 0) {
4224                                 *exit_status = EXIT_GROUP;
4225                                 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4226                         }
4227                 }
4228         }
4229
4230         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4231         r = get_supplementary_groups(context, username, groupname, gid,
4232                                      &supplementary_gids, &ngids);
4233         if (r < 0) {
4234                 *exit_status = EXIT_GROUP;
4235                 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4236         }
4237
4238         r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4239         if (r < 0) {
4240                 *exit_status = EXIT_USER;
4241                 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4242         }
4243
4244         params->user_lookup_fd = safe_close(params->user_lookup_fd);
4245
4246         r = acquire_home(context, uid, &home, &home_buffer);
4247         if (r < 0) {
4248                 *exit_status = EXIT_CHDIR;
4249                 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4250         }
4251
4252         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4253         if (socket_fd >= 0)
4254                 (void) fd_nonblock(socket_fd, false);
4255
4256         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4257          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4258         if (params->cgroup_path) {
4259                 _cleanup_free_ char *p = NULL;
4260
4261                 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4262                 if (r < 0) {
4263                         *exit_status = EXIT_CGROUP;
4264                         return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4265                 }
4266
4267                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4268                 if (r == -EUCLEAN) {
4269                         *exit_status = EXIT_CGROUP;
4270                         return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4271                                                     "because the cgroup or one of its parents or "
4272                                                     "siblings is in the threaded mode: %m", p);
4273                 }
4274                 if (r < 0) {
4275                         *exit_status = EXIT_CGROUP;
4276                         return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4277                 }
4278         }
4279
4280         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4281                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4282                 if (r < 0) {
4283                         *exit_status = EXIT_NETWORK;
4284                         return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4285                 }
4286         }
4287
4288         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4289                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4290                 if (r < 0) {
4291                         *exit_status = EXIT_NAMESPACE;
4292                         return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4293                 }
4294         }
4295
4296         r = setup_input(context, params, socket_fd, named_iofds);
4297         if (r < 0) {
4298                 *exit_status = EXIT_STDIN;
4299                 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4300         }
4301
4302         r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4303         if (r < 0) {
4304                 *exit_status = EXIT_STDOUT;
4305                 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4306         }
4307
4308         r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4309         if (r < 0) {
4310                 *exit_status = EXIT_STDERR;
4311                 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4312         }
4313
4314         if (context->oom_score_adjust_set) {
4315                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4316                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4317                 r = set_oom_score_adjust(context->oom_score_adjust);
4318                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4319                         log_exec_debug_errno(context, params, r,
4320                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4321                 else if (r < 0) {
4322                         *exit_status = EXIT_OOM_ADJUST;
4323                         return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4324                 }
4325         }
4326
4327         if (context->coredump_filter_set) {
4328                 r = set_coredump_filter(context->coredump_filter);
4329                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4330                         log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4331                 else if (r < 0) {
4332                         *exit_status = EXIT_LIMITS;
4333                         return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4334                 }
4335         }
4336
4337         if (context->nice_set) {
4338                 r = setpriority_closest(context->nice);
4339                 if (r < 0) {
4340                         *exit_status = EXIT_NICE;
4341                         return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4342                 }
4343         }
4344
4345         if (context->cpu_sched_set) {
4346                 struct sched_param param = {
4347                         .sched_priority = context->cpu_sched_priority,
4348                 };
4349
4350                 r = sched_setscheduler(0,
4351                                        context->cpu_sched_policy |
4352                                        (context->cpu_sched_reset_on_fork ?
4353                                         SCHED_RESET_ON_FORK : 0),
4354                                        &param);
4355                 if (r < 0) {
4356                         *exit_status = EXIT_SETSCHEDULER;
4357                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4358                 }
4359         }
4360
4361         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4362                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4363                 const CPUSet *cpu_set;
4364
4365                 if (context->cpu_affinity_from_numa) {
4366                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4367                         if (r < 0) {
4368                                 *exit_status = EXIT_CPUAFFINITY;
4369                                 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4370                         }
4371
4372                         cpu_set = &converted_cpu_set;
4373                 } else
4374                         cpu_set = &context->cpu_set;
4375
4376                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4377                         *exit_status = EXIT_CPUAFFINITY;
4378                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4379                 }
4380         }
4381
4382         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4383                 r = apply_numa_policy(&context->numa_policy);
4384                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4385                         log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4386                 else if (r < 0) {
4387                         *exit_status = EXIT_NUMA_POLICY;
4388                         return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4389                 }
4390         }
4391
4392         if (context->ioprio_set)
4393                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4394                         *exit_status = EXIT_IOPRIO;
4395                         return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4396                 }
4397
4398         if (context->timer_slack_nsec != NSEC_INFINITY)
4399                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4400                         *exit_status = EXIT_TIMERSLACK;
4401                         return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4402                 }
4403
4404         if (context->personality != PERSONALITY_INVALID) {
4405                 r = safe_personality(context->personality);
4406                 if (r < 0) {
4407                         *exit_status = EXIT_PERSONALITY;
4408                         return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4409                 }
4410         }
4411
4412 #if ENABLE_UTMP
4413         if (context->utmp_id) {
4414                 _cleanup_free_ char *username_alloc = NULL;
4415
4416                 if (!username && context->utmp_mode == EXEC_UTMP_USER) {
4417                         username_alloc = uid_to_name(uid_is_valid(uid) ? uid : saved_uid);
4418                         if (!username_alloc) {
4419                                 *exit_status = EXIT_USER;
4420                                 return log_oom();
4421                         }
4422                 }
4423
4424                 const char *line = context->tty_path ?
4425                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4426                         NULL;
4427                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4428                                       line,
4429                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4430                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4431                                       USER_PROCESS,
4432                                       username ?: username_alloc);
4433         }
4434 #endif
4435
4436         if (uid_is_valid(uid)) {
4437                 r = chown_terminal(STDIN_FILENO, uid);
4438                 if (r < 0) {
4439                         *exit_status = EXIT_STDIN;
4440                         return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4441                 }
4442         }
4443
4444         if (params->cgroup_path) {
4445                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4446                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4447                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4448                  * touch a single hierarchy too. */
4449
4450                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4451                         _cleanup_free_ char *p = NULL;
4452
4453                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4454                         if (r < 0) {
4455                                 *exit_status = EXIT_CGROUP;
4456                                 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4457                         }
4458
4459                         r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4460                         if (r < 0) {
4461                                 *exit_status = EXIT_CGROUP;
4462                                 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4463                         }
4464                         if (r > 0) {
4465                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4466                                 if (r < 0) {
4467                                         *exit_status = EXIT_CGROUP;
4468                                         return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4469                                 }
4470                         }
4471                 }
4472
4473                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4474                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4475                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4476                                 if (r < 0) {
4477                                         *exit_status = EXIT_MEMORY;
4478                                         return log_oom();
4479                                 }
4480
4481                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4482                                 if (r < 0) {
4483                                         log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4484                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4485                                         memory_pressure_path = mfree(memory_pressure_path);
4486                                 }
4487                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4488                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4489                                 if (!memory_pressure_path) {
4490                                         *exit_status = EXIT_MEMORY;
4491                                         return log_oom();
4492                                 }
4493                         }
4494                 }
4495         }
4496
4497         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4498
4499         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4500                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4501                 if (r < 0)
4502                         return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4503         }
4504
4505         r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4506         if (r < 0) {
4507                 *exit_status = EXIT_CREDENTIALS;
4508                 return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4509         }
4510
4511         r = build_environment(
4512                         context,
4513                         params,
4514                         cgroup_context,
4515                         n_fds,
4516                         home,
4517                         username,
4518                         shell,
4519                         journal_stream_dev,
4520                         journal_stream_ino,
4521                         memory_pressure_path,
4522                         &our_env);
4523         if (r < 0) {
4524                 *exit_status = EXIT_MEMORY;
4525                 return log_oom();
4526         }
4527
4528         r = build_pass_environment(context, &pass_env);
4529         if (r < 0) {
4530                 *exit_status = EXIT_MEMORY;
4531                 return log_oom();
4532         }
4533
4534         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4535          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4536          * not specify PATH but the unit has ExecSearchPath. */
4537         if (!strv_isempty(context->exec_search_path)) {
4538                 _cleanup_free_ char *joined = NULL;
4539
4540                 joined = strv_join(context->exec_search_path, ":");
4541                 if (!joined) {
4542                         *exit_status = EXIT_MEMORY;
4543                         return log_oom();
4544                 }
4545
4546                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4547                 if (r < 0) {
4548                         *exit_status = EXIT_MEMORY;
4549                         return log_oom();
4550                 }
4551         }
4552
4553         accum_env = strv_env_merge(params->environment,
4554                                    our_env,
4555                                    joined_exec_search_path,
4556                                    pass_env,
4557                                    context->environment,
4558                                    params->files_env);
4559         if (!accum_env) {
4560                 *exit_status = EXIT_MEMORY;
4561                 return log_oom();
4562         }
4563         accum_env = strv_env_clean(accum_env);
4564
4565         (void) umask(context->umask);
4566
4567         r = setup_keyring(context, params, uid, gid);
4568         if (r < 0) {
4569                 *exit_status = EXIT_KEYRING;
4570                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4571         }
4572
4573         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4574          * from it. */
4575         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4576
4577         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4578          * for it, and the kernel doesn't actually support ambient caps. */
4579         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4580
4581         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4582          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4583          * desired. */
4584         if (needs_ambient_hack)
4585                 needs_setuid = false;
4586         else
4587                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4588
4589         uint64_t capability_ambient_set = context->capability_ambient_set;
4590
4591         if (needs_sandboxing) {
4592                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4593                  * /sys being present. The actual MAC context application will happen later, as late as
4594                  * possible, to avoid impacting our own code paths. */
4595
4596 #if HAVE_SELINUX
4597                 use_selinux = mac_selinux_use();
4598 #endif
4599 #if ENABLE_SMACK
4600                 use_smack = mac_smack_use();
4601 #endif
4602 #if HAVE_APPARMOR
4603                 use_apparmor = mac_apparmor_use();
4604 #endif
4605         }
4606
4607         if (needs_sandboxing) {
4608                 int which_failed;
4609
4610                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4611                  * is set here. (See below.) */
4612
4613                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4614                 if (r < 0) {
4615                         *exit_status = EXIT_LIMITS;
4616                         return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4617                 }
4618         }
4619
4620         if (needs_setuid && context->pam_name && username) {
4621                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4622                  * wins here. (See above.) */
4623
4624                 /* All fds passed in the fds array will be closed in the pam child process. */
4625                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4626                 if (r < 0) {
4627                         *exit_status = EXIT_PAM;
4628                         return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4629                 }
4630
4631                 if (ambient_capabilities_supported()) {
4632                         uint64_t ambient_after_pam;
4633
4634                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4635                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4636                         r = capability_get_ambient(&ambient_after_pam);
4637                         if (r < 0) {
4638                                 *exit_status = EXIT_CAPABILITIES;
4639                                 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4640                         }
4641
4642                         capability_ambient_set |= ambient_after_pam;
4643                 }
4644
4645                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4646                 if (ngids_after_pam < 0) {
4647                         *exit_status = EXIT_GROUP;
4648                         return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4649                 }
4650         }
4651
4652         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4653                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4654                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4655                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4656
4657                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4658                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4659                  * the actual requested operations fail (or silently continue). */
4660                 if (r < 0 && context->private_users) {
4661                         *exit_status = EXIT_USER;
4662                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4663                 }
4664                 if (r < 0)
4665                         log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4666                 else
4667                         userns_set_up = true;
4668         }
4669
4670         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4671
4672                 /* Try to enable network namespacing if network namespacing is available and we have
4673                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4674                  * new network namespace. And if we don't have that, then we could only create a network
4675                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4676                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4677                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4678                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4679                                 log_exec_notice_errno(context, params, r,
4680                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4681                         else if (r < 0) {
4682                                 *exit_status = EXIT_NETWORK;
4683                                 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4684                         }
4685                 } else if (context->network_namespace_path) {
4686                         *exit_status = EXIT_NETWORK;
4687                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4688                                                     "NetworkNamespacePath= is not supported, refusing.");
4689                 } else
4690                         log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4691         }
4692
4693         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4694
4695                 if (ns_type_supported(NAMESPACE_IPC)) {
4696                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4697                         if (r == -EPERM)
4698                                 log_exec_warning_errno(context, params, r,
4699                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4700                         else if (r < 0) {
4701                                 *exit_status = EXIT_NAMESPACE;
4702                                 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4703                         }
4704                 } else if (context->ipc_namespace_path) {
4705                         *exit_status = EXIT_NAMESPACE;
4706                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4707                                                     "IPCNamespacePath= is not supported, refusing.");
4708                 } else
4709                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4710         }
4711
4712         if (needs_mount_namespace) {
4713                 _cleanup_free_ char *error_path = NULL;
4714
4715                 r = apply_mount_namespace(command->flags,
4716                                           context,
4717                                           params,
4718                                           runtime,
4719                                           memory_pressure_path,
4720                                           needs_sandboxing,
4721                                           &error_path);
4722                 if (r < 0) {
4723                         *exit_status = EXIT_NAMESPACE;
4724                         return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4725                                                     error_path ? ": " : "", strempty(error_path));
4726                 }
4727         }
4728
4729         if (needs_sandboxing) {
4730                 r = apply_protect_hostname(context, params, exit_status);
4731                 if (r < 0)
4732                         return r;
4733         }
4734
4735         if (context->memory_ksm >= 0)
4736                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4737                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4738                                 log_exec_debug_errno(context,
4739                                                      params,
4740                                                      errno,
4741                                                      "KSM support not available, ignoring.");
4742                         else {
4743                                 *exit_status = EXIT_KSM;
4744                                 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4745                         }
4746                 }
4747
4748         /* Drop groups as early as possible.
4749          * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4750          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4751         if (needs_setuid) {
4752                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4753                 int ngids_to_enforce = 0;
4754
4755                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4756                                                    ngids,
4757                                                    gids_after_pam,
4758                                                    ngids_after_pam,
4759                                                    &gids_to_enforce);
4760                 if (ngids_to_enforce < 0) {
4761                         *exit_status = EXIT_GROUP;
4762                         return log_exec_error_errno(context, params,
4763                                                     ngids_to_enforce,
4764                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4765                 }
4766
4767                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4768                 if (r < 0) {
4769                         *exit_status = EXIT_GROUP;
4770                         return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4771                 }
4772         }
4773
4774         /* If the user namespace was not set up above, try to do it now.
4775          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4776          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4777          * case of mount namespaces being less privileged when the mount point list is copied from a
4778          * different user namespace). */
4779
4780         if (needs_sandboxing && context->private_users && !userns_set_up) {
4781                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4782                 if (r < 0) {
4783                         *exit_status = EXIT_USER;
4784                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4785                 }
4786         }
4787
4788         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4789          * shall execute. */
4790
4791         _cleanup_free_ char *executable = NULL;
4792         _cleanup_close_ int executable_fd = -EBADF;
4793         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4794         if (r < 0) {
4795                 *exit_status = EXIT_EXEC;
4796                 log_exec_struct_errno(context, params, LOG_NOTICE, r,
4797                                       "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4798                                       LOG_EXEC_MESSAGE(params,
4799                                                        "Unable to locate executable '%s': %m",
4800                                                        command->path),
4801                                       "EXECUTABLE=%s", command->path);
4802                 /* If the error will be ignored by manager, tune down the log level here. Missing executable
4803                  * is very much expected in this case. */
4804                 return r != -ENOMEM && FLAGS_SET(command->flags, EXEC_COMMAND_IGNORE_FAILURE) ? 1 : r;
4805         }
4806
4807         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4808         if (r < 0) {
4809                 *exit_status = EXIT_FDS;
4810                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4811         }
4812
4813 #if HAVE_SELINUX
4814         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4815                 int fd = -EBADF;
4816
4817                 if (socket_fd >= 0)
4818                         fd = socket_fd;
4819                 else if (params->n_socket_fds == 1)
4820                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4821                          * use context from that fd to compute the label. */
4822                         fd = params->fds[0];
4823
4824                 if (fd >= 0) {
4825                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4826                         if (r < 0) {
4827                                 if (!context->selinux_context_ignore) {
4828                                         *exit_status = EXIT_SELINUX_CONTEXT;
4829                                         return log_exec_error_errno(context,
4830                                                                     params,
4831                                                                     r,
4832                                                                     "Failed to determine SELinux context: %m");
4833                                 }
4834                                 log_exec_debug_errno(context,
4835                                                      params,
4836                                                      r,
4837                                                      "Failed to determine SELinux context, ignoring: %m");
4838                         }
4839                 }
4840         }
4841 #endif
4842
4843         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4844          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4845          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4846          * execve(). But first, close the remaining sockets in the context objects. */
4847
4848         exec_runtime_close(runtime);
4849         exec_params_close(params);
4850
4851         r = close_all_fds(keep_fds, n_keep_fds);
4852         if (r >= 0)
4853                 r = pack_fds(params->fds, n_fds);
4854         if (r >= 0)
4855                 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4856         if (r < 0) {
4857                 *exit_status = EXIT_FDS;
4858                 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4859         }
4860
4861         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4862          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4863          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4864          * came this far. */
4865
4866         secure_bits = context->secure_bits;
4867
4868         if (needs_sandboxing) {
4869                 uint64_t bset;
4870
4871                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4872                  * (Note this is placed after the general resource limit initialization, see above, in order
4873                  * to take precedence.) */
4874                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4875                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4876                                 *exit_status = EXIT_LIMITS;
4877                                 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4878                         }
4879                 }
4880
4881 #if ENABLE_SMACK
4882                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4883                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4884                 if (use_smack && context->smack_process_label) {
4885                         r = setup_smack(params, context, executable_fd);
4886                         if (r < 0 && !context->smack_process_label_ignore) {
4887                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4888                                 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4889                         }
4890                 }
4891 #endif
4892
4893                 bset = context->capability_bounding_set;
4894                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4895                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4896                  * instead of us doing that */
4897                 if (needs_ambient_hack)
4898                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4899                                 (UINT64_C(1) << CAP_SETUID) |
4900                                 (UINT64_C(1) << CAP_SETGID);
4901
4902 #if HAVE_SECCOMP
4903                 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4904                  * keep the needed privileges to apply it even if we're not root. */
4905                 if (needs_setuid &&
4906                     uid_is_valid(uid) &&
4907                     context_has_seccomp(context) &&
4908                     seccomp_allows_drop_privileges(context)) {
4909                         keep_seccomp_privileges = true;
4910
4911                         if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4912                                 *exit_status = EXIT_USER;
4913                                 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4914                         }
4915
4916                         /* Save the current bounding set so we can restore it after applying the seccomp
4917                          * filter */
4918                         saved_bset = bset;
4919                         bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4920                                 (UINT64_C(1) << CAP_SETPCAP);
4921                 }
4922 #endif
4923
4924                 if (!cap_test_all(bset)) {
4925                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4926                         if (r < 0) {
4927                                 *exit_status = EXIT_CAPABILITIES;
4928                                 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4929                         }
4930                 }
4931
4932                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4933                  * keep-caps set.
4934                  *
4935                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4936                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4937                  * the ambient capabilities can be raised as they are present in the permitted and
4938                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4939                  * without changing the user, so we also set the ambient capabilities here.
4940                  *
4941                  * The requested ambient capabilities are raised in the inheritable set if the second
4942                  * argument is true. */
4943                 if (!needs_ambient_hack) {
4944                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4945                         if (r < 0) {
4946                                 *exit_status = EXIT_CAPABILITIES;
4947                                 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4948                         }
4949                 }
4950         }
4951
4952         /* chroot to root directory first, before we lose the ability to chroot */
4953         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4954         if (r < 0)
4955                 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
4956
4957         if (needs_setuid) {
4958                 if (uid_is_valid(uid)) {
4959                         r = enforce_user(context, uid, capability_ambient_set);
4960                         if (r < 0) {
4961                                 *exit_status = EXIT_USER;
4962                                 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
4963                         }
4964
4965                         if (keep_seccomp_privileges) {
4966                                 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
4967                                         r = drop_capability(CAP_SETUID);
4968                                         if (r < 0) {
4969                                                 *exit_status = EXIT_USER;
4970                                                 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
4971                                         }
4972                                 }
4973
4974                                 r = keep_capability(CAP_SYS_ADMIN);
4975                                 if (r < 0) {
4976                                         *exit_status = EXIT_USER;
4977                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
4978                                 }
4979
4980                                 r = keep_capability(CAP_SETPCAP);
4981                                 if (r < 0) {
4982                                         *exit_status = EXIT_USER;
4983                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
4984                                 }
4985                         }
4986
4987                         if (!needs_ambient_hack && capability_ambient_set != 0) {
4988
4989                                 /* Raise the ambient capabilities after user change. */
4990                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4991                                 if (r < 0) {
4992                                         *exit_status = EXIT_CAPABILITIES;
4993                                         return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
4994                                 }
4995                         }
4996                 }
4997         }
4998
4999         /* Apply working directory here, because the working directory might be on NFS and only the user
5000          * running this service might have the correct privilege to change to the working directory. Also, it
5001          * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5002          * the cwd cannot be used to pin directories outside of the sandbox. */
5003         r = apply_working_directory(context, params, runtime, home, exit_status);
5004         if (r < 0)
5005                 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5006
5007         if (needs_sandboxing) {
5008                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5009                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5010                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5011                  * are restricted. */
5012
5013 #if HAVE_SELINUX
5014                 if (use_selinux) {
5015                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5016
5017                         if (exec_context) {
5018                                 r = setexeccon(exec_context);
5019                                 if (r < 0) {
5020                                         if (!context->selinux_context_ignore) {
5021                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5022                                                 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5023                                         }
5024                                         log_exec_debug_errno(context,
5025                                                              params,
5026                                                              r,
5027                                                              "Failed to change SELinux context to %s, ignoring: %m",
5028                                                              exec_context);
5029                                 }
5030                         }
5031                 }
5032 #endif
5033
5034 #if HAVE_APPARMOR
5035                 if (use_apparmor && context->apparmor_profile) {
5036                         r = aa_change_onexec(context->apparmor_profile);
5037                         if (r < 0 && !context->apparmor_profile_ignore) {
5038                                 *exit_status = EXIT_APPARMOR_PROFILE;
5039                                 return log_exec_error_errno(context,
5040                                                             params,
5041                                                             errno,
5042                                                             "Failed to prepare AppArmor profile change to %s: %m",
5043                                                             context->apparmor_profile);
5044                         }
5045                 }
5046 #endif
5047
5048                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5049                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5050                  * requires CAP_SETPCAP. */
5051                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5052                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5053                          * effective set here.
5054                          *
5055                          * The effective set is overwritten during execve() with the following values:
5056                          *
5057                          * - ambient set (for non-root processes)
5058                          *
5059                          * - (inheritable | bounding) set (for root processes)
5060                          *
5061                          * Hence there is no security impact to raise it in the effective set before execve
5062                          */
5063                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5064                         if (r < 0) {
5065                                 *exit_status = EXIT_CAPABILITIES;
5066                                 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5067                         }
5068                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5069                                 *exit_status = EXIT_SECUREBITS;
5070                                 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5071                         }
5072                 }
5073
5074                 if (context_has_no_new_privileges(context))
5075                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5076                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5077                                 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5078                         }
5079
5080 #if HAVE_SECCOMP
5081                 r = apply_address_families(context, params);
5082                 if (r < 0) {
5083                         *exit_status = EXIT_ADDRESS_FAMILIES;
5084                         return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5085                 }
5086
5087                 r = apply_memory_deny_write_execute(context, params);
5088                 if (r < 0) {
5089                         *exit_status = EXIT_SECCOMP;
5090                         return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5091                 }
5092
5093                 r = apply_restrict_realtime(context, params);
5094                 if (r < 0) {
5095                         *exit_status = EXIT_SECCOMP;
5096                         return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5097                 }
5098
5099                 r = apply_restrict_suid_sgid(context, params);
5100                 if (r < 0) {
5101                         *exit_status = EXIT_SECCOMP;
5102                         return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5103                 }
5104
5105                 r = apply_restrict_namespaces(context, params);
5106                 if (r < 0) {
5107                         *exit_status = EXIT_SECCOMP;
5108                         return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5109                 }
5110
5111                 r = apply_protect_sysctl(context, params);
5112                 if (r < 0) {
5113                         *exit_status = EXIT_SECCOMP;
5114                         return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5115                 }
5116
5117                 r = apply_protect_kernel_modules(context, params);
5118                 if (r < 0) {
5119                         *exit_status = EXIT_SECCOMP;
5120                         return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5121                 }
5122
5123                 r = apply_protect_kernel_logs(context, params);
5124                 if (r < 0) {
5125                         *exit_status = EXIT_SECCOMP;
5126                         return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5127                 }
5128
5129                 r = apply_protect_clock(context, params);
5130                 if (r < 0) {
5131                         *exit_status = EXIT_SECCOMP;
5132                         return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5133                 }
5134
5135                 r = apply_private_devices(context, params);
5136                 if (r < 0) {
5137                         *exit_status = EXIT_SECCOMP;
5138                         return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5139                 }
5140
5141                 r = apply_syscall_archs(context, params);
5142                 if (r < 0) {
5143                         *exit_status = EXIT_SECCOMP;
5144                         return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5145                 }
5146
5147                 r = apply_lock_personality(context, params);
5148                 if (r < 0) {
5149                         *exit_status = EXIT_SECCOMP;
5150                         return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5151                 }
5152
5153                 r = apply_syscall_log(context, params);
5154                 if (r < 0) {
5155                         *exit_status = EXIT_SECCOMP;
5156                         return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5157                 }
5158 #endif
5159
5160 #if HAVE_LIBBPF
5161                 r = apply_restrict_filesystems(context, params);
5162                 if (r < 0) {
5163                         *exit_status = EXIT_BPF;
5164                         return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5165                 }
5166 #endif
5167
5168 #if HAVE_SECCOMP
5169                 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5170                  * by the filter as little as possible. */
5171                 r = apply_syscall_filter(context, params, needs_ambient_hack);
5172                 if (r < 0) {
5173                         *exit_status = EXIT_SECCOMP;
5174                         return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5175                 }
5176
5177                 if (keep_seccomp_privileges) {
5178                         /* Restore the capability bounding set with what's expected from the service + the
5179                          * ambient capabilities hack */
5180                         if (!cap_test_all(saved_bset)) {
5181                                 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5182                                 if (r < 0) {
5183                                         *exit_status = EXIT_CAPABILITIES;
5184                                         return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5185                                 }
5186                         }
5187
5188                         /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5189                          * applications that use it. */
5190                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5191                                 r = drop_capability(CAP_SYS_ADMIN);
5192                                 if (r < 0) {
5193                                         *exit_status = EXIT_USER;
5194                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5195                                 }
5196                         }
5197
5198                         /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5199                          * applications that use it. */
5200                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5201                                 r = drop_capability(CAP_SETPCAP);
5202                                 if (r < 0) {
5203                                         *exit_status = EXIT_USER;
5204                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5205                                 }
5206                         }
5207
5208                         if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5209                                 *exit_status = EXIT_USER;
5210                                 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5211                         }
5212                 }
5213 #endif
5214
5215         }
5216
5217         if (!strv_isempty(context->unset_environment)) {
5218                 char **ee = NULL;
5219
5220                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5221                 if (!ee) {
5222                         *exit_status = EXIT_MEMORY;
5223                         return log_oom();
5224                 }
5225
5226                 strv_free_and_replace(accum_env, ee);
5227         }
5228
5229         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5230                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5231
5232                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5233                 if (r < 0) {
5234                         *exit_status = EXIT_MEMORY;
5235                         return log_exec_error_errno(context,
5236                                                     params,
5237                                                     r,
5238                                                     "Failed to replace environment variables: %m");
5239                 }
5240                 final_argv = replaced_argv;
5241
5242                 if (!strv_isempty(unset_variables)) {
5243                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5244                         log_exec_warning(context,
5245                                          params,
5246                                          "Referenced but unset environment variable evaluates to an empty string: %s",
5247                                          strna(ju));
5248                 }
5249
5250                 if (!strv_isempty(bad_variables)) {
5251                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5252                         log_exec_warning(context,
5253                                          params,
5254                                          "Invalid environment variable name evaluates to an empty string: %s",
5255                                          strna(jb));
5256                 }
5257         } else
5258                 final_argv = command->argv;
5259
5260         log_command_line(context, params, "Executing", executable, final_argv);
5261
5262         if (params->exec_fd >= 0) {
5263                 uint8_t hot = 1;
5264
5265                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5266                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5267
5268                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5269                         *exit_status = EXIT_EXEC;
5270                         return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5271                 }
5272         }
5273
5274         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5275
5276         if (params->exec_fd >= 0) {
5277                 uint8_t hot = 0;
5278
5279                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5280                  * that POLLHUP on it no longer means execve() succeeded. */
5281
5282                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5283                         *exit_status = EXIT_EXEC;
5284                         return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5285                 }
5286         }
5287
5288         *exit_status = EXIT_EXEC;
5289         return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5290 }