src/core/exec-invoke.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/eventfd.h>
   4 #include <sys/ioctl.h>
   5 #include <sys/mount.h>
   6 #include <sys/prctl.h>
   7
   8 #if HAVE_PAM
   9 #include <security/pam_appl.h>
  10 #include <security/pam_misc.h>
  11 #endif
  12
  13 #if HAVE_APPARMOR
  14 #include <sys/apparmor.h>
  15 #endif
  16
  17 #include "sd-messages.h"
  18
  19 #if HAVE_APPARMOR
  20 #include "apparmor-util.h"
  21 #endif
  22 #include "argv-util.h"
  23 #include "barrier.h"
  24 #include "bpf-dlopen.h"
  25 #include "bpf-restrict-fs.h"
  26 #include "btrfs-util.h"
  27 #include "capability-util.h"
  28 #include "cgroup-setup.h"
  29 #include "chase.h"
  30 #include "chattr-util.h"
  31 #include "chown-recursive.h"
  32 #include "copy.h"
  33 #include "data-fd-util.h"
  34 #include "env-util.h"
  35 #include "escape.h"
  36 #include "exec-credential.h"
  37 #include "exec-invoke.h"
  38 #include "execute.h"
  39 #include "exit-status.h"
  40 #include "fd-util.h"
  41 #include "hexdecoct.h"
  42 #include "io-util.h"
  43 #include "iovec-util.h"
  44 #include "missing_ioprio.h"
  45 #include "missing_prctl.h"
  46 #include "missing_securebits.h"
  47 #include "missing_syscall.h"
  48 #include "mkdir-label.h"
  49 #include "proc-cmdline.h"
  50 #include "process-util.h"
  51 #include "psi-util.h"
  52 #include "rlimit-util.h"
  53 #include "seccomp-util.h"
  54 #include "selinux-util.h"
  55 #include "signal-util.h"
  56 #include "smack-util.h"
  57 #include "socket-util.h"
  58 #include "string-table.h"
  59 #include "strv.h"
  60 #include "terminal-util.h"
  61 #include "utmp-wtmp.h"
  62 #include "vpick.h"
  63
/* Two timeouts of 5s and 1s, expressed via USEC_PER_SEC. NOTE(review): their users are outside this
 * excerpt; presumably they bound the wait on the idle pipe — confirm against the call sites. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
  68
  69 static int shift_fds(int fds[], size_t n_fds) {
  70         if (n_fds <= 0)
  71                 return 0;
  72
  73         /* Modifies the fds array! (sorts it) */
  74
  75         assert(fds);
  76
  77         for (int start = 0;;) {
  78                 int restart_from = -1;
  79
  80                 for (int i = start; i < (int) n_fds; i++) {
  81                         int nfd;
  82
  83                         /* Already at right index? */
  84                         if (fds[i] == i+3)
  85                                 continue;
  86
  87                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
  88                         if (nfd < 0)
  89                                 return -errno;
  90
  91                         safe_close(fds[i]);
  92                         fds[i] = nfd;
  93
  94                         /* Hmm, the fd we wanted isn't free? Then
  95                          * let's remember that and try again from here */
  96                         if (nfd != i+3 && restart_from < 0)
  97                                 restart_from = i;
  98                 }
  99
 100                 if (restart_from < 0)
 101                         break;
 102
 103                 start = restart_from;
 104         }
 105
 106         return 0;
 107 }
 108
 109 static int flag_fds(
 110                 const int fds[],
 111                 size_t n_socket_fds,
 112                 size_t n_fds,
 113                 bool nonblock) {
 114
 115         int r;
 116
 117         assert(fds || n_fds == 0);
 118
 119         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 120          * O_NONBLOCK only applies to socket activation though. */
 121
 122         for (size_t i = 0; i < n_fds; i++) {
 123
 124                 if (i < n_socket_fds) {
 125                         r = fd_nonblock(fds[i], nonblock);
 126                         if (r < 0)
 127                                 return r;
 128                 }
 129
 130                 /* We unconditionally drop FD_CLOEXEC from the fds,
 131                  * since after all we want to pass these fds to our
 132                  * children */
 133
 134                 r = fd_cloexec(fds[i], false);
 135                 if (r < 0)
 136                         return r;
 137         }
 138
 139         return 0;
 140 }
 141
 142 static bool is_terminal_input(ExecInput i) {
 143         return IN_SET(i,
 144                       EXEC_INPUT_TTY,
 145                       EXEC_INPUT_TTY_FORCE,
 146                       EXEC_INPUT_TTY_FAIL);
 147 }
 148
 149 static bool is_terminal_output(ExecOutput o) {
 150         return IN_SET(o,
 151                       EXEC_OUTPUT_TTY,
 152                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 153                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 154 }
 155
 156 static bool is_kmsg_output(ExecOutput o) {
 157         return IN_SET(o,
 158                       EXEC_OUTPUT_KMSG,
 159                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 160 }
 161
 162 static bool exec_context_needs_term(const ExecContext *c) {
 163         assert(c);
 164
 165         /* Return true if the execution context suggests we should set $TERM to something useful. */
 166
 167         if (is_terminal_input(c->std_input))
 168                 return true;
 169
 170         if (is_terminal_output(c->std_output))
 171                 return true;
 172
 173         if (is_terminal_output(c->std_error))
 174                 return true;
 175
 176         return !!c->tty_path;
 177 }
 178
 179 static int open_null_as(int flags, int nfd) {
 180         int fd;
 181
 182         assert(nfd >= 0);
 183
 184         fd = open("/dev/null", flags|O_NOCTTY);
 185         if (fd < 0)
 186                 return -errno;
 187
 188         return move_fd(fd, nfd, false);
 189 }
 190
 191 static int connect_journal_socket(
 192                 int fd,
 193                 const char *log_namespace,
 194                 uid_t uid,
 195                 gid_t gid) {
 196
 197         uid_t olduid = UID_INVALID;
 198         gid_t oldgid = GID_INVALID;
 199         const char *j;
 200         int r;
 201
 202         j = log_namespace ?
 203                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 204                 "/run/systemd/journal/stdout";
 205
 206         if (gid_is_valid(gid)) {
 207                 oldgid = getgid();
 208
 209                 if (setegid(gid) < 0)
 210                         return -errno;
 211         }
 212
 213         if (uid_is_valid(uid)) {
 214                 olduid = getuid();
 215
 216                 if (seteuid(uid) < 0) {
 217                         r = -errno;
 218                         goto restore_gid;
 219                 }
 220         }
 221
 222         r = connect_unix_path(fd, AT_FDCWD, j);
 223
 224         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 225            an LSM interferes. */
 226
 227         if (uid_is_valid(uid))
 228                 (void) seteuid(olduid);
 229
 230  restore_gid:
 231         if (gid_is_valid(gid))
 232                 (void) setegid(oldgid);
 233
 234         return r;
 235 }
 236
 237 static int connect_logger_as(
 238                 const ExecContext *context,
 239                 const ExecParameters *params,
 240                 ExecOutput output,
 241                 const char *ident,
 242                 int nfd,
 243                 uid_t uid,
 244                 gid_t gid) {
 245
 246         _cleanup_close_ int fd = -EBADF;
 247         int r;
 248
 249         assert(context);
 250         assert(params);
 251         assert(output < _EXEC_OUTPUT_MAX);
 252         assert(ident);
 253         assert(nfd >= 0);
 254
 255         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 256         if (fd < 0)
 257                 return -errno;
 258
 259         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 260         if (r < 0)
 261                 return r;
 262
 263         if (shutdown(fd, SHUT_RD) < 0)
 264                 return -errno;
 265
 266         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 267
 268         if (dprintf(fd,
 269                 "%s\n"
 270                 "%s\n"
 271                 "%i\n"
 272                 "%i\n"
 273                 "%i\n"
 274                 "%i\n"
 275                 "%i\n",
 276                 context->syslog_identifier ?: ident,
 277                 params->flags & EXEC_PASS_LOG_UNIT ? params->unit_id : "",
 278                 context->syslog_priority,
 279                 !!context->syslog_level_prefix,
 280                 false,
 281                 is_kmsg_output(output),
 282                 is_terminal_output(output)) < 0)
 283                 return -errno;
 284
 285         return move_fd(TAKE_FD(fd), nfd, false);
 286 }
 287
 288 static int open_terminal_as(const char *path, int flags, int nfd) {
 289         int fd;
 290
 291         assert(path);
 292         assert(nfd >= 0);
 293
 294         fd = open_terminal(path, flags | O_NOCTTY);
 295         if (fd < 0)
 296                 return fd;
 297
 298         return move_fd(fd, nfd, false);
 299 }
 300
 301 static int acquire_path(const char *path, int flags, mode_t mode) {
 302         _cleanup_close_ int fd = -EBADF;
 303         int r;
 304
 305         assert(path);
 306
 307         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 308                 flags |= O_CREAT;
 309
 310         fd = open(path, flags|O_NOCTTY, mode);
 311         if (fd >= 0)
 312                 return TAKE_FD(fd);
 313
 314         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 315                 return -errno;
 316
 317         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 318
 319         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 320         if (fd < 0)
 321                 return -errno;
 322
 323         r = connect_unix_path(fd, AT_FDCWD, path);
 324         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 325                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 326                  * wasn't an AF_UNIX socket after all */
 327                 return -ENXIO;
 328         if (r < 0)
 329                 return r;
 330
 331         if ((flags & O_ACCMODE) == O_RDONLY)
 332                 r = shutdown(fd, SHUT_WR);
 333         else if ((flags & O_ACCMODE) == O_WRONLY)
 334                 r = shutdown(fd, SHUT_RD);
 335         else
 336                 r = 0;
 337         if (r < 0)
 338                 return -errno;
 339
 340         return TAKE_FD(fd);
 341 }
 342
 343 static int fixup_input(
 344                 const ExecContext *context,
 345                 int socket_fd,
 346                 bool apply_tty_stdin) {
 347
 348         ExecInput std_input;
 349
 350         assert(context);
 351
 352         std_input = context->std_input;
 353
 354         if (is_terminal_input(std_input) && !apply_tty_stdin)
 355                 return EXEC_INPUT_NULL;
 356
 357         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 358                 return EXEC_INPUT_NULL;
 359
 360         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 361                 return EXEC_INPUT_NULL;
 362
 363         return std_input;
 364 }
 365
 366 static int fixup_output(ExecOutput output, int socket_fd) {
 367
 368         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 369                 return EXEC_OUTPUT_INHERIT;
 370
 371         return output;
 372 }
 373
 374 static int setup_input(
 375                 const ExecContext *context,
 376                 const ExecParameters *params,
 377                 int socket_fd,
 378                 const int named_iofds[static 3]) {
 379
 380         ExecInput i;
 381         int r;
 382
 383         assert(context);
 384         assert(params);
 385         assert(named_iofds);
 386
 387         if (params->stdin_fd >= 0) {
 388                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 389                         return -errno;
 390
 391                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 392                 if (isatty(STDIN_FILENO)) {
 393                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 394
 395                         if (context->tty_reset)
 396                                 (void) reset_terminal_fd(STDIN_FILENO, /* switch_to_text= */ true);
 397
 398                         (void) exec_context_apply_tty_size(context, STDIN_FILENO, /* tty_path= */ NULL);
 399                 }
 400
 401                 return STDIN_FILENO;
 402         }
 403
 404         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 405
 406         switch (i) {
 407
 408         case EXEC_INPUT_NULL:
 409                 return open_null_as(O_RDONLY, STDIN_FILENO);
 410
 411         case EXEC_INPUT_TTY:
 412         case EXEC_INPUT_TTY_FORCE:
 413         case EXEC_INPUT_TTY_FAIL: {
 414                 _cleanup_close_ int tty_fd = -EBADF;
 415                 const char *tty_path;
 416
 417                 tty_path = ASSERT_PTR(exec_context_tty_path(context));
 418
 419                 tty_fd = acquire_terminal(tty_path,
 420                                           i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 421                                           i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 422                                                                       ACQUIRE_TERMINAL_WAIT,
 423                                           USEC_INFINITY);
 424                 if (tty_fd < 0)
 425                         return tty_fd;
 426
 427                 r = exec_context_apply_tty_size(context, tty_fd, tty_path);
 428                 if (r < 0)
 429                         return r;
 430
 431                 r = move_fd(tty_fd, STDIN_FILENO, /* cloexec= */ false);
 432                 if (r < 0)
 433                         return r;
 434
 435                 TAKE_FD(tty_fd);
 436                 return r;
 437         }
 438
 439         case EXEC_INPUT_SOCKET:
 440                 assert(socket_fd >= 0);
 441
 442                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 443
 444         case EXEC_INPUT_NAMED_FD:
 445                 assert(named_iofds[STDIN_FILENO] >= 0);
 446
 447                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 448                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 449
 450         case EXEC_INPUT_DATA: {
 451                 int fd;
 452
 453                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 454                 if (fd < 0)
 455                         return fd;
 456
 457                 return move_fd(fd, STDIN_FILENO, false);
 458         }
 459
 460         case EXEC_INPUT_FILE: {
 461                 bool rw;
 462                 int fd;
 463
 464                 assert(context->stdio_file[STDIN_FILENO]);
 465
 466                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 467                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 468
 469                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 470                 if (fd < 0)
 471                         return fd;
 472
 473                 return move_fd(fd, STDIN_FILENO, false);
 474         }
 475
 476         default:
 477                 assert_not_reached();
 478         }
 479 }
 480
 481 static bool can_inherit_stderr_from_stdout(
 482                 const ExecContext *context,
 483                 ExecOutput o,
 484                 ExecOutput e) {
 485
 486         assert(context);
 487
 488         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 489          * stderr fd */
 490
 491         if (e == EXEC_OUTPUT_INHERIT)
 492                 return true;
 493         if (e != o)
 494                 return false;
 495
 496         if (e == EXEC_OUTPUT_NAMED_FD)
 497                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 498
 499         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 500                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 501
 502         return true;
 503 }
 504
 505 static int setup_output(
 506                 const ExecContext *context,
 507                 const ExecParameters *params,
 508                 int fileno,
 509                 int socket_fd,
 510                 const int named_iofds[static 3],
 511                 const char *ident,
 512                 uid_t uid,
 513                 gid_t gid,
 514                 dev_t *journal_stream_dev,
 515                 ino_t *journal_stream_ino) {
 516
 517         ExecOutput o;
 518         ExecInput i;
 519         int r;
 520
 521         assert(context);
 522         assert(params);
 523         assert(ident);
 524         assert(journal_stream_dev);
 525         assert(journal_stream_ino);
 526
 527         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 528
 529                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 530                         return -errno;
 531
 532                 return STDOUT_FILENO;
 533         }
 534
 535         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 536                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 537                         return -errno;
 538
 539                 return STDERR_FILENO;
 540         }
 541
 542         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 543         o = fixup_output(context->std_output, socket_fd);
 544
 545         if (fileno == STDERR_FILENO) {
 546                 ExecOutput e;
 547                 e = fixup_output(context->std_error, socket_fd);
 548
 549                 /* This expects the input and output are already set up */
 550
 551                 /* Don't change the stderr file descriptor if we inherit all
 552                  * the way and are not on a tty */
 553                 if (e == EXEC_OUTPUT_INHERIT &&
 554                     o == EXEC_OUTPUT_INHERIT &&
 555                     i == EXEC_INPUT_NULL &&
 556                     !is_terminal_input(context->std_input) &&
 557                     getppid() != 1)
 558                         return fileno;
 559
 560                 /* Duplicate from stdout if possible */
 561                 if (can_inherit_stderr_from_stdout(context, o, e))
 562                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 563
 564                 o = e;
 565
 566         } else if (o == EXEC_OUTPUT_INHERIT) {
 567                 /* If input got downgraded, inherit the original value */
 568                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 569                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 570
 571                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 572                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 573                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 574
 575                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 576                 if (getppid() != 1)
 577                         return fileno;
 578
 579                 /* We need to open /dev/null here anew, to get the right access mode. */
 580                 return open_null_as(O_WRONLY, fileno);
 581         }
 582
 583         switch (o) {
 584
 585         case EXEC_OUTPUT_NULL:
 586                 return open_null_as(O_WRONLY, fileno);
 587
 588         case EXEC_OUTPUT_TTY:
 589                 if (is_terminal_input(i))
 590                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 591
 592                 /* We don't reset the terminal if this is just about output */
 593                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 594
 595         case EXEC_OUTPUT_KMSG:
 596         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 597         case EXEC_OUTPUT_JOURNAL:
 598         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 599                 r = connect_logger_as(context, params, o, ident, fileno, uid, gid);
 600                 if (r < 0) {
 601                         log_exec_warning_errno(context,
 602                                                params,
 603                                                r,
 604                                                "Failed to connect %s to the journal socket, ignoring: %m",
 605                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 606                         r = open_null_as(O_WRONLY, fileno);
 607                 } else {
 608                         struct stat st;
 609
 610                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 611                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 612                          * services to detect whether they are connected to the journal or not.
 613                          *
 614                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 615                          * about STDERR as that's usually the best way to do logging. */
 616
 617                         if (fstat(fileno, &st) >= 0 &&
 618                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 619                                 *journal_stream_dev = st.st_dev;
 620                                 *journal_stream_ino = st.st_ino;
 621                         }
 622                 }
 623                 return r;
 624
 625         case EXEC_OUTPUT_SOCKET:
 626                 assert(socket_fd >= 0);
 627
 628                 return RET_NERRNO(dup2(socket_fd, fileno));
 629
 630         case EXEC_OUTPUT_NAMED_FD:
 631                 assert(named_iofds[fileno] >= 0);
 632
 633                 (void) fd_nonblock(named_iofds[fileno], false);
 634                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 635
 636         case EXEC_OUTPUT_FILE:
 637         case EXEC_OUTPUT_FILE_APPEND:
 638         case EXEC_OUTPUT_FILE_TRUNCATE: {
 639                 bool rw;
 640                 int fd, flags;
 641
 642                 assert(context->stdio_file[fileno]);
 643
 644                 rw = context->std_input == EXEC_INPUT_FILE &&
 645                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 646
 647                 if (rw)
 648                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 649
 650                 flags = O_WRONLY;
 651                 if (o == EXEC_OUTPUT_FILE_APPEND)
 652                         flags |= O_APPEND;
 653                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 654                         flags |= O_TRUNC;
 655
 656                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 657                 if (fd < 0)
 658                         return fd;
 659
 660                 return move_fd(fd, fileno, 0);
 661         }
 662
 663         default:
 664                 assert_not_reached();
 665         }
 666 }
 667
 668 static int chown_terminal(int fd, uid_t uid) {
 669         int r;
 670
 671         assert(fd >= 0);
 672
 673         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 674         if (!isatty_safe(fd))
 675                 return 0;
 676
 677         /* This might fail. What matters are the results. */
 678         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 679         if (r < 0)
 680                 return r;
 681
 682         return 1;
 683 }
 684
 685 static int setup_confirm_stdio(
 686                 const ExecContext *context,
 687                 const char *vc,
 688                 int *ret_saved_stdin,
 689                 int *ret_saved_stdout) {
 690
 691         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 692         int r;
 693
 694         assert(ret_saved_stdin);
 695         assert(ret_saved_stdout);
 696
 697         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 698         if (saved_stdin < 0)
 699                 return -errno;
 700
 701         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 702         if (saved_stdout < 0)
 703                 return -errno;
 704
 705         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 706         if (fd < 0)
 707                 return fd;
 708
 709         r = chown_terminal(fd, getuid());
 710         if (r < 0)
 711                 return r;
 712
 713         r = reset_terminal_fd(fd, /* switch_to_text= */ true);
 714         if (r < 0)
 715                 return r;
 716
 717         r = exec_context_apply_tty_size(context, fd, vc);
 718         if (r < 0)
 719                 return r;
 720
 721         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 722         TAKE_FD(fd);
 723         if (r < 0)
 724                 return r;
 725
 726         *ret_saved_stdin = TAKE_FD(saved_stdin);
 727         *ret_saved_stdout = TAKE_FD(saved_stdout);
 728         return 0;
 729 }
 730
 731 static void write_confirm_error_fd(int err, int fd, const char *unit_id) {
 732         assert(err < 0);
 733         assert(unit_id);
 734
 735         if (err == -ETIMEDOUT)
 736                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", unit_id);
 737         else {
 738                 errno = -err;
 739                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", unit_id);
 740         }
 741 }
 742
 743 static void write_confirm_error(int err, const char *vc, const char *unit_id) {
 744         _cleanup_close_ int fd = -EBADF;
 745
 746         assert(vc);
 747
 748         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 749         if (fd < 0)
 750                 return;
 751
 752         write_confirm_error_fd(err, fd, unit_id);
 753 }
 754
 755 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 756         int r = 0;
 757
 758         assert(saved_stdin);
 759         assert(saved_stdout);
 760
 761         release_terminal();
 762
 763         if (*saved_stdin >= 0)
 764                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 765                         r = -errno;
 766
 767         if (*saved_stdout >= 0)
 768                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 769                         r = -errno;
 770
 771         *saved_stdin = safe_close(*saved_stdin);
 772         *saved_stdout = safe_close(*saved_stdout);
 773
 774         return r;
 775 }
 776
/* Possible outcomes of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command */
};
 782
 783 static bool confirm_spawn_disabled(void) {
 784         return access("/run/systemd/confirm_spawn_disabled", F_OK) >= 0;
 785 }
 786
 787 static int ask_for_confirmation(const ExecContext *context, const ExecParameters *params, const char *cmdline) {
 788         int saved_stdout = -1, saved_stdin = -1, r;
 789         _cleanup_free_ char *e = NULL;
 790         char c;
 791
 792         assert(context);
 793         assert(params);
 794
 795         /* For any internal errors, assume a positive response. */
 796         r = setup_confirm_stdio(context, params->confirm_spawn, &saved_stdin, &saved_stdout);
 797         if (r < 0) {
 798                 write_confirm_error(r, params->confirm_spawn, params->unit_id);
 799                 return CONFIRM_EXECUTE;
 800         }
 801
 802         /* confirm_spawn might have been disabled while we were sleeping. */
 803         if (!params->confirm_spawn || confirm_spawn_disabled()) {
 804                 r = 1;
 805                 goto restore_stdio;
 806         }
 807
 808         e = ellipsize(cmdline, 60, 100);
 809         if (!e) {
 810                 log_oom();
 811                 r = CONFIRM_EXECUTE;
 812                 goto restore_stdio;
 813         }
 814
 815         for (;;) {
 816                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 817                 if (r < 0) {
 818                         write_confirm_error_fd(r, STDOUT_FILENO, params->unit_id);
 819                         r = CONFIRM_EXECUTE;
 820                         goto restore_stdio;
 821                 }
 822
 823                 switch (c) {
 824                 case 'c':
 825                         printf("Resuming normal execution.\n");
 826                         manager_disable_confirm_spawn();
 827                         r = 1;
 828                         break;
 829                 case 'D':
 830                         printf("  Unit: %s\n",
 831                                params->unit_id);
 832                         exec_context_dump(context, stdout, "  ");
 833                         exec_params_dump(params, stdout, "  ");
 834                         continue; /* ask again */
 835                 case 'f':
 836                         printf("Failing execution.\n");
 837                         r = CONFIRM_PRETEND_FAILURE;
 838                         break;
 839                 case 'h':
 840                         printf("  c - continue, proceed without asking anymore\n"
 841                                "  D - dump, show the state of the unit\n"
 842                                "  f - fail, don't execute the command and pretend it failed\n"
 843                                "  h - help\n"
 844                                "  i - info, show a short summary of the unit\n"
 845                                "  j - jobs, show jobs that are in progress\n"
 846                                "  s - skip, don't execute the command and pretend it succeeded\n"
 847                                "  y - yes, execute the command\n");
 848                         continue; /* ask again */
 849                 case 'i':
 850                         printf("  Unit:        %s\n"
 851                                "  Command:     %s\n",
 852                                params->unit_id, cmdline);
 853                         continue; /* ask again */
 854                 case 'j':
 855                         if (sigqueue(getppid(),
 856                                      SIGRTMIN+18,
 857                                      (const union sigval) { .sival_int = MANAGER_SIGNAL_COMMAND_DUMP_JOBS }) < 0)
 858                                 return -errno;
 859
 860                         continue; /* ask again */
 861                 case 'n':
 862                         /* 'n' was removed in favor of 'f'. */
 863                         printf("Didn't understand 'n', did you mean 'f'?\n");
 864                         continue; /* ask again */
 865                 case 's':
 866                         printf("Skipping execution.\n");
 867                         r = CONFIRM_PRETEND_SUCCESS;
 868                         break;
 869                 case 'y':
 870                         r = CONFIRM_EXECUTE;
 871                         break;
 872                 default:
 873                         assert_not_reached();
 874                 }
 875                 break;
 876         }
 877
 878 restore_stdio:
 879         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 880         return r;
 881 }
 882
 883 static int get_fixed_user(
 884                 const char *user_or_uid,
 885                 const char **ret_username,
 886                 uid_t *ret_uid,
 887                 gid_t *ret_gid,
 888                 const char **ret_home,
 889                 const char **ret_shell) {
 890
 891         int r;
 892
 893         assert(user_or_uid);
 894         assert(ret_username);
 895
 896         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 897          * (i.e. are "/" or "/bin/nologin"). */
 898
 899         r = get_user_creds(&user_or_uid, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
 900         if (r < 0)
 901                 return r;
 902
 903         /* user_or_uid is normalized by get_user_creds to username */
 904         *ret_username = user_or_uid;
 905
 906         return 0;
 907 }
 908
 909 static int get_fixed_group(
 910                 const char *group_or_gid,
 911                 const char **ret_groupname,
 912                 gid_t *ret_gid) {
 913
 914         int r;
 915
 916         assert(group_or_gid);
 917         assert(ret_groupname);
 918
 919         r = get_group_creds(&group_or_gid, ret_gid, /* flags = */ 0);
 920         if (r < 0)
 921                 return r;
 922
 923         /* group_or_gid is normalized by get_group_creds to groupname */
 924         *ret_groupname = group_or_gid;
 925
 926         return 0;
 927 }
 928
 929 static int get_supplementary_groups(const ExecContext *c, const char *user,
 930                                     const char *group, gid_t gid,
 931                                     gid_t **supplementary_gids, int *ngids) {
 932         int r, k = 0;
 933         int ngroups_max;
 934         bool keep_groups = false;
 935         gid_t *groups = NULL;
 936         _cleanup_free_ gid_t *l_gids = NULL;
 937
 938         assert(c);
 939
 940         /*
 941          * If user is given, then lookup GID and supplementary groups list.
 942          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 943          * here and as early as possible so we keep the list of supplementary
 944          * groups of the caller.
 945          */
 946         if (user && gid_is_valid(gid) && gid != 0) {
 947                 /* First step, initialize groups from /etc/groups */
 948                 if (initgroups(user, gid) < 0)
 949                         return -errno;
 950
 951                 keep_groups = true;
 952         }
 953
 954         if (strv_isempty(c->supplementary_groups))
 955                 return 0;
 956
 957         /*
 958          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 959          * be positive, otherwise fail.
 960          */
 961         errno = 0;
 962         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 963         if (ngroups_max <= 0)
 964                 return errno_or_else(EOPNOTSUPP);
 965
 966         l_gids = new(gid_t, ngroups_max);
 967         if (!l_gids)
 968                 return -ENOMEM;
 969
 970         if (keep_groups) {
 971                 /*
 972                  * Lookup the list of groups that the user belongs to, we
 973                  * avoid NSS lookups here too for gid=0.
 974                  */
 975                 k = ngroups_max;
 976                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 977                         return -EINVAL;
 978         } else
 979                 k = 0;
 980
 981         STRV_FOREACH(i, c->supplementary_groups) {
 982                 const char *g;
 983
 984                 if (k >= ngroups_max)
 985                         return -E2BIG;
 986
 987                 g = *i;
 988                 r = get_group_creds(&g, l_gids+k, 0);
 989                 if (r < 0)
 990                         return r;
 991
 992                 k++;
 993         }
 994
 995         /*
 996          * Sets ngids to zero to drop all supplementary groups, happens
 997          * when we are under root and SupplementaryGroups= is empty.
 998          */
 999         if (k == 0) {
1000                 *ngids = 0;
1001                 return 0;
1002         }
1003
1004         /* Otherwise get the final list of supplementary groups */
1005         groups = memdup(l_gids, sizeof(gid_t) * k);
1006         if (!groups)
1007                 return -ENOMEM;
1008
1009         *supplementary_gids = groups;
1010         *ngids = k;
1011
1012         groups = NULL;
1013
1014         return 0;
1015 }
1016
/* Applies the supplementary group list (if ngids > 0) via maybe_setgroups() and then, if 'gid' is
 * valid, switches the real, effective and saved GID to it. Returns 0 on success, a negative error
 * otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if SupplementaryGroups= gave us something to set */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1035
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set. Returns 0 if the securebits already have the requested value (nothing is written
 * in that case), 1 if they were changed, and -errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        unsigned wanted = ((unsigned) before & ~mask) | bits;
        if (wanted == (unsigned) before)
                return 0; /* already as requested, skip the write */

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1054
1055 static int enforce_user(
1056                 const ExecContext *context,
1057                 uid_t uid,
1058                 uint64_t capability_ambient_set) {
1059         assert(context);
1060         int r;
1061
1062         if (!uid_is_valid(uid))
1063                 return 0;
1064
1065         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1066          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1067          * case. */
1068
1069         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1070
1071                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1072                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1073                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1074                 if (r < 0)
1075                         return r;
1076         }
1077
1078         /* Second step: actually set the uids */
1079         if (setresuid(uid, uid, uid) < 0)
1080                 return -errno;
1081
1082         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1083          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1084          * outside of this call. */
1085         return 0;
1086 }
1087
1088 #if HAVE_PAM
1089
1090 static int null_conv(
1091                 int num_msg,
1092                 const struct pam_message **msg,
1093                 struct pam_response **resp,
1094                 void *appdata_ptr) {
1095
1096         /* We don't support conversations */
1097
1098         return PAM_CONV_ERR;
1099 }
1100
1101 static int pam_close_session_and_delete_credentials(pam_handle_t *handle, int flags) {
1102         int r, s;
1103
1104         assert(handle);
1105
1106         r = pam_close_session(handle, flags);
1107         if (r != PAM_SUCCESS)
1108                 log_debug("pam_close_session() failed: %s", pam_strerror(handle, r));
1109
1110         s = pam_setcred(handle, PAM_DELETE_CRED | flags);
1111         if (s != PAM_SUCCESS)
1112                 log_debug("pam_setcred(PAM_DELETE_CRED) failed: %s", pam_strerror(handle, s));
1113
1114         return r != PAM_SUCCESS ? r : s;
1115 }
1116
1117 #endif
1118
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that closes the
 * session again once the calling process is gone.
 *
 *   name    - PAM service name passed to pam_start()
 *   user    - user name passed to pam_start()
 *   uid/gid - credentials the helper process switches to via fully_set_uid_gid()
 *   tty     - TTY path to set as PAM_TTY; if NULL the TTY name is read off stdin, if possible
 *   env     - environment block; exported to PAM before the session is opened and replaced by
 *             the PAM environment list on success
 *   fds     - file descriptors that are closed in the helper process
 *   exec_fd - additional file descriptor that is closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or the
 * negative error of barrier_create()/safe_fork(). When built without PAM support this is a no-op
 * that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds,
                int exec_fd) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM modules quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to use or end a handle we never got */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failing to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred(PAM_ESTABLISH_CRED) failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect whether it got reparented */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, NULL);
        if (r < 0)
                /* NOTE(review): the signal mask saved in old_ss is only restored on the success path
                 * below, hence SIGTERM remains blocked when we fail here — confirm that callers do
                 * not care. */
                goto fail;
        if (r == 0) {
                int ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Also close the 'exec_fd' in the child, since the service manager waits for the EOF induced
                 * by the execve() to wait for completion, and if we'd keep the fd open here in the child
                 * we'd never signal completion. */
                exec_fd = safe_close(exec_fd);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = fully_set_uid_gid(uid, gid, /* supplementary_gids= */ NULL, /* n_supplementary_gids= */ 0);
                if (r < 0)
                        log_warning_errno(r, "Failed to drop privileges in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above privilege drop succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;
                        int sig;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Parent is still around: sleep until SIGTERM (blocked above) is delivered */
                        assert_se(sigwait(&ss, &sig) == 0);
                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment block with the one PAM gave us */
        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM errors (pam_code is set) and errno-style errors (r is
         * set while pam_code is still PAM_SUCCESS) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session_and_delete_credentials(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        /* Built without PAM support: nothing to do */
        return 0;
#endif
}
1325
1326 static void rename_process_from_path(const char *path) {
1327         _cleanup_free_ char *buf = NULL;
1328         const char *p;
1329
1330         assert(path);
1331
1332         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1333          * /bin/ps */
1334
1335         if (path_extract_filename(path, &buf) < 0) {
1336                 rename_process("(...)");
1337                 return;
1338         }
1339
1340         size_t l = strlen(buf);
1341         if (l > 8) {
1342                 /* The end of the process name is usually more interesting, since the first bit might just be
1343                  * "systemd-" */
1344                 p = buf + l - 8;
1345                 l = 8;
1346         } else
1347                 p = buf;
1348
1349         char process_name[11];
1350         process_name[0] = '(';
1351         memcpy(process_name+1, p, l);
1352         process_name[1+l] = ')';
1353         process_name[1+l+1] = 0;
1354
1355         (void) rename_process(process_name);
1356 }
1357
1358 static bool context_has_address_families(const ExecContext *c) {
1359         assert(c);
1360
1361         return c->address_families_allow_list ||
1362                 !set_isempty(c->address_families);
1363 }
1364
1365 static bool context_has_syscall_filters(const ExecContext *c) {
1366         assert(c);
1367
1368         return c->syscall_allow_list ||
1369                 !hashmap_isempty(c->syscall_filter);
1370 }
1371
1372 static bool context_has_syscall_logs(const ExecContext *c) {
1373         assert(c);
1374
1375         return c->syscall_log_allow_list ||
1376                 !hashmap_isempty(c->syscall_log);
1377 }
1378
1379 static bool context_has_seccomp(const ExecContext *c) {
1380         /* We need NNP if we have any form of seccomp and are unprivileged */
1381         return c->lock_personality ||
1382                 c->memory_deny_write_execute ||
1383                 c->private_devices ||
1384                 c->protect_clock ||
1385                 c->protect_hostname ||
1386                 c->protect_kernel_tunables ||
1387                 c->protect_kernel_modules ||
1388                 c->protect_kernel_logs ||
1389                 context_has_address_families(c) ||
1390                 exec_context_restrict_namespaces_set(c) ||
1391                 c->restrict_realtime ||
1392                 c->restrict_suid_sgid ||
1393                 !set_isempty(c->syscall_archs) ||
1394                 context_has_syscall_filters(c) ||
1395                 context_has_syscall_logs(c);
1396 }
1397
1398 static bool context_has_no_new_privileges(const ExecContext *c) {
1399         assert(c);
1400
1401         if (c->no_new_privileges)
1402                 return true;
1403
1404         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1405                 return false;
1406
1407         return context_has_seccomp(c);
1408 }
1409
1410 #if HAVE_SECCOMP
1411
1412 static bool seccomp_allows_drop_privileges(const ExecContext *c) {
1413         void *id, *val;
1414         bool has_capget = false, has_capset = false, has_prctl = false;
1415
1416         assert(c);
1417
1418         /* No syscall filter, we are allowed to drop privileges */
1419         if (hashmap_isempty(c->syscall_filter))
1420                 return true;
1421
1422         HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
1423                 _cleanup_free_ char *name = NULL;
1424
1425                 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
1426
1427                 if (streq(name, "capget"))
1428                         has_capget = true;
1429                 else if (streq(name, "capset"))
1430                         has_capset = true;
1431                 else if (streq(name, "prctl"))
1432                         has_prctl = true;
1433         }
1434
1435         if (c->syscall_allow_list)
1436                 return has_capget && has_capset && has_prctl;
1437         else
1438                 return !(has_capget || has_capset || has_prctl);
1439 }
1440
1441 static bool skip_seccomp_unavailable(const ExecContext *c, const ExecParameters *p, const char* msg) {
1442
1443         if (is_seccomp_available())
1444                 return false;
1445
1446         log_exec_debug(c, p, "SECCOMP features not detected in the kernel, skipping %s", msg);
1447         return true;
1448 }
1449
1450 static int apply_syscall_filter(const ExecContext *c, const ExecParameters *p, bool needs_ambient_hack) {
1451         uint32_t negative_action, default_action, action;
1452         int r;
1453
1454         assert(c);
1455         assert(p);
1456
1457         if (!context_has_syscall_filters(c))
1458                 return 0;
1459
1460         if (skip_seccomp_unavailable(c, p, "SystemCallFilter="))
1461                 return 0;
1462
1463         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1464
1465         if (c->syscall_allow_list) {
1466                 default_action = negative_action;
1467                 action = SCMP_ACT_ALLOW;
1468         } else {
1469                 default_action = SCMP_ACT_ALLOW;
1470                 action = negative_action;
1471         }
1472
1473         if (needs_ambient_hack) {
1474                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1475                 if (r < 0)
1476                         return r;
1477         }
1478
1479         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1480 }
1481
1482 static int apply_syscall_log(const ExecContext *c, const ExecParameters *p) {
1483 #ifdef SCMP_ACT_LOG
1484         uint32_t default_action, action;
1485 #endif
1486
1487         assert(c);
1488         assert(p);
1489
1490         if (!context_has_syscall_logs(c))
1491                 return 0;
1492
1493 #ifdef SCMP_ACT_LOG
1494         if (skip_seccomp_unavailable(c, p, "SystemCallLog="))
1495                 return 0;
1496
1497         if (c->syscall_log_allow_list) {
1498                 /* Log nothing but the ones listed */
1499                 default_action = SCMP_ACT_ALLOW;
1500                 action = SCMP_ACT_LOG;
1501         } else {
1502                 /* Log everything but the ones listed */
1503                 default_action = SCMP_ACT_LOG;
1504                 action = SCMP_ACT_ALLOW;
1505         }
1506
1507         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1508 #else
1509         /* old libseccomp */
1510         log_exec_debug(c, p, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1511         return 0;
1512 #endif
1513 }
1514
1515 static int apply_syscall_archs(const ExecContext *c, const ExecParameters *p) {
1516         assert(c);
1517         assert(p);
1518
1519         if (set_isempty(c->syscall_archs))
1520                 return 0;
1521
1522         if (skip_seccomp_unavailable(c, p, "SystemCallArchitectures="))
1523                 return 0;
1524
1525         return seccomp_restrict_archs(c->syscall_archs);
1526 }
1527
1528 static int apply_address_families(const ExecContext *c, const ExecParameters *p) {
1529         assert(c);
1530         assert(p);
1531
1532         if (!context_has_address_families(c))
1533                 return 0;
1534
1535         if (skip_seccomp_unavailable(c, p, "RestrictAddressFamilies="))
1536                 return 0;
1537
1538         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1539 }
1540
1541 static int apply_memory_deny_write_execute(const ExecContext *c, const ExecParameters *p) {
1542         int r;
1543
1544         assert(c);
1545         assert(p);
1546
1547         if (!c->memory_deny_write_execute)
1548                 return 0;
1549
1550         /* use prctl() if kernel supports it (6.3) */
1551         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1552         if (r == 0) {
1553                 log_exec_debug(c, p, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1554                 return 0;
1555         }
1556         if (r < 0 && errno != EINVAL)
1557                 return log_exec_debug_errno(c,
1558                                             p,
1559                                             errno,
1560                                             "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1561         /* else use seccomp */
1562         log_exec_debug(c, p, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1563
1564         if (skip_seccomp_unavailable(c, p, "MemoryDenyWriteExecute="))
1565                 return 0;
1566
1567         return seccomp_memory_deny_write_execute();
1568 }
1569
1570 static int apply_restrict_realtime(const ExecContext *c, const ExecParameters *p) {
1571         assert(c);
1572         assert(p);
1573
1574         if (!c->restrict_realtime)
1575                 return 0;
1576
1577         if (skip_seccomp_unavailable(c, p, "RestrictRealtime="))
1578                 return 0;
1579
1580         return seccomp_restrict_realtime();
1581 }
1582
1583 static int apply_restrict_suid_sgid(const ExecContext *c, const ExecParameters *p) {
1584         assert(c);
1585         assert(p);
1586
1587         if (!c->restrict_suid_sgid)
1588                 return 0;
1589
1590         if (skip_seccomp_unavailable(c, p, "RestrictSUIDSGID="))
1591                 return 0;
1592
1593         return seccomp_restrict_suid_sgid();
1594 }
1595
1596 static int apply_protect_sysctl(const ExecContext *c, const ExecParameters *p) {
1597         assert(c);
1598         assert(p);
1599
1600         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1601          * let's protect even those systems where this is left on in the kernel. */
1602
1603         if (!c->protect_kernel_tunables)
1604                 return 0;
1605
1606         if (skip_seccomp_unavailable(c, p, "ProtectKernelTunables="))
1607                 return 0;
1608
1609         return seccomp_protect_sysctl();
1610 }
1611
1612 static int apply_protect_kernel_modules(const ExecContext *c, const ExecParameters *p) {
1613         assert(c);
1614         assert(p);
1615
1616         /* Turn off module syscalls on ProtectKernelModules=yes */
1617
1618         if (!c->protect_kernel_modules)
1619                 return 0;
1620
1621         if (skip_seccomp_unavailable(c, p, "ProtectKernelModules="))
1622                 return 0;
1623
1624         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1625 }
1626
1627 static int apply_protect_kernel_logs(const ExecContext *c, const ExecParameters *p) {
1628         assert(c);
1629         assert(p);
1630
1631         if (!c->protect_kernel_logs)
1632                 return 0;
1633
1634         if (skip_seccomp_unavailable(c, p, "ProtectKernelLogs="))
1635                 return 0;
1636
1637         return seccomp_protect_syslog();
1638 }
1639
1640 static int apply_protect_clock(const ExecContext *c, const ExecParameters *p) {
1641         assert(c);
1642         assert(p);
1643
1644         if (!c->protect_clock)
1645                 return 0;
1646
1647         if (skip_seccomp_unavailable(c, p, "ProtectClock="))
1648                 return 0;
1649
1650         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1651 }
1652
1653 static int apply_private_devices(const ExecContext *c, const ExecParameters *p) {
1654         assert(c);
1655         assert(p);
1656
1657         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1658
1659         if (!c->private_devices)
1660                 return 0;
1661
1662         if (skip_seccomp_unavailable(c, p, "PrivateDevices="))
1663                 return 0;
1664
1665         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1666 }
1667
1668 static int apply_restrict_namespaces(const ExecContext *c, const ExecParameters *p) {
1669         assert(c);
1670         assert(p);
1671
1672         if (!exec_context_restrict_namespaces_set(c))
1673                 return 0;
1674
1675         if (skip_seccomp_unavailable(c, p, "RestrictNamespaces="))
1676                 return 0;
1677
1678         return seccomp_restrict_namespaces(c->restrict_namespaces);
1679 }
1680
1681 static int apply_lock_personality(const ExecContext *c, const ExecParameters *p) {
1682         unsigned long personality;
1683         int r;
1684
1685         assert(c);
1686         assert(p);
1687
1688         if (!c->lock_personality)
1689                 return 0;
1690
1691         if (skip_seccomp_unavailable(c, p, "LockPersonality="))
1692                 return 0;
1693
1694         personality = c->personality;
1695
1696         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1697         if (personality == PERSONALITY_INVALID) {
1698
1699                 r = opinionated_personality(&personality);
1700                 if (r < 0)
1701                         return r;
1702         }
1703
1704         return seccomp_lock_personality(personality);
1705 }
1706
1707 #endif
1708
1709 #if HAVE_LIBBPF
1710 static int apply_restrict_filesystems(const ExecContext *c, const ExecParameters *p) {
1711         int r;
1712
1713         assert(c);
1714         assert(p);
1715
1716         if (!exec_context_restrict_filesystems_set(c))
1717                 return 0;
1718
1719         if (p->bpf_restrict_fs_map_fd < 0) {
1720                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1721                 log_exec_debug(c, p, "LSM BPF not supported, skipping RestrictFileSystems=");
1722                 return 0;
1723         }
1724
1725         /* We are in a new binary, so dl-open again */
1726         r = dlopen_bpf();
1727         if (r < 0)
1728                 return r;
1729
1730         return bpf_restrict_fs_update(c->restrict_filesystems, p->cgroup_id, p->bpf_restrict_fs_map_fd, c->restrict_filesystems_allow_list);
1731 }
1732 #endif
1733
1734 static int apply_protect_hostname(const ExecContext *c, const ExecParameters *p, int *ret_exit_status) {
1735         assert(c);
1736         assert(p);
1737
1738         if (!c->protect_hostname)
1739                 return 0;
1740
1741         if (ns_type_supported(NAMESPACE_UTS)) {
1742                 if (unshare(CLONE_NEWUTS) < 0) {
1743                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1744                                 *ret_exit_status = EXIT_NAMESPACE;
1745                                 return log_exec_error_errno(c,
1746                                                             p,
1747                                                             errno,
1748                                                             "Failed to set up UTS namespacing: %m");
1749                         }
1750
1751                         log_exec_warning(c,
1752                                          p,
1753                                          "ProtectHostname=yes is configured, but UTS namespace setup is "
1754                                          "prohibited (container manager?), ignoring namespace setup.");
1755                 }
1756         } else
1757                 log_exec_warning(c,
1758                                  p,
1759                                  "ProtectHostname=yes is configured, but the kernel does not "
1760                                  "support UTS namespaces, ignoring namespace setup.");
1761
1762 #if HAVE_SECCOMP
1763         int r;
1764
1765         if (skip_seccomp_unavailable(c, p, "ProtectHostname="))
1766                 return 0;
1767
1768         r = seccomp_protect_hostname();
1769         if (r < 0) {
1770                 *ret_exit_status = EXIT_SECCOMP;
1771                 return log_exec_error_errno(c, p, r, "Failed to apply hostname restrictions: %m");
1772         }
1773 #endif
1774
1775         return 0;
1776 }
1777
1778 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1779         assert(idle_pipe);
1780
1781         idle_pipe[1] = safe_close(idle_pipe[1]);
1782         idle_pipe[2] = safe_close(idle_pipe[2]);
1783
1784         if (idle_pipe[0] >= 0) {
1785                 int r;
1786
1787                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1788
1789                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1790                         ssize_t n;
1791
1792                         /* Signal systemd that we are bored and want to continue. */
1793                         n = write(idle_pipe[3], "x", 1);
1794                         if (n > 0)
1795                                 /* Wait for systemd to react to the signal above. */
1796                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1797                 }
1798
1799                 idle_pipe[0] = safe_close(idle_pipe[0]);
1800
1801         }
1802
1803         idle_pipe[3] = safe_close(idle_pipe[3]);
1804 }
1805
/* Forward declaration of the lookup helper; presumably defined by the
 * DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING() invocation below. */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);

/* This table maps each ExecDirectoryType to the name of the environment variable through which the
 * selected directory is passed to the service payload. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
1819
1820 static int build_environment(
1821                 const ExecContext *c,
1822                 const ExecParameters *p,
1823                 const CGroupContext *cgroup_context,
1824                 size_t n_fds,
1825                 const char *home,
1826                 const char *username,
1827                 const char *shell,
1828                 dev_t journal_stream_dev,
1829                 ino_t journal_stream_ino,
1830                 const char *memory_pressure_path,
1831                 char ***ret) {
1832
1833         _cleanup_strv_free_ char **our_env = NULL;
1834         size_t n_env = 0;
1835         char *x;
1836         int r;
1837
1838         assert(c);
1839         assert(p);
1840         assert(ret);
1841
1842 #define N_ENV_VARS 19
1843         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1844         if (!our_env)
1845                 return -ENOMEM;
1846
1847         if (n_fds > 0) {
1848                 _cleanup_free_ char *joined = NULL;
1849
1850                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1851                         return -ENOMEM;
1852                 our_env[n_env++] = x;
1853
1854                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1855                         return -ENOMEM;
1856                 our_env[n_env++] = x;
1857
1858                 joined = strv_join(p->fd_names, ":");
1859                 if (!joined)
1860                         return -ENOMEM;
1861
1862                 x = strjoin("LISTEN_FDNAMES=", joined);
1863                 if (!x)
1864                         return -ENOMEM;
1865                 our_env[n_env++] = x;
1866         }
1867
1868         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1869                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1870                         return -ENOMEM;
1871                 our_env[n_env++] = x;
1872
1873                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1874                         return -ENOMEM;
1875                 our_env[n_env++] = x;
1876         }
1877
1878         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1879          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1880          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1881         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1882                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1883                 if (!x)
1884                         return -ENOMEM;
1885                 our_env[n_env++] = x;
1886         }
1887
1888         /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1889          * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1890          * really make much sense since we're not logged in. Hence we conditionalize the three based on
1891          * SetLoginEnvironment= switch. */
1892         if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1893                 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1894                 if (r < 0)
1895                         return log_exec_debug_errno(c,
1896                                                     p,
1897                                                     r,
1898                                                     "Failed to determine user credentials for root: %m");
1899         }
1900
1901         bool set_user_login_env = exec_context_get_set_login_environment(c);
1902
1903         if (username) {
1904                 x = strjoin("USER=", username);
1905                 if (!x)
1906                         return -ENOMEM;
1907                 our_env[n_env++] = x;
1908
1909                 if (set_user_login_env) {
1910                         x = strjoin("LOGNAME=", username);
1911                         if (!x)
1912                                 return -ENOMEM;
1913                         our_env[n_env++] = x;
1914                 }
1915         }
1916
1917         if (home && set_user_login_env) {
1918                 x = strjoin("HOME=", home);
1919                 if (!x)
1920                         return -ENOMEM;
1921
1922                 path_simplify(x + 5);
1923                 our_env[n_env++] = x;
1924         }
1925
1926         if (shell && set_user_login_env) {
1927                 x = strjoin("SHELL=", shell);
1928                 if (!x)
1929                         return -ENOMEM;
1930
1931                 path_simplify(x + 6);
1932                 our_env[n_env++] = x;
1933         }
1934
1935         if (!sd_id128_is_null(p->invocation_id)) {
1936                 assert(p->invocation_id_string);
1937
1938                 x = strjoin("INVOCATION_ID=", p->invocation_id_string);
1939                 if (!x)
1940                         return -ENOMEM;
1941
1942                 our_env[n_env++] = x;
1943         }
1944
1945         if (exec_context_needs_term(c)) {
1946                 _cleanup_free_ char *cmdline = NULL;
1947                 const char *tty_path, *term = NULL;
1948
1949                 tty_path = exec_context_tty_path(c);
1950
1951                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1952                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1953                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1954
1955                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1956                         term = getenv("TERM");
1957                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1958                         _cleanup_free_ char *key = NULL;
1959
1960                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1961                         if (!key)
1962                                 return -ENOMEM;
1963
1964                         r = proc_cmdline_get_key(key, 0, &cmdline);
1965                         if (r < 0)
1966                                 log_exec_debug_errno(c,
1967                                                      p,
1968                                                      r,
1969                                                      "Failed to read %s from kernel cmdline, ignoring: %m",
1970                                                      key);
1971                         else if (r > 0)
1972                                 term = cmdline;
1973                 }
1974
1975                 if (!term)
1976                         term = default_term_for_tty(tty_path);
1977
1978                 x = strjoin("TERM=", term);
1979                 if (!x)
1980                         return -ENOMEM;
1981                 our_env[n_env++] = x;
1982         }
1983
1984         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1985                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1986                         return -ENOMEM;
1987
1988                 our_env[n_env++] = x;
1989         }
1990
1991         if (c->log_namespace) {
1992                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1993                 if (!x)
1994                         return -ENOMEM;
1995
1996                 our_env[n_env++] = x;
1997         }
1998
1999         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2000                 _cleanup_free_ char *joined = NULL;
2001                 const char *n;
2002
2003                 if (!p->prefix[t])
2004                         continue;
2005
2006                 if (c->directories[t].n_items == 0)
2007                         continue;
2008
2009                 n = exec_directory_env_name_to_string(t);
2010                 if (!n)
2011                         continue;
2012
2013                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2014                         _cleanup_free_ char *prefixed = NULL;
2015
2016                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2017                         if (!prefixed)
2018                                 return -ENOMEM;
2019
2020                         if (!strextend_with_separator(&joined, ":", prefixed))
2021                                 return -ENOMEM;
2022                 }
2023
2024                 x = strjoin(n, "=", joined);
2025                 if (!x)
2026                         return -ENOMEM;
2027
2028                 our_env[n_env++] = x;
2029         }
2030
2031         _cleanup_free_ char *creds_dir = NULL;
2032         r = exec_context_get_credential_directory(c, p, p->unit_id, &creds_dir);
2033         if (r < 0)
2034                 return r;
2035         if (r > 0) {
2036                 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
2037                 if (!x)
2038                         return -ENOMEM;
2039
2040                 our_env[n_env++] = x;
2041         }
2042
2043         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2044                 return -ENOMEM;
2045
2046         our_env[n_env++] = x;
2047
2048         if (memory_pressure_path) {
2049                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2050                 if (!x)
2051                         return -ENOMEM;
2052
2053                 our_env[n_env++] = x;
2054
2055                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2056                         _cleanup_free_ char *b = NULL, *e = NULL;
2057
2058                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2059                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2060                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2061                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2062                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2063                                 return -ENOMEM;
2064
2065                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2066                                 return -ENOMEM;
2067
2068                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2069                         if (!x)
2070                                 return -ENOMEM;
2071
2072                         our_env[n_env++] = x;
2073                 }
2074         }
2075
2076         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2077 #undef N_ENV_VARS
2078
2079         *ret = TAKE_PTR(our_env);
2080
2081         return 0;
2082 }
2083
2084 static int build_pass_environment(const ExecContext *c, char ***ret) {
2085         _cleanup_strv_free_ char **pass_env = NULL;
2086         size_t n_env = 0;
2087
2088         STRV_FOREACH(i, c->pass_environment) {
2089                 _cleanup_free_ char *x = NULL;
2090                 char *v;
2091
2092                 v = getenv(*i);
2093                 if (!v)
2094                         continue;
2095                 x = strjoin(*i, "=", v);
2096                 if (!x)
2097                         return -ENOMEM;
2098
2099                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2100                         return -ENOMEM;
2101
2102                 pass_env[n_env++] = TAKE_PTR(x);
2103                 pass_env[n_env] = NULL;
2104         }
2105
2106         *ret = TAKE_PTR(pass_env);
2107
2108         return 0;
2109 }
2110
2111 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2112         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2113         _cleanup_close_pair_ int errno_pipe[2] = EBADF_PAIR;
2114         _cleanup_close_ int unshare_ready_fd = -EBADF;
2115         _cleanup_(sigkill_waitp) pid_t pid = 0;
2116         uint64_t c = 1;
2117         ssize_t n;
2118         int r;
2119
2120         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2121          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2122          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2123          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2124          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2125          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2126          * continues execution normally.
2127          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2128          * does not need CAP_SETUID to write the single line mapping to itself. */
2129
2130         /* Can only set up multiple mappings with CAP_SETUID. */
2131         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2132                 r = asprintf(&uid_map,
2133                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2134                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2135                              ouid, ouid, uid, uid);
2136         else
2137                 r = asprintf(&uid_map,
2138                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2139                              ouid, ouid);
2140
2141         if (r < 0)
2142                 return -ENOMEM;
2143
2144         /* Can only set up multiple mappings with CAP_SETGID. */
2145         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2146                 r = asprintf(&gid_map,
2147                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2148                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2149                              ogid, ogid, gid, gid);
2150         else
2151                 r = asprintf(&gid_map,
2152                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2153                              ogid, ogid);
2154
2155         if (r < 0)
2156                 return -ENOMEM;
2157
2158         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2159          * namespace. */
2160         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2161         if (unshare_ready_fd < 0)
2162                 return -errno;
2163
2164         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2165          * failed. */
2166         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2167                 return -errno;
2168
2169         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL, &pid);
2170         if (r < 0)
2171                 return r;
2172         if (r == 0) {
2173                 _cleanup_close_ int fd = -EBADF;
2174                 const char *a;
2175                 pid_t ppid;
2176
2177                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2178                  * here, after the parent opened its own user namespace. */
2179
2180                 ppid = getppid();
2181                 errno_pipe[0] = safe_close(errno_pipe[0]);
2182
2183                 /* Wait until the parent unshared the user namespace */
2184                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2185                         r = -errno;
2186                         goto child_fail;
2187                 }
2188
2189                 /* Disable the setgroups() system call in the child user namespace, for good. */
2190                 a = procfs_file_alloca(ppid, "setgroups");
2191                 fd = open(a, O_WRONLY|O_CLOEXEC);
2192                 if (fd < 0) {
2193                         if (errno != ENOENT) {
2194                                 r = -errno;
2195                                 goto child_fail;
2196                         }
2197
2198                         /* If the file is missing the kernel is too old, let's continue anyway. */
2199                 } else {
2200                         if (write(fd, "deny\n", 5) < 0) {
2201                                 r = -errno;
2202                                 goto child_fail;
2203                         }
2204
2205                         fd = safe_close(fd);
2206                 }
2207
2208                 /* First write the GID map */
2209                 a = procfs_file_alloca(ppid, "gid_map");
2210                 fd = open(a, O_WRONLY|O_CLOEXEC);
2211                 if (fd < 0) {
2212                         r = -errno;
2213                         goto child_fail;
2214                 }
2215                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2216                         r = -errno;
2217                         goto child_fail;
2218                 }
2219                 fd = safe_close(fd);
2220
2221                 /* The write the UID map */
2222                 a = procfs_file_alloca(ppid, "uid_map");
2223                 fd = open(a, O_WRONLY|O_CLOEXEC);
2224                 if (fd < 0) {
2225                         r = -errno;
2226                         goto child_fail;
2227                 }
2228                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2229                         r = -errno;
2230                         goto child_fail;
2231                 }
2232
2233                 _exit(EXIT_SUCCESS);
2234
2235         child_fail:
2236                 (void) write(errno_pipe[1], &r, sizeof(r));
2237                 _exit(EXIT_FAILURE);
2238         }
2239
2240         errno_pipe[1] = safe_close(errno_pipe[1]);
2241
2242         if (unshare(CLONE_NEWUSER) < 0)
2243                 return -errno;
2244
2245         /* Let the child know that the namespace is ready now */
2246         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2247                 return -errno;
2248
2249         /* Try to read an error code from the child */
2250         n = read(errno_pipe[0], &r, sizeof(r));
2251         if (n < 0)
2252                 return -errno;
2253         if (n == sizeof(r)) { /* an error code was sent to us */
2254                 if (r < 0)
2255                         return r;
2256                 return -EIO;
2257         }
2258         if (n != 0) /* on success we should have read 0 bytes */
2259                 return -EIO;
2260
2261         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2262         if (r < 0)
2263                 return r;
2264         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2265                 return -EIO;
2266
2267         return 0;
2268 }
2269
2270 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271         _cleanup_free_ char *src_abs = NULL;
2272         int r;
2273
2274         assert(source);
2275
2276         src_abs = path_join(root, source);
2277         if (!src_abs)
2278                 return -ENOMEM;
2279
2280         STRV_FOREACH(dst, symlinks) {
2281                 _cleanup_free_ char *dst_abs = NULL;
2282
2283                 dst_abs = path_join(root, *dst);
2284                 if (!dst_abs)
2285                         return -ENOMEM;
2286
2287                 r = mkdir_parents_label(dst_abs, 0755);
2288                 if (r < 0)
2289                         return r;
2290
2291                 r = symlink_idempotent(src_abs, dst_abs, true);
2292                 if (r < 0)
2293                         return r;
2294         }
2295
2296         return 0;
2297 }
2298
2299 static int setup_exec_directory(
2300                 const ExecContext *context,
2301                 const ExecParameters *params,
2302                 uid_t uid,
2303                 gid_t gid,
2304                 ExecDirectoryType type,
2305                 bool needs_mount_namespace,
2306                 int *exit_status) {
2307
2308         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2309                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2310                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2311                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2312                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2313                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2314         };
2315         int r;
2316
2317         assert(context);
2318         assert(params);
2319         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2320         assert(exit_status);
2321
2322         if (!params->prefix[type])
2323                 return 0;
2324
2325         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2326                 if (!uid_is_valid(uid))
2327                         uid = 0;
2328                 if (!gid_is_valid(gid))
2329                         gid = 0;
2330         }
2331
2332         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2333                 _cleanup_free_ char *p = NULL, *pp = NULL;
2334
2335                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2336                 if (!p) {
2337                         r = -ENOMEM;
2338                         goto fail;
2339                 }
2340
2341                 r = mkdir_parents_label(p, 0755);
2342                 if (r < 0)
2343                         goto fail;
2344
2345                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2346
2347                         /* If we are in user mode, and a configuration directory exists but a state directory
2348                          * doesn't exist, then we likely are upgrading from an older systemd version that
2349                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2350                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2351                          * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2352                          * separated. If a service has both dirs configured but only the configuration dir
2353                          * exists and the state dir does not, we assume we are looking at an update
2354                          * situation. Hence, create a compatibility symlink, so that all expectations are
2355                          * met.
2356                          *
2357                          * (We also do something similar with the log directory, which still doesn't exist in
2358                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2359
2360                         /* this assumes the state dir is always created before the configuration dir */
2361                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2362                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2363
2364                         r = laccess(p, F_OK);
2365                         if (r == -ENOENT) {
2366                                 _cleanup_free_ char *q = NULL;
2367
2368                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2369                                  * under the configuration hierarchy. */
2370
2371                                 if (type == EXEC_DIRECTORY_STATE)
2372                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2373                                 else if (type == EXEC_DIRECTORY_LOGS)
2374                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2375                                 else
2376                                         assert_not_reached();
2377                                 if (!q) {
2378                                         r = -ENOMEM;
2379                                         goto fail;
2380                                 }
2381
2382                                 r = laccess(q, F_OK);
2383                                 if (r >= 0) {
2384                                         /* It does exist! This hence looks like an update. Symlink the
2385                                          * configuration directory into the state directory. */
2386
2387                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2388                                         if (r < 0)
2389                                                 goto fail;
2390
2391                                         log_exec_notice(context, params, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2392                                         continue;
2393                                 } else if (r != -ENOENT)
2394                                         log_exec_warning_errno(context, params, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2395
2396                         } else if (r < 0)
2397                                 log_exec_warning_errno(context, params, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2398                 }
2399
2400                 if (exec_directory_is_private(context, type)) {
2401                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2402                          * case we want to avoid leaving a directory around fully accessible that is owned by
2403                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2404                          * trick used by container managers to prohibit host users to get access to files of
2405                          * the same UID in containers: we place everything inside a directory that has an
2406                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2407                          * for unprivileged host code. We then use fs namespacing to make this directory
2408                          * permeable for the service itself.
2409                          *
2410                          * Specifically: for a service which wants a special directory "foo/" we first create
2411                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2412                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2413                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2414                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2415                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2416                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2417                          * for the service and making sure it only gets access to the dirs it needs but no
2418                          * others. Tricky? Yes, absolutely, but it works!
2419                          *
2420                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2421                          * to be owned by the service itself.
2422                          *
2423                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2424                          * for sharing files or sockets with other services. */
2425
2426                         pp = path_join(params->prefix[type], "private");
2427                         if (!pp) {
2428                                 r = -ENOMEM;
2429                                 goto fail;
2430                         }
2431
2432                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2433                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2434                         if (r < 0)
2435                                 goto fail;
2436
2437                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2438                                 r = -ENOMEM;
2439                                 goto fail;
2440                         }
2441
2442                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2443                         r = mkdir_parents_label(pp, 0755);
2444                         if (r < 0)
2445                                 goto fail;
2446
2447                         if (is_dir(p, false) > 0 &&
2448                             (laccess(pp, F_OK) == -ENOENT)) {
2449
2450                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2451                                  * it over. Most likely the service has been upgraded from one that didn't use
2452                                  * DynamicUser=1, to one that does. */
2453
2454                                 log_exec_info(context,
2455                                               params,
2456                                               "Found pre-existing public %s= directory %s, migrating to %s.\n"
2457                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2458                                               exec_directory_type_to_string(type), p, pp);
2459
2460                                 r = RET_NERRNO(rename(p, pp));
2461                                 if (r < 0)
2462                                         goto fail;
2463                         } else {
2464                                 /* Otherwise, create the actual directory for the service */
2465
2466                                 r = mkdir_label(pp, context->directories[type].mode);
2467                                 if (r < 0 && r != -EEXIST)
2468                                         goto fail;
2469                         }
2470
2471                         if (!context->directories[type].items[i].only_create) {
2472                                 /* And link it up from the original place.
2473                                  * Notes
2474                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2475                                  *    the host, and a new one for the child namespace will be created later.
2476                                  * 2) It is not necessary to create this symlink when one of its parent
2477                                  *    directories is specified and already created. E.g.
2478                                  *        StateDirectory=foo foo/bar
2479                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2480                                  *        pp = "/var/lib/private/foo/bar"
2481                                  *        p = "/var/lib/foo/bar"
2482                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2483                                  *    we do not need to create the symlink, but we cannot create the symlink.
2484                                  *    See issue #24783. */
2485                                 r = symlink_idempotent(pp, p, true);
2486                                 if (r < 0)
2487                                         goto fail;
2488                         }
2489
2490                 } else {
2491                         _cleanup_free_ char *target = NULL;
2492
2493                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2494                             readlink_and_make_absolute(p, &target) >= 0) {
2495                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2496
2497                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2498                                  * by DynamicUser=1 (see above)?
2499                                  *
2500                                  * We do this for all directory types except for ConfigurationDirectory=,
2501                                  * since they all support the private/ symlink logic at least in some
2502                                  * configurations, see above. */
2503
2504                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2505                                 if (r < 0)
2506                                         goto fail;
2507
2508                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2509                                 if (!q) {
2510                                         r = -ENOMEM;
2511                                         goto fail;
2512                                 }
2513
2514                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2515                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2516                                 if (r < 0)
2517                                         goto fail;
2518
2519                                 if (path_equal(q_resolved, target_resolved)) {
2520
2521                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2522                                          * but is no longer. Let's move the directory back up. */
2523
2524                                         log_exec_info(context,
2525                                                       params,
2526                                                       "Found pre-existing private %s= directory %s, migrating to %s.\n"
2527                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2528                                                       exec_directory_type_to_string(type), q, p);
2529
2530                                         r = RET_NERRNO(unlink(p));
2531                                         if (r < 0)
2532                                                 goto fail;
2533
2534                                         r = RET_NERRNO(rename(q, p));
2535                                         if (r < 0)
2536                                                 goto fail;
2537                                 }
2538                         }
2539
2540                         r = mkdir_label(p, context->directories[type].mode);
2541                         if (r < 0) {
2542                                 if (r != -EEXIST)
2543                                         goto fail;
2544
2545                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2546                                         struct stat st;
2547
2548                                         /* Don't change the owner/access mode of the configuration directory,
2549                                          * as in the common case it is not written to by a service, and shall
2550                                          * not be writable. */
2551
2552                                         r = RET_NERRNO(stat(p, &st));
2553                                         if (r < 0)
2554                                                 goto fail;
2555
2556                                         /* Still complain if the access mode doesn't match */
2557                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2558                                                 log_exec_warning(context,
2559                                                                  params,
2560                                                                  "%s \'%s\' already exists but the mode is different. "
2561                                                                  "(File system: %o %sMode: %o)",
2562                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2563                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2564
2565                                         continue;
2566                                 }
2567                         }
2568                 }
2569
2570                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2571                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2572                  * current UID/GID ownership.) */
2573                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2574                 if (r < 0)
2575                         goto fail;
2576
2577                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2578                  * available to user code anyway */
2579                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2580                         continue;
2581
2582                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2583                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2584                  * assignments to exist. */
2585                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2586                 if (r < 0)
2587                         goto fail;
2588         }
2589
2590         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2591          * they are set up later, to allow configuring empty var/run/etc. */
2592         if (!needs_mount_namespace)
2593                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2594                         r = create_many_symlinks(params->prefix[type],
2595                                                  context->directories[type].items[i].path,
2596                                                  context->directories[type].items[i].symlinks);
2597                         if (r < 0)
2598                                 goto fail;
2599                 }
2600
2601         return 0;
2602
2603 fail:
2604         *exit_status = exit_status_table[type];
2605         return r;
2606 }
2607
2608 #if ENABLE_SMACK
2609 static int setup_smack(
2610                 const ExecParameters *params,
2611                 const ExecContext *context,
2612                 int executable_fd) {
2613         int r;
2614
2615         assert(params);
2616         assert(executable_fd >= 0);
2617
2618         if (context->smack_process_label) {
2619                 r = mac_smack_apply_pid(0, context->smack_process_label);
2620                 if (r < 0)
2621                         return r;
2622         } else if (params->fallback_smack_process_label) {
2623                 _cleanup_free_ char *exec_label = NULL;
2624
2625                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2626                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2627                         return r;
2628
2629                 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
2630                 if (r < 0)
2631                         return r;
2632         }
2633
2634         return 0;
2635 }
2636 #endif
2637
2638 static int compile_bind_mounts(
2639                 const ExecContext *context,
2640                 const ExecParameters *params,
2641                 BindMount **ret_bind_mounts,
2642                 size_t *ret_n_bind_mounts,
2643                 char ***ret_empty_directories) {
2644
2645         _cleanup_strv_free_ char **empty_directories = NULL;
2646         BindMount *bind_mounts = NULL;
2647         size_t n, h = 0;
2648         int r;
2649
2650         assert(context);
2651         assert(params);
2652         assert(ret_bind_mounts);
2653         assert(ret_n_bind_mounts);
2654         assert(ret_empty_directories);
2655
2656         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2657
2658         n = context->n_bind_mounts;
2659         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2660                 if (!params->prefix[t])
2661                         continue;
2662
2663                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2664                         n += !context->directories[t].items[i].only_create;
2665         }
2666
2667         if (n <= 0) {
2668                 *ret_bind_mounts = NULL;
2669                 *ret_n_bind_mounts = 0;
2670                 *ret_empty_directories = NULL;
2671                 return 0;
2672         }
2673
2674         bind_mounts = new(BindMount, n);
2675         if (!bind_mounts)
2676                 return -ENOMEM;
2677
2678         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2679                 BindMount *item = context->bind_mounts + i;
2680                 _cleanup_free_ char *s = NULL, *d = NULL;
2681
2682                 s = strdup(item->source);
2683                 if (!s)
2684                         return -ENOMEM;
2685
2686                 d = strdup(item->destination);
2687                 if (!d)
2688                         return -ENOMEM;
2689
2690                 bind_mounts[h++] = (BindMount) {
2691                         .source = TAKE_PTR(s),
2692                         .destination = TAKE_PTR(d),
2693                         .read_only = item->read_only,
2694                         .recursive = item->recursive,
2695                         .ignore_enoent = item->ignore_enoent,
2696                 };
2697         }
2698
2699         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2700                 if (!params->prefix[t])
2701                         continue;
2702
2703                 if (context->directories[t].n_items == 0)
2704                         continue;
2705
2706                 if (exec_directory_is_private(context, t) &&
2707                     !exec_context_with_rootfs(context)) {
2708                         char *private_root;
2709
2710                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2711                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2712                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2713
2714                         private_root = path_join(params->prefix[t], "private");
2715                         if (!private_root)
2716                                 return -ENOMEM;
2717
2718                         r = strv_consume(&empty_directories, private_root);
2719                         if (r < 0)
2720                                 return r;
2721                 }
2722
2723                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2724                         _cleanup_free_ char *s = NULL, *d = NULL;
2725
2726                         /* When one of the parent directories is in the list, we cannot create the symlink
2727                          * for the child directory. See also the comments in setup_exec_directory(). */
2728                         if (context->directories[t].items[i].only_create)
2729                                 continue;
2730
2731                         if (exec_directory_is_private(context, t))
2732                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2733                         else
2734                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2735                         if (!s)
2736                                 return -ENOMEM;
2737
2738                         if (exec_directory_is_private(context, t) &&
2739                             exec_context_with_rootfs(context))
2740                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2741                                  * directory is not created on the root directory. So, let's bind-mount the directory
2742                                  * on the 'non-private' place. */
2743                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2744                         else
2745                                 d = strdup(s);
2746                         if (!d)
2747                                 return -ENOMEM;
2748
2749                         bind_mounts[h++] = (BindMount) {
2750                                 .source = TAKE_PTR(s),
2751                                 .destination = TAKE_PTR(d),
2752                                 .read_only = false,
2753                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2754                                 .recursive = true,
2755                                 .ignore_enoent = false,
2756                         };
2757                 }
2758         }
2759
2760         assert(h == n);
2761
2762         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2763         *ret_n_bind_mounts = n;
2764         *ret_empty_directories = TAKE_PTR(empty_directories);
2765
2766         return (int) n;
2767 }
2768
2769 /* ret_symlinks will contain a list of pairs src:dest that describes
2770  * the symlinks to create later on. For example, the symlinks needed
2771  * to safely give private directories to DynamicUser=1 users. */
2772 static int compile_symlinks(
2773                 const ExecContext *context,
2774                 const ExecParameters *params,
2775                 bool setup_os_release_symlink,
2776                 char ***ret_symlinks) {
2777
2778         _cleanup_strv_free_ char **symlinks = NULL;
2779         int r;
2780
2781         assert(context);
2782         assert(params);
2783         assert(ret_symlinks);
2784
2785         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2786                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2787                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2788
2789                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2790                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2791
2792                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2793                                 dst_abs = path_join(params->prefix[dt], *symlink);
2794                                 if (!src_abs || !dst_abs)
2795                                         return -ENOMEM;
2796
2797                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2798                                 if (r < 0)
2799                                         return r;
2800                         }
2801
2802                         if (!exec_directory_is_private(context, dt) ||
2803                             exec_context_with_rootfs(context) ||
2804                             context->directories[dt].items[i].only_create)
2805                                 continue;
2806
2807                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2808                         if (!private_path)
2809                                 return -ENOMEM;
2810
2811                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2812                         if (!path)
2813                                 return -ENOMEM;
2814
2815                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2816                         if (r < 0)
2817                                 return r;
2818                 }
2819         }
2820
2821         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2822          * and readers will never get a half-written version. Note that, while the paths specified here are
2823          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2824          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2825         if (setup_os_release_symlink) {
2826                 r = strv_extend_many(
2827                                 &symlinks,
2828                                 "/run/host/.os-release-stage/os-release",
2829                                 "/run/host/os-release");
2830                 if (r < 0)
2831                         return r;
2832         }
2833
2834         *ret_symlinks = TAKE_PTR(symlinks);
2835
2836         return 0;
2837 }
2838
2839 static bool insist_on_sandboxing(
2840                 const ExecContext *context,
2841                 const char *root_dir,
2842                 const char *root_image,
2843                 const BindMount *bind_mounts,
2844                 size_t n_bind_mounts) {
2845
2846         assert(context);
2847         assert(n_bind_mounts == 0 || bind_mounts);
2848
2849         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2850          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2851          * rearrange stuff in a way we cannot ignore gracefully. */
2852
2853         if (context->n_temporary_filesystems > 0)
2854                 return true;
2855
2856         if (root_dir || root_image)
2857                 return true;
2858
2859         if (context->n_mount_images > 0)
2860                 return true;
2861
2862         if (context->dynamic_user)
2863                 return true;
2864
2865         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2866                 return true;
2867
2868         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2869          * essential. */
2870         for (size_t i = 0; i < n_bind_mounts; i++)
2871                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2872                         return true;
2873
2874         if (context->log_namespace)
2875                 return true;
2876
2877         return false;
2878 }
2879
2880 static int setup_ephemeral(
2881                 const ExecContext *context,
2882                 ExecRuntime *runtime,
2883                 char **root_image,            /* both input and output! modified if ephemeral logic enabled */
2884                 char **root_directory) {      /* ditto */
2885
2886         _cleanup_close_ int fd = -EBADF;
2887         _cleanup_free_ char *new_root = NULL;
2888         int r;
2889
2890         assert(context);
2891         assert(root_image);
2892         assert(root_directory);
2893
2894         if (!*root_image && !*root_directory)
2895                 return 0;
2896
2897         if (!runtime || !runtime->ephemeral_copy)
2898                 return 0;
2899
2900         assert(runtime->ephemeral_storage_socket[0] >= 0);
2901         assert(runtime->ephemeral_storage_socket[1] >= 0);
2902
2903         new_root = strdup(runtime->ephemeral_copy);
2904         if (!new_root)
2905                 return log_oom_debug();
2906
2907         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
2908         if (r < 0)
2909                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
2910
2911         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
2912
2913         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
2914         if (fd >= 0)
2915                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
2916                 return 0;
2917         if (fd != -EAGAIN)
2918                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
2919
2920         if (*root_image) {
2921                 log_debug("Making ephemeral copy of %s to %s", *root_image, new_root);
2922
2923                 fd = copy_file(*root_image,
2924                                new_root,
2925                                O_EXCL,
2926                                0600,
2927                                COPY_LOCK_BSD|
2928                                COPY_REFLINK|
2929                                COPY_CRTIME);
2930                 if (fd < 0)
2931                         return log_debug_errno(fd, "Failed to copy image %s to %s: %m",
2932                                                *root_image, new_root);
2933
2934                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
2935                  * which tends to not perform well in combination with lots of random writes.
2936                  *
2937                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
2938                  * copy, but we at least want to make the intention clear.
2939                  */
2940                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
2941                 if (r < 0)
2942                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", new_root);
2943         } else {
2944                 assert(*root_directory);
2945
2946                 log_debug("Making ephemeral snapshot of %s to %s", *root_directory, new_root);
2947
2948                 fd = btrfs_subvol_snapshot_at(
2949                                 AT_FDCWD, *root_directory,
2950                                 AT_FDCWD, new_root,
2951                                 BTRFS_SNAPSHOT_FALLBACK_COPY |
2952                                 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
2953                                 BTRFS_SNAPSHOT_RECURSIVE |
2954                                 BTRFS_SNAPSHOT_LOCK_BSD);
2955                 if (fd < 0)
2956                         return log_debug_errno(fd, "Failed to snapshot directory %s to %s: %m",
2957                                                *root_directory, new_root);
2958         }
2959
2960         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
2961         if (r < 0)
2962                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
2963
2964         if (*root_image)
2965                 free_and_replace(*root_image, new_root);
2966         else {
2967                 assert(*root_directory);
2968                 free_and_replace(*root_directory, new_root);
2969         }
2970
2971         return 1;
2972 }
2973
2974 static int verity_settings_prepare(
2975                 VeritySettings *verity,
2976                 const char *root_image,
2977                 const void *root_hash,
2978                 size_t root_hash_size,
2979                 const char *root_hash_path,
2980                 const void *root_hash_sig,
2981                 size_t root_hash_sig_size,
2982                 const char *root_hash_sig_path,
2983                 const char *verity_data_path) {
2984
2985         int r;
2986
2987         assert(verity);
2988
2989         if (root_hash) {
2990                 void *d;
2991
2992                 d = memdup(root_hash, root_hash_size);
2993                 if (!d)
2994                         return -ENOMEM;
2995
2996                 free_and_replace(verity->root_hash, d);
2997                 verity->root_hash_size = root_hash_size;
2998                 verity->designator = PARTITION_ROOT;
2999         }
3000
3001         if (root_hash_sig) {
3002                 void *d;
3003
3004                 d = memdup(root_hash_sig, root_hash_sig_size);
3005                 if (!d)
3006                         return -ENOMEM;
3007
3008                 free_and_replace(verity->root_hash_sig, d);
3009                 verity->root_hash_sig_size = root_hash_sig_size;
3010                 verity->designator = PARTITION_ROOT;
3011         }
3012
3013         if (verity_data_path) {
3014                 r = free_and_strdup(&verity->data_path, verity_data_path);
3015                 if (r < 0)
3016                         return r;
3017         }
3018
3019         r = verity_settings_load(
3020                         verity,
3021                         root_image,
3022                         root_hash_path,
3023                         root_hash_sig_path);
3024         if (r < 0)
3025                 return log_debug_errno(r, "Failed to load root hash: %m");
3026
3027         return 0;
3028 }
3029
3030 static int pick_versions(
3031                 const ExecContext *context,
3032                 const ExecParameters *params,
3033                 char **ret_root_image,
3034                 char **ret_root_directory) {
3035
3036         int r;
3037
3038         assert(context);
3039         assert(params);
3040         assert(ret_root_image);
3041         assert(ret_root_directory);
3042
3043         if (context->root_image) {
3044                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3045
3046                 r = path_pick(/* toplevel_path= */ NULL,
3047                               /* toplevel_fd= */ AT_FDCWD,
3048                               context->root_image,
3049                               &pick_filter_image_raw,
3050                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3051                               &result);
3052                 if (r < 0)
3053                         return r;
3054
3055                 if (!result.path)
3056                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_image);
3057
3058                 *ret_root_image = TAKE_PTR(result.path);
3059                 *ret_root_directory = NULL;
3060                 return r;
3061         }
3062
3063         if (context->root_directory) {
3064                 _cleanup_(pick_result_done) PickResult result = PICK_RESULT_NULL;
3065
3066                 r = path_pick(/* toplevel_path= */ NULL,
3067                               /* toplevel_fd= */ AT_FDCWD,
3068                               context->root_directory,
3069                               &pick_filter_image_dir,
3070                               PICK_ARCHITECTURE|PICK_TRIES|PICK_RESOLVE,
3071                               &result);
3072                 if (r < 0)
3073                         return r;
3074
3075                 if (!result.path)
3076                         return log_exec_debug_errno(context, params, SYNTHETIC_ERRNO(ENOENT), "No matching entry in .v/ directory %s found.", context->root_directory);
3077
3078                 *ret_root_image = NULL;
3079                 *ret_root_directory = TAKE_PTR(result.path);
3080                 return r;
3081         }
3082
3083         *ret_root_image = *ret_root_directory = NULL;
3084         return 0;
3085 }
3086
3087 static int apply_mount_namespace(
3088                 ExecCommandFlags command_flags,
3089                 const ExecContext *context,
3090                 const ExecParameters *params,
3091                 ExecRuntime *runtime,
3092                 const char *memory_pressure_path,
3093                 bool needs_sandboxing,
3094                 char **error_path) {
3095
3096         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3097         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3098                         **read_write_paths_cleanup = NULL;
3099         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3100                 *extension_dir = NULL, *host_os_release_stage = NULL, *root_image = NULL, *root_dir = NULL;
3101         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3102         char **read_write_paths;
3103         bool setup_os_release_symlink;
3104         BindMount *bind_mounts = NULL;
3105         size_t n_bind_mounts = 0;
3106         int r;
3107
3108         assert(context);
3109
3110         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3111
3112         if (params->flags & EXEC_APPLY_CHROOT) {
3113                 r = pick_versions(
3114                                 context,
3115                                 params,
3116                                 &root_image,
3117                                 &root_dir);
3118                 if (r < 0)
3119                         return r;
3120
3121                 r = setup_ephemeral(
3122                                 context,
3123                                 runtime,
3124                                 &root_image,
3125                                 &root_dir);
3126                 if (r < 0)
3127                         return r;
3128         }
3129
3130         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3131         if (r < 0)
3132                 return r;
3133
3134         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3135          * service will need to write to it in order to start the notifications. */
3136         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3137                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3138                 if (!read_write_paths_cleanup)
3139                         return -ENOMEM;
3140
3141                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3142                 if (r < 0)
3143                         return r;
3144
3145                 read_write_paths = read_write_paths_cleanup;
3146         } else
3147                 read_write_paths = context->read_write_paths;
3148
3149         if (needs_sandboxing) {
3150                 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3151                  * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3152                  * use here.  This does not apply when we are using /run/systemd/empty as fallback. */
3153
3154                 if (context->private_tmp && runtime && runtime->shared) {
3155                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3156                                 tmp_dir = runtime->shared->tmp_dir;
3157                         else if (runtime->shared->tmp_dir)
3158                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3159
3160                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3161                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3162                         else if (runtime->shared->var_tmp_dir)
3163                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3164                 }
3165         }
3166
3167         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3168         setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
3169         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3170         if (r < 0)
3171                 return r;
3172
3173         if (context->mount_propagation_flag == MS_SHARED)
3174                 log_exec_debug(context,
3175                                params,
3176                                "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3177
3178         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3179                 r = exec_context_get_credential_directory(context, params, params->unit_id, &creds_path);
3180                 if (r < 0)
3181                         return r;
3182         }
3183
3184         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3185                 propagate_dir = path_join("/run/systemd/propagate/", params->unit_id);
3186                 if (!propagate_dir)
3187                         return -ENOMEM;
3188
3189                 incoming_dir = strdup("/run/systemd/incoming");
3190                 if (!incoming_dir)
3191                         return -ENOMEM;
3192
3193                 extension_dir = strdup("/run/systemd/unit-extensions");
3194                 if (!extension_dir)
3195                         return -ENOMEM;
3196
3197                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3198                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3199                 if (setup_os_release_symlink) {
3200                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3201                         if (!host_os_release_stage)
3202                                 return -ENOMEM;
3203                 }
3204         } else {
3205                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3206
3207                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3208                         return -ENOMEM;
3209
3210                 if (setup_os_release_symlink) {
3211                         if (asprintf(&host_os_release_stage,
3212                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3213                                      geteuid()) < 0)
3214                                 return -ENOMEM;
3215                 }
3216         }
3217
3218         if (root_image) {
3219                 r = verity_settings_prepare(
3220                         &verity,
3221                         root_image,
3222                         context->root_hash, context->root_hash_size, context->root_hash_path,
3223                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3224                         context->root_verity);
3225                 if (r < 0)
3226                         return r;
3227         }
3228
3229         NamespaceParameters parameters = {
3230                 .runtime_scope = params->runtime_scope,
3231
3232                 .root_directory = root_dir,
3233                 .root_image = root_image,
3234                 .root_image_options = context->root_image_options,
3235                 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3236
3237                 .read_write_paths = read_write_paths,
3238                 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3239                 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3240
3241                 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3242                 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3243
3244                 .empty_directories = empty_directories,
3245                 .symlinks = symlinks,
3246
3247                 .bind_mounts = bind_mounts,
3248                 .n_bind_mounts = n_bind_mounts,
3249
3250                 .temporary_filesystems = context->temporary_filesystems,
3251                 .n_temporary_filesystems = context->n_temporary_filesystems,
3252
3253                 .mount_images = context->mount_images,
3254                 .n_mount_images = context->n_mount_images,
3255                 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3256
3257                 .tmp_dir = tmp_dir,
3258                 .var_tmp_dir = var_tmp_dir,
3259
3260                 .creds_path = creds_path,
3261                 .log_namespace = context->log_namespace,
3262                 .mount_propagation_flag = context->mount_propagation_flag,
3263
3264                 .verity = &verity,
3265
3266                 .extension_images = context->extension_images,
3267                 .n_extension_images = context->n_extension_images,
3268                 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3269                 .extension_directories = context->extension_directories,
3270
3271                 .propagate_dir = propagate_dir,
3272                 .incoming_dir = incoming_dir,
3273                 .extension_dir = extension_dir,
3274                 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3275                 .host_os_release_stage = host_os_release_stage,
3276
3277                 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3278                  * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3279                  * sandbox inside the mount namespace. */
3280                 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3281
3282                 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3283                 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3284                 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3285                 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3286                 .protect_hostname = needs_sandboxing && context->protect_hostname,
3287
3288                 .private_dev = needs_sandboxing && context->private_devices,
3289                 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3290                 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3291
3292                 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3293
3294                 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3295                 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3296
3297                 .protect_home = needs_sandboxing ? context->protect_home : false,
3298                 .protect_system = needs_sandboxing ? context->protect_system : false,
3299                 .protect_proc = needs_sandboxing ? context->protect_proc : false,
3300                 .proc_subset = needs_sandboxing ? context->proc_subset : false,
3301         };
3302
3303         r = setup_namespace(&parameters, error_path);
3304         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3305          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3306          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3307          * completely different execution environment. */
3308         if (r == -ENOANO) {
3309                 if (insist_on_sandboxing(
3310                                     context,
3311                                     root_dir, root_image,
3312                                     bind_mounts,
3313                                     n_bind_mounts))
3314                         return log_exec_debug_errno(context,
3315                                                     params,
3316                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3317                                                     "Failed to set up namespace, and refusing to continue since "
3318                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3319                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3320                                                     n_bind_mounts,
3321                                                     context->n_temporary_filesystems,
3322                                                     yes_no(root_dir),
3323                                                     yes_no(root_image),
3324                                                     yes_no(context->dynamic_user));
3325
3326                 log_exec_debug(context, params, "Failed to set up namespace, assuming containerized execution and ignoring.");
3327                 return 0;
3328         }
3329
3330         return r;
3331 }
3332
3333 static int apply_working_directory(
3334                 const ExecContext *context,
3335                 const ExecParameters *params,
3336                 ExecRuntime *runtime,
3337                 const char *home,
3338                 int *exit_status) {
3339
3340         const char *wd;
3341         int r;
3342
3343         assert(context);
3344         assert(exit_status);
3345
3346         if (context->working_directory_home) {
3347                 if (!home) {
3348                         *exit_status = EXIT_CHDIR;
3349                         return -ENXIO;
3350                 }
3351
3352                 wd = home;
3353         } else
3354                 wd = empty_to_root(context->working_directory);
3355
3356         if (params->flags & EXEC_APPLY_CHROOT)
3357                 r = RET_NERRNO(chdir(wd));
3358         else {
3359                 _cleanup_close_ int dfd = -EBADF;
3360
3361                 r = chase(wd,
3362                           (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory,
3363                           CHASE_PREFIX_ROOT|CHASE_AT_RESOLVE_IN_ROOT,
3364                           /* ret_path= */ NULL,
3365                           &dfd);
3366                 if (r >= 0)
3367                         r = RET_NERRNO(fchdir(dfd));
3368         }
3369
3370         if (r < 0 && !context->working_directory_missing_ok) {
3371                 *exit_status = EXIT_CHDIR;
3372                 return r;
3373         }
3374
3375         return 0;
3376 }
3377
3378 static int apply_root_directory(
3379                 const ExecContext *context,
3380                 const ExecParameters *params,
3381                 ExecRuntime *runtime,
3382                 const bool needs_mount_ns,
3383                 int *exit_status) {
3384
3385         assert(context);
3386         assert(exit_status);
3387
3388         if (params->flags & EXEC_APPLY_CHROOT)
3389                 if (!needs_mount_ns && context->root_directory)
3390                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3391                                 *exit_status = EXIT_CHROOT;
3392                                 return -errno;
3393                         }
3394
3395         return 0;
3396 }
3397
3398 static int setup_keyring(
3399                 const ExecContext *context,
3400                 const ExecParameters *p,
3401                 uid_t uid, gid_t gid) {
3402
3403         key_serial_t keyring;
3404         int r = 0;
3405         uid_t saved_uid;
3406         gid_t saved_gid;
3407
3408         assert(context);
3409         assert(p);
3410
3411         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3412          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3413          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3414          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3415          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3416          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3417
3418         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3419                 return 0;
3420
3421         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3422          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3423          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3424          * & group is just as nasty as acquiring a reference to the user keyring. */
3425
3426         saved_uid = getuid();
3427         saved_gid = getgid();
3428
3429         if (gid_is_valid(gid) && gid != saved_gid) {
3430                 if (setregid(gid, -1) < 0)
3431                         return log_exec_error_errno(context,
3432                                                     p,
3433                                                     errno,
3434                                                     "Failed to change GID for user keyring: %m");
3435         }
3436
3437         if (uid_is_valid(uid) && uid != saved_uid) {
3438                 if (setreuid(uid, -1) < 0) {
3439                         r = log_exec_error_errno(context,
3440                                                  p,
3441                                                  errno,
3442                                                  "Failed to change UID for user keyring: %m");
3443                         goto out;
3444                 }
3445         }
3446
3447         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3448         if (keyring == -1) {
3449                 if (errno == ENOSYS)
3450                         log_exec_debug_errno(context,
3451                                              p,
3452                                              errno,
3453                                              "Kernel keyring not supported, ignoring.");
3454                 else if (ERRNO_IS_PRIVILEGE(errno))
3455                         log_exec_debug_errno(context,
3456                                              p,
3457                                              errno,
3458                                              "Kernel keyring access prohibited, ignoring.");
3459                 else if (errno == EDQUOT)
3460                         log_exec_debug_errno(context,
3461                                              p,
3462                                              errno,
3463                                              "Out of kernel keyrings to allocate, ignoring.");
3464                 else
3465                         r = log_exec_error_errno(context,
3466                                                  p,
3467                                                  errno,
3468                                                  "Setting up kernel keyring failed: %m");
3469
3470                 goto out;
3471         }
3472
3473         /* When requested link the user keyring into the session keyring. */
3474         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3475
3476                 if (keyctl(KEYCTL_LINK,
3477                            KEY_SPEC_USER_KEYRING,
3478                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3479                         r = log_exec_error_errno(context,
3480                                                  p,
3481                                                  errno,
3482                                                  "Failed to link user keyring into session keyring: %m");
3483                         goto out;
3484                 }
3485         }
3486
3487         /* Restore uid/gid back */
3488         if (uid_is_valid(uid) && uid != saved_uid) {
3489                 if (setreuid(saved_uid, -1) < 0) {
3490                         r = log_exec_error_errno(context,
3491                                                  p,
3492                                                  errno,
3493                                                  "Failed to change UID back for user keyring: %m");
3494                         goto out;
3495                 }
3496         }
3497
3498         if (gid_is_valid(gid) && gid != saved_gid) {
3499                 if (setregid(saved_gid, -1) < 0)
3500                         return log_exec_error_errno(context,
3501                                                     p,
3502                                                     errno,
3503                                                     "Failed to change GID back for user keyring: %m");
3504         }
3505
3506         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3507         if (!sd_id128_is_null(p->invocation_id)) {
3508                 key_serial_t key;
3509
3510                 key = add_key("user",
3511                               "invocation_id",
3512                               &p->invocation_id,
3513                               sizeof(p->invocation_id),
3514                               KEY_SPEC_SESSION_KEYRING);
3515                 if (key == -1)
3516                         log_exec_debug_errno(context,
3517                                              p,
3518                                              errno,
3519                                              "Failed to add invocation ID to keyring, ignoring: %m");
3520                 else {
3521                         if (keyctl(KEYCTL_SETPERM, key,
3522                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3523                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3524                                 r = log_exec_error_errno(context,
3525                                                          p,
3526                                                          errno,
3527                                                          "Failed to restrict invocation ID permission: %m");
3528                 }
3529         }
3530
3531 out:
3532         /* Revert back uid & gid for the last time, and exit */
3533         /* no extra logging, as only the first already reported error matters */
3534         if (getuid() != saved_uid)
3535                 (void) setreuid(saved_uid, -1);
3536
3537         if (getgid() != saved_gid)
3538                 (void) setregid(saved_gid, -1);
3539
3540         return r;
3541 }
3542
/* Appends each non-negative fd of 'pair' to 'array', first pair[0] and then pair[1], and advances *n by
 * the number of entries written. The caller has to make sure that 'array' has room for up to two more
 * entries starting at index *n. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3553
3554 static int close_remaining_fds(
3555                 const ExecParameters *params,
3556                 const ExecRuntime *runtime,
3557                 int socket_fd,
3558                 const int *fds, size_t n_fds) {
3559
3560         size_t n_dont_close = 0;
3561         int dont_close[n_fds + 14];
3562
3563         assert(params);
3564
3565         if (params->stdin_fd >= 0)
3566                 dont_close[n_dont_close++] = params->stdin_fd;
3567         if (params->stdout_fd >= 0)
3568                 dont_close[n_dont_close++] = params->stdout_fd;
3569         if (params->stderr_fd >= 0)
3570                 dont_close[n_dont_close++] = params->stderr_fd;
3571
3572         if (socket_fd >= 0)
3573                 dont_close[n_dont_close++] = socket_fd;
3574         if (n_fds > 0) {
3575                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3576                 n_dont_close += n_fds;
3577         }
3578
3579         if (runtime)
3580                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3581
3582         if (runtime && runtime->shared) {
3583                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3584                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3585         }
3586
3587         if (runtime && runtime->dynamic_creds) {
3588                 if (runtime->dynamic_creds->user)
3589                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3590                 if (runtime->dynamic_creds->group)
3591                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3592         }
3593
3594         if (params->user_lookup_fd >= 0)
3595                 dont_close[n_dont_close++] = params->user_lookup_fd;
3596
3597         return close_all_fds(dont_close, n_dont_close);
3598 }
3599
/* Reports the resolved UID/GID on user_lookup_fd, in a single writev() call that carries the UID, the
 * GID and the unit name, in this order. According to the original note this is a single datagram
 * addressed to PID 1.
 *
 * Nothing is sent, and 0 is returned, if user_lookup_fd is negative or if neither uid nor gid is valid,
 * i.e. if there was no user/group to resolve. Returns 0 on success, -errno if writev() fails. */
static int send_user_lookup(
                const char *unit_id,
                int user_lookup_fd,
                uid_t uid,
                gid_t gid) {

        assert(unit_id);

        if (user_lookup_fd < 0 || (!uid_is_valid(uid) && !gid_is_valid(gid)))
                return 0;

        struct iovec iov[] = {
                IOVEC_MAKE(&uid, sizeof(uid)),
                IOVEC_MAKE(&gid, sizeof(gid)),
                IOVEC_MAKE_STRING(unit_id),
        };

        if (writev(user_lookup_fd, iov, sizeof(iov) / sizeof(iov[0])) < 0)
                return -errno;

        return 0;
}
3627
3628 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3629         int r;
3630
3631         assert(c);
3632         assert(home);
3633         assert(buf);
3634
3635         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3636
3637         if (*home)
3638                 return 0;
3639
3640         if (!c->working_directory_home)
3641                 return 0;
3642
3643         r = get_home_dir(buf);
3644         if (r < 0)
3645                 return r;
3646
3647         *home = *buf;
3648         return 1;
3649 }
3650
3651 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3652         _cleanup_strv_free_ char ** list = NULL;
3653         int r;
3654
3655         assert(c);
3656         assert(p);
3657         assert(ret);
3658
3659         assert(c->dynamic_user);
3660
3661         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3662          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3663          * directories. */
3664
3665         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3666                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3667                         continue;
3668
3669                 if (!p->prefix[t])
3670                         continue;
3671
3672                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3673                         char *e;
3674
3675                         if (exec_directory_is_private(c, t))
3676                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3677                         else
3678                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3679                         if (!e)
3680                                 return -ENOMEM;
3681
3682                         r = strv_consume(&list, e);
3683                         if (r < 0)
3684                                 return r;
3685                 }
3686         }
3687
3688         *ret = TAKE_PTR(list);
3689
3690         return 0;
3691 }
3692
3693 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3694         _cleanup_(cpu_set_reset) CPUSet s = {};
3695         int r;
3696
3697         assert(c);
3698         assert(ret);
3699
3700         if (!c->numa_policy.nodes.set) {
3701                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3702                 return 0;
3703         }
3704
3705         r = numa_to_cpu_set(&c->numa_policy, &s);
3706         if (r < 0)
3707                 return r;
3708
3709         cpu_set_reset(ret);
3710
3711         return cpu_set_add_all(ret, &s);
3712 }
3713
3714 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int *fd) {
3715         int r;
3716
3717         assert(fds);
3718         assert(n_fds);
3719         assert(*n_fds < fds_size);
3720         assert(fd);
3721
3722         if (*fd < 0)
3723                return 0;
3724
3725         if (*fd < 3 + (int) *n_fds) {
3726                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3727                  * the fds we pass to the process (or which are closed only during execve). */
3728
3729                 r = fcntl(*fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3730                 if (r < 0)
3731                         return -errno;
3732
3733                 close_and_replace(*fd, r);
3734         }
3735
3736         fds[(*n_fds)++] = *fd;
3737         return 1;
3738 }
3739
3740 static int connect_unix_harder(const ExecContext *c, const ExecParameters *p, const OpenFile *of, int ofd) {
3741         union sockaddr_union addr = {
3742                 .un.sun_family = AF_UNIX,
3743         };
3744         socklen_t sa_len;
3745         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3746         int r;
3747
3748         assert(c);
3749         assert(p);
3750         assert(of);
3751         assert(ofd >= 0);
3752
3753         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3754         if (r < 0)
3755                 return log_exec_error_errno(c, p, r, "Failed to set sockaddr for %s: %m", of->path);
3756
3757         sa_len = r;
3758
3759         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3760                 _cleanup_close_ int fd = -EBADF;
3761
3762                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3763                 if (fd < 0)
3764                         return log_exec_error_errno(c,
3765                                                     p,
3766                                                     errno,
3767                                                     "Failed to create socket for %s: %m",
3768                                                     of->path);
3769
3770                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3771                 if (r == -EPROTOTYPE)
3772                         continue;
3773                 if (r < 0)
3774                         return log_exec_error_errno(c,
3775                                                     p,
3776                                                     r,
3777                                                     "Failed to connect socket for %s: %m",
3778                                                     of->path);
3779
3780                 return TAKE_FD(fd);
3781         }
3782
3783         return log_exec_error_errno(c,
3784                                     p,
3785                                     SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".",
3786                                     of->path);
3787 }
3788
3789 static int get_open_file_fd(const ExecContext *c, const ExecParameters *p, const OpenFile *of) {
3790         struct stat st;
3791         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3792
3793         assert(c);
3794         assert(p);
3795         assert(of);
3796
3797         ofd = open(of->path, O_PATH | O_CLOEXEC);
3798         if (ofd < 0)
3799                 return log_exec_error_errno(c, p, errno, "Could not open \"%s\": %m", of->path);
3800
3801         if (fstat(ofd, &st) < 0)
3802                 return log_exec_error_errno(c, p, errno, "Failed to stat %s: %m", of->path);
3803
3804         if (S_ISSOCK(st.st_mode)) {
3805                 fd = connect_unix_harder(c, p, of, ofd);
3806                 if (fd < 0)
3807                         return fd;
3808
3809                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3810                         return log_exec_error_errno(c, p, errno, "Failed to shutdown send for socket %s: %m",
3811                                                     of->path);
3812
3813                 log_exec_debug(c, p, "socket %s opened (fd=%d)", of->path, fd);
3814         } else {
3815                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3816                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3817                         flags |= O_APPEND;
3818                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3819                         flags |= O_TRUNC;
3820
3821                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3822                 if (fd < 0)
3823                         return log_exec_error_errno(c, p, fd, "Failed to open file %s: %m", of->path);
3824
3825                 log_exec_debug(c, p, "file %s opened (fd=%d)", of->path, fd);
3826         }
3827
3828         return TAKE_FD(fd);
3829 }
3830
3831 static int collect_open_file_fds(const ExecContext *c, ExecParameters *p, size_t *n_fds) {
3832         int r;
3833
3834         assert(c);
3835         assert(p);
3836         assert(n_fds);
3837
3838         LIST_FOREACH(open_files, of, p->open_files) {
3839                 _cleanup_close_ int fd = -EBADF;
3840
3841                 fd = get_open_file_fd(c, p, of);
3842                 if (fd < 0) {
3843                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3844                                 log_exec_debug_errno(c, p, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3845                                 continue;
3846                         }
3847
3848                         return fd;
3849                 }
3850
3851                 if (!GREEDY_REALLOC(p->fds, *n_fds + 1))
3852                         return -ENOMEM;
3853
3854                 r = strv_extend(&p->fd_names, of->fdname);
3855                 if (r < 0)
3856                         return r;
3857
3858                 p->fds[*n_fds] = TAKE_FD(fd);
3859
3860                 (*n_fds)++;
3861         }
3862
3863         return 0;
3864 }
3865
3866 static void log_command_line(
3867                 const ExecContext *context,
3868                 const ExecParameters *params,
3869                 const char *msg,
3870                 const char *executable,
3871                 char **argv) {
3872
3873         assert(context);
3874         assert(params);
3875         assert(msg);
3876         assert(executable);
3877
3878         if (!DEBUG_LOGGING)
3879                 return;
3880
3881         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3882
3883         log_exec_struct(context, params, LOG_DEBUG,
3884                         "EXECUTABLE=%s", executable,
3885                         LOG_EXEC_MESSAGE(params, "%s: %s", msg, strnull(cmdline)),
3886                         LOG_EXEC_INVOCATION_ID(params));
3887 }
3888
3889 static bool exec_context_need_unprivileged_private_users(
3890                 const ExecContext *context,
3891                 const ExecParameters *params) {
3892
3893         assert(context);
3894         assert(params);
3895
3896         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3897          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3898          * (system manager) then we have privileges and don't need this. */
3899         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3900                 return false;
3901
3902         return context->private_users ||
3903                context->private_tmp ||
3904                context->private_devices ||
3905                context->private_network ||
3906                context->network_namespace_path ||
3907                context->private_ipc ||
3908                context->ipc_namespace_path ||
3909                context->private_mounts > 0 ||
3910                context->mount_apivfs ||
3911                context->n_bind_mounts > 0 ||
3912                context->n_temporary_filesystems > 0 ||
3913                context->root_directory ||
3914                !strv_isempty(context->extension_directories) ||
3915                context->protect_system != PROTECT_SYSTEM_NO ||
3916                context->protect_home != PROTECT_HOME_NO ||
3917                context->protect_kernel_tunables ||
3918                context->protect_kernel_modules ||
3919                context->protect_kernel_logs ||
3920                context->protect_control_groups ||
3921                context->protect_clock ||
3922                context->protect_hostname ||
3923                !strv_isempty(context->read_write_paths) ||
3924                !strv_isempty(context->read_only_paths) ||
3925                !strv_isempty(context->inaccessible_paths) ||
3926                !strv_isempty(context->exec_paths) ||
3927                !strv_isempty(context->no_exec_paths);
3928 }
3929
3930 static bool exec_context_shall_confirm_spawn(const ExecContext *context) {
3931         assert(context);
3932
3933         if (confirm_spawn_disabled())
3934                 return false;
3935
3936         /* For some reasons units remaining in the same process group
3937          * as PID 1 fail to acquire the console even if it's not used
3938          * by any process. So skip the confirmation question for them. */
3939         return !context->same_pgrp;
3940 }
3941
3942 static int exec_context_named_iofds(
3943                 const ExecContext *c,
3944                 const ExecParameters *p,
3945                 int named_iofds[static 3]) {
3946
3947         size_t targets;
3948         const char* stdio_fdname[3];
3949         size_t n_fds;
3950
3951         assert(c);
3952         assert(p);
3953         assert(named_iofds);
3954
3955         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3956                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3957                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3958
3959         for (size_t i = 0; i < 3; i++)
3960                 stdio_fdname[i] = exec_context_fdname(c, i);
3961
3962         n_fds = p->n_storage_fds + p->n_socket_fds;
3963
3964         for (size_t i = 0; i < n_fds  && targets > 0; i++)
3965                 if (named_iofds[STDIN_FILENO] < 0 &&
3966                     c->std_input == EXEC_INPUT_NAMED_FD &&
3967                     stdio_fdname[STDIN_FILENO] &&
3968                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3969
3970                         named_iofds[STDIN_FILENO] = p->fds[i];
3971                         targets--;
3972
3973                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3974                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3975                            stdio_fdname[STDOUT_FILENO] &&
3976                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3977
3978                         named_iofds[STDOUT_FILENO] = p->fds[i];
3979                         targets--;
3980
3981                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3982                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3983                            stdio_fdname[STDERR_FILENO] &&
3984                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3985
3986                         named_iofds[STDERR_FILENO] = p->fds[i];
3987                         targets--;
3988                 }
3989
3990         return targets == 0 ? 0 : -ENOENT;
3991 }
3992
3993 static void exec_shared_runtime_close(ExecSharedRuntime *shared) {
3994         if (!shared)
3995                 return;
3996
3997         safe_close_pair(shared->netns_storage_socket);
3998         safe_close_pair(shared->ipcns_storage_socket);
3999 }
4000
4001 static void exec_runtime_close(ExecRuntime *rt) {
4002         if (!rt)
4003                 return;
4004
4005         safe_close_pair(rt->ephemeral_storage_socket);
4006
4007         exec_shared_runtime_close(rt->shared);
4008         dynamic_creds_close(rt->dynamic_creds);
4009 }
4010
4011 static void exec_params_close(ExecParameters *p) {
4012         if (!p)
4013                 return;
4014
4015         p->stdin_fd = safe_close(p->stdin_fd);
4016         p->stdout_fd = safe_close(p->stdout_fd);
4017         p->stderr_fd = safe_close(p->stderr_fd);
4018 }
4019
4020 int exec_invoke(
4021                 const ExecCommand *command,
4022                 const ExecContext *context,
4023                 ExecParameters *params,
4024                 ExecRuntime *runtime,
4025                 const CGroupContext *cgroup_context,
4026                 int *exit_status) {
4027
4028         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4029         int r, ngids = 0;
4030         _cleanup_free_ gid_t *supplementary_gids = NULL;
4031         const char *username = NULL, *groupname = NULL;
4032         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4033         const char *home = NULL, *shell = NULL;
4034         char **final_argv = NULL;
4035         dev_t journal_stream_dev = 0;
4036         ino_t journal_stream_ino = 0;
4037         bool userns_set_up = false;
4038         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4039                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4040                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4041                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4042         bool keep_seccomp_privileges = false;
4043 #if HAVE_SELINUX
4044         _cleanup_free_ char *mac_selinux_context_net = NULL;
4045         bool use_selinux = false;
4046 #endif
4047 #if ENABLE_SMACK
4048         bool use_smack = false;
4049 #endif
4050 #if HAVE_APPARMOR
4051         bool use_apparmor = false;
4052 #endif
4053 #if HAVE_SECCOMP
4054         uint64_t saved_bset = 0;
4055 #endif
4056         uid_t saved_uid = getuid();
4057         gid_t saved_gid = getgid();
4058         uid_t uid = UID_INVALID;
4059         gid_t gid = GID_INVALID;
4060         size_t n_fds, /* fds to pass to the child */
4061                n_keep_fds; /* total number of fds not to close */
4062         int secure_bits;
4063         _cleanup_free_ gid_t *gids_after_pam = NULL;
4064         int ngids_after_pam = 0;
4065
4066         int socket_fd = -EBADF, named_iofds[3] = EBADF_TRIPLET;
4067         size_t n_storage_fds, n_socket_fds;
4068
4069         assert(command);
4070         assert(context);
4071         assert(params);
4072         assert(exit_status);
4073
4074         /* This should be mostly redundant, as the log level is also passed as an argument of the executor,
4075          * and is already applied earlier. Just for safety. */
4076         if (context->log_level_max >= 0)
4077                 log_set_max_level(context->log_level_max);
4078
4079         /* Explicitly test for CVE-2021-4034 inspired invocations */
4080         if (!command->path || strv_isempty(command->argv)) {
4081                 *exit_status = EXIT_EXEC;
4082                 return log_exec_error_errno(
4083                                 context,
4084                                 params,
4085                                 SYNTHETIC_ERRNO(EINVAL),
4086                                 "Invalid command line arguments.");
4087         }
4088
4089         LOG_CONTEXT_PUSH_EXEC(context, params);
4090
4091         if (context->std_input == EXEC_INPUT_SOCKET ||
4092             context->std_output == EXEC_OUTPUT_SOCKET ||
4093             context->std_error == EXEC_OUTPUT_SOCKET) {
4094
4095                 if (params->n_socket_fds > 1)
4096                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4097
4098                 if (params->n_socket_fds == 0)
4099                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4100
4101                 socket_fd = params->fds[0];
4102                 n_storage_fds = n_socket_fds = 0;
4103         } else {
4104                 n_socket_fds = params->n_socket_fds;
4105                 n_storage_fds = params->n_storage_fds;
4106         }
4107         n_fds = n_socket_fds + n_storage_fds;
4108
4109         r = exec_context_named_iofds(context, params, named_iofds);
4110         if (r < 0)
4111                 return log_exec_error_errno(context, params, r, "Failed to load a named file descriptor: %m");
4112
4113         rename_process_from_path(command->path);
4114
4115         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4116          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4117          * both of which will be demoted to SIG_DFL. */
4118         (void) default_signals(SIGNALS_CRASH_HANDLER,
4119                                SIGNALS_IGNORE);
4120
4121         if (context->ignore_sigpipe)
4122                 (void) ignore_signals(SIGPIPE);
4123
4124         r = reset_signal_mask();
4125         if (r < 0) {
4126                 *exit_status = EXIT_SIGNAL_MASK;
4127                 return log_exec_error_errno(context, params, r, "Failed to set process signal mask: %m");
4128         }
4129
4130         if (params->idle_pipe)
4131                 do_idle_pipe_dance(params->idle_pipe);
4132
4133         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4134          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4135          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4136          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4137
4138         log_forget_fds();
4139         log_set_open_when_needed(true);
4140         log_settle_target();
4141
4142         /* In case anything used libc syslog(), close this here, too */
4143         closelog();
4144
4145         r = collect_open_file_fds(context, params, &n_fds);
4146         if (r < 0) {
4147                 *exit_status = EXIT_FDS;
4148                 return log_exec_error_errno(context, params, r, "Failed to get OpenFile= file descriptors: %m");
4149         }
4150
4151         int keep_fds[n_fds + 3];
4152         memcpy_safe(keep_fds, params->fds, n_fds * sizeof(int));
4153         n_keep_fds = n_fds;
4154
4155         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->exec_fd);
4156         if (r < 0) {
4157                 *exit_status = EXIT_FDS;
4158                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4159         }
4160
4161 #if HAVE_LIBBPF
4162         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &params->bpf_restrict_fs_map_fd);
4163         if (r < 0) {
4164                 *exit_status = EXIT_FDS;
4165                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4166         }
4167 #endif
4168
4169         r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
4170         if (r < 0) {
4171                 *exit_status = EXIT_FDS;
4172                 return log_exec_error_errno(context, params, r, "Failed to close unwanted file descriptors: %m");
4173         }
4174
4175         if (!context->same_pgrp &&
4176             setsid() < 0) {
4177                 *exit_status = EXIT_SETSID;
4178                 return log_exec_error_errno(context, params, errno, "Failed to create new process session: %m");
4179         }
4180
4181         exec_context_tty_reset(context, params);
4182
4183         if (params->shall_confirm_spawn && exec_context_shall_confirm_spawn(context)) {
4184                 _cleanup_free_ char *cmdline = NULL;
4185
4186                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4187                 if (!cmdline) {
4188                         *exit_status = EXIT_MEMORY;
4189                         return log_oom();
4190                 }
4191
4192                 r = ask_for_confirmation(context, params, cmdline);
4193                 if (r != CONFIRM_EXECUTE) {
4194                         if (r == CONFIRM_PRETEND_SUCCESS) {
4195                                 *exit_status = EXIT_SUCCESS;
4196                                 return 0;
4197                         }
4198
4199                         *exit_status = EXIT_CONFIRM;
4200                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ECANCELED),
4201                                                     "Execution cancelled by the user");
4202                 }
4203         }
4204
4205         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4206          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4207          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4208          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4209          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4210         if (setenv("SYSTEMD_ACTIVATION_UNIT", params->unit_id, true) != 0 ||
4211             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4212                 *exit_status = EXIT_MEMORY;
4213                 return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4214         }
4215
4216         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4217                 _cleanup_strv_free_ char **suggested_paths = NULL;
4218
4219                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4220                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4221                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4222                         *exit_status = EXIT_USER;
4223                         return log_exec_error_errno(context, params, errno, "Failed to update environment: %m");
4224                 }
4225
4226                 r = compile_suggested_paths(context, params, &suggested_paths);
4227                 if (r < 0) {
4228                         *exit_status = EXIT_MEMORY;
4229                         return log_oom();
4230                 }
4231
4232                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4233                 if (r < 0) {
4234                         *exit_status = EXIT_USER;
4235                         if (r == -EILSEQ)
4236                                 return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4237                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4238                         return log_exec_error_errno(context, params, r, "Failed to update dynamic user credentials: %m");
4239                 }
4240
4241                 if (!uid_is_valid(uid)) {
4242                         *exit_status = EXIT_USER;
4243                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4244                 }
4245
4246                 if (!gid_is_valid(gid)) {
4247                         *exit_status = EXIT_USER;
4248                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4249                 }
4250
4251                 if (runtime->dynamic_creds->user)
4252                         username = runtime->dynamic_creds->user->name;
4253
4254         } else {
4255                 if (context->user) {
4256                         r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4257                         if (r < 0) {
4258                                 *exit_status = EXIT_USER;
4259                                 return log_exec_error_errno(context, params, r, "Failed to determine user credentials: %m");
4260                         }
4261                 }
4262
4263                 if (context->group) {
4264                         r = get_fixed_group(context->group, &groupname, &gid);
4265                         if (r < 0) {
4266                                 *exit_status = EXIT_GROUP;
4267                                 return log_exec_error_errno(context, params, r, "Failed to determine group credentials: %m");
4268                         }
4269                 }
4270         }
4271
4272         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4273         r = get_supplementary_groups(context, username, groupname, gid,
4274                                      &supplementary_gids, &ngids);
4275         if (r < 0) {
4276                 *exit_status = EXIT_GROUP;
4277                 return log_exec_error_errno(context, params, r, "Failed to determine supplementary groups: %m");
4278         }
4279
4280         r = send_user_lookup(params->unit_id, params->user_lookup_fd, uid, gid);
4281         if (r < 0) {
4282                 *exit_status = EXIT_USER;
4283                 return log_exec_error_errno(context, params, r, "Failed to send user credentials to PID1: %m");
4284         }
4285
4286         params->user_lookup_fd = safe_close(params->user_lookup_fd);
4287
4288         r = acquire_home(context, uid, &home, &home_buffer);
4289         if (r < 0) {
4290                 *exit_status = EXIT_CHDIR;
4291                 return log_exec_error_errno(context, params, r, "Failed to determine $HOME for user: %m");
4292         }
4293
4294         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4295         if (socket_fd >= 0)
4296                 (void) fd_nonblock(socket_fd, false);
4297
4298         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4299          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4300         if (params->cgroup_path) {
4301                 _cleanup_free_ char *p = NULL;
4302
4303                 r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4304                 if (r < 0) {
4305                         *exit_status = EXIT_CGROUP;
4306                         return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4307                 }
4308
4309                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4310                 if (r == -EUCLEAN) {
4311                         *exit_status = EXIT_CGROUP;
4312                         return log_exec_error_errno(context, params, r, "Failed to attach process to cgroup %s "
4313                                                     "because the cgroup or one of its parents or "
4314                                                     "siblings is in the threaded mode: %m", p);
4315                 }
4316                 if (r < 0) {
4317                         *exit_status = EXIT_CGROUP;
4318                         return log_exec_error_errno(context, params, r, "Failed to attach to cgroup %s: %m", p);
4319                 }
4320         }
4321
4322         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4323                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4324                 if (r < 0) {
4325                         *exit_status = EXIT_NETWORK;
4326                         return log_exec_error_errno(context, params, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4327                 }
4328         }
4329
4330         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4331                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4332                 if (r < 0) {
4333                         *exit_status = EXIT_NAMESPACE;
4334                         return log_exec_error_errno(context, params, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4335                 }
4336         }
4337
4338         r = setup_input(context, params, socket_fd, named_iofds);
4339         if (r < 0) {
4340                 *exit_status = EXIT_STDIN;
4341                 return log_exec_error_errno(context, params, r, "Failed to set up standard input: %m");
4342         }
4343
4344         r = setup_output(context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4345         if (r < 0) {
4346                 *exit_status = EXIT_STDOUT;
4347                 return log_exec_error_errno(context, params, r, "Failed to set up standard output: %m");
4348         }
4349
4350         r = setup_output(context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4351         if (r < 0) {
4352                 *exit_status = EXIT_STDERR;
4353                 return log_exec_error_errno(context, params, r, "Failed to set up standard error output: %m");
4354         }
4355
4356         if (context->oom_score_adjust_set) {
4357                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4358                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4359                 r = set_oom_score_adjust(context->oom_score_adjust);
4360                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4361                         log_exec_debug_errno(context, params, r,
4362                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4363                 else if (r < 0) {
4364                         *exit_status = EXIT_OOM_ADJUST;
4365                         return log_exec_error_errno(context, params, r, "Failed to adjust OOM setting: %m");
4366                 }
4367         }
4368
4369         if (context->coredump_filter_set) {
4370                 r = set_coredump_filter(context->coredump_filter);
4371                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4372                         log_exec_debug_errno(context, params, r, "Failed to adjust coredump_filter, ignoring: %m");
4373                 else if (r < 0) {
4374                         *exit_status = EXIT_LIMITS;
4375                         return log_exec_error_errno(context, params, r, "Failed to adjust coredump_filter: %m");
4376                 }
4377         }
4378
4379         if (context->nice_set) {
4380                 r = setpriority_closest(context->nice);
4381                 if (r < 0) {
4382                         *exit_status = EXIT_NICE;
4383                         return log_exec_error_errno(context, params, r, "Failed to set up process scheduling priority (nice level): %m");
4384                 }
4385         }
4386
4387         if (context->cpu_sched_set) {
4388                 struct sched_param param = {
4389                         .sched_priority = context->cpu_sched_priority,
4390                 };
4391
4392                 r = sched_setscheduler(0,
4393                                        context->cpu_sched_policy |
4394                                        (context->cpu_sched_reset_on_fork ?
4395                                         SCHED_RESET_ON_FORK : 0),
4396                                        &param);
4397                 if (r < 0) {
4398                         *exit_status = EXIT_SETSCHEDULER;
4399                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU scheduling: %m");
4400                 }
4401         }
4402
4403         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4404                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4405                 const CPUSet *cpu_set;
4406
4407                 if (context->cpu_affinity_from_numa) {
4408                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4409                         if (r < 0) {
4410                                 *exit_status = EXIT_CPUAFFINITY;
4411                                 return log_exec_error_errno(context, params, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4412                         }
4413
4414                         cpu_set = &converted_cpu_set;
4415                 } else
4416                         cpu_set = &context->cpu_set;
4417
4418                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4419                         *exit_status = EXIT_CPUAFFINITY;
4420                         return log_exec_error_errno(context, params, errno, "Failed to set up CPU affinity: %m");
4421                 }
4422         }
4423
4424         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4425                 r = apply_numa_policy(&context->numa_policy);
4426                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4427                         log_exec_debug_errno(context, params, r, "NUMA support not available, ignoring.");
4428                 else if (r < 0) {
4429                         *exit_status = EXIT_NUMA_POLICY;
4430                         return log_exec_error_errno(context, params, r, "Failed to set NUMA memory policy: %m");
4431                 }
4432         }
4433
4434         if (context->ioprio_set)
4435                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4436                         *exit_status = EXIT_IOPRIO;
4437                         return log_exec_error_errno(context, params, errno, "Failed to set up IO scheduling priority: %m");
4438                 }
4439
4440         if (context->timer_slack_nsec != NSEC_INFINITY)
4441                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4442                         *exit_status = EXIT_TIMERSLACK;
4443                         return log_exec_error_errno(context, params, errno, "Failed to set up timer slack: %m");
4444                 }
4445
4446         if (context->personality != PERSONALITY_INVALID) {
4447                 r = safe_personality(context->personality);
4448                 if (r < 0) {
4449                         *exit_status = EXIT_PERSONALITY;
4450                         return log_exec_error_errno(context, params, r, "Failed to set up execution domain (personality): %m");
4451                 }
4452         }
4453
4454 #if ENABLE_UTMP
4455         if (context->utmp_id) {
4456                 const char *line = context->tty_path ?
4457                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4458                         NULL;
4459                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4460                                       line,
4461                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4462                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4463                                       USER_PROCESS,
4464                                       username);
4465         }
4466 #endif
4467
4468         if (uid_is_valid(uid)) {
4469                 r = chown_terminal(STDIN_FILENO, uid);
4470                 if (r < 0) {
4471                         *exit_status = EXIT_STDIN;
4472                         return log_exec_error_errno(context, params, r, "Failed to change ownership of terminal: %m");
4473                 }
4474         }
4475
4476         if (params->cgroup_path) {
4477                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4478                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4479                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4480                  * touch a single hierarchy too. */
4481
4482                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4483                         _cleanup_free_ char *p = NULL;
4484
4485                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4486                         if (r < 0) {
4487                                 *exit_status = EXIT_CGROUP;
4488                                 return log_exec_error_errno(context, params, r, "Failed to adjust control group access: %m");
4489                         }
4490
4491                         r = exec_params_get_cgroup_path(params, cgroup_context, &p);
4492                         if (r < 0) {
4493                                 *exit_status = EXIT_CGROUP;
4494                                 return log_exec_error_errno(context, params, r, "Failed to acquire cgroup path: %m");
4495                         }
4496                         if (r > 0) {
4497                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4498                                 if (r < 0) {
4499                                         *exit_status = EXIT_CGROUP;
4500                                         return log_exec_error_errno(context, params, r, "Failed to adjust control subgroup access: %m");
4501                                 }
4502                         }
4503                 }
4504
4505                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4506                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4507                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4508                                 if (r < 0) {
4509                                         *exit_status = EXIT_MEMORY;
4510                                         return log_oom();
4511                                 }
4512
4513                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4514                                 if (r < 0) {
4515                                         log_exec_full_errno(context, params, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4516                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4517                                         memory_pressure_path = mfree(memory_pressure_path);
4518                                 }
4519                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4520                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning off memory pressure watch */
4521                                 if (!memory_pressure_path) {
4522                                         *exit_status = EXIT_MEMORY;
4523                                         return log_oom();
4524                                 }
4525                         }
4526                 }
4527         }
4528
4529         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4530
4531         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4532                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4533                 if (r < 0)
4534                         return log_exec_error_errno(context, params, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4535         }
4536
4537         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4538                 r = exec_setup_credentials(context, params, params->unit_id, uid, gid);
4539                 if (r < 0) {
4540                         *exit_status = EXIT_CREDENTIALS;
4541                         return log_exec_error_errno(context, params, r, "Failed to set up credentials: %m");
4542                 }
4543         }
4544
4545         r = build_environment(
4546                         context,
4547                         params,
4548                         cgroup_context,
4549                         n_fds,
4550                         home,
4551                         username,
4552                         shell,
4553                         journal_stream_dev,
4554                         journal_stream_ino,
4555                         memory_pressure_path,
4556                         &our_env);
4557         if (r < 0) {
4558                 *exit_status = EXIT_MEMORY;
4559                 return log_oom();
4560         }
4561
4562         r = build_pass_environment(context, &pass_env);
4563         if (r < 0) {
4564                 *exit_status = EXIT_MEMORY;
4565                 return log_oom();
4566         }
4567
4568         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4569          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4570          * not specify PATH but the unit has ExecSearchPath. */
4571         if (!strv_isempty(context->exec_search_path)) {
4572                 _cleanup_free_ char *joined = NULL;
4573
4574                 joined = strv_join(context->exec_search_path, ":");
4575                 if (!joined) {
4576                         *exit_status = EXIT_MEMORY;
4577                         return log_oom();
4578                 }
4579
4580                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4581                 if (r < 0) {
4582                         *exit_status = EXIT_MEMORY;
4583                         return log_oom();
4584                 }
4585         }
4586
4587         accum_env = strv_env_merge(params->environment,
4588                                    our_env,
4589                                    joined_exec_search_path,
4590                                    pass_env,
4591                                    context->environment,
4592                                    params->files_env);
4593         if (!accum_env) {
4594                 *exit_status = EXIT_MEMORY;
4595                 return log_oom();
4596         }
4597         accum_env = strv_env_clean(accum_env);
4598
4599         (void) umask(context->umask);
4600
4601         r = setup_keyring(context, params, uid, gid);
4602         if (r < 0) {
4603                 *exit_status = EXIT_KEYRING;
4604                 return log_exec_error_errno(context, params, r, "Failed to set up kernel keyring: %m");
4605         }
4606
4607         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4608          * from it. */
4609         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4610
4611         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4612          * for it, and the kernel doesn't actually support ambient caps. */
4613         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4614
4615         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4616          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4617          * desired. */
4618         if (needs_ambient_hack)
4619                 needs_setuid = false;
4620         else
4621                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4622
4623         uint64_t capability_ambient_set = context->capability_ambient_set;
4624
4625         if (needs_sandboxing) {
4626                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4627                  * /sys being present. The actual MAC context application will happen later, as late as
4628                  * possible, to avoid impacting our own code paths. */
4629
4630 #if HAVE_SELINUX
4631                 use_selinux = mac_selinux_use();
4632 #endif
4633 #if ENABLE_SMACK
4634                 use_smack = mac_smack_use();
4635 #endif
4636 #if HAVE_APPARMOR
4637                 use_apparmor = mac_apparmor_use();
4638 #endif
4639         }
4640
4641         if (needs_sandboxing) {
4642                 int which_failed;
4643
4644                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4645                  * is set here. (See below.) */
4646
4647                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4648                 if (r < 0) {
4649                         *exit_status = EXIT_LIMITS;
4650                         return log_exec_error_errno(context, params, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4651                 }
4652         }
4653
4654         if (needs_setuid && context->pam_name && username) {
4655                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4656                  * wins here. (See above.) */
4657
4658                 /* All fds passed in the fds array will be closed in the pam child process. */
4659                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, params->fds, n_fds, params->exec_fd);
4660                 if (r < 0) {
4661                         *exit_status = EXIT_PAM;
4662                         return log_exec_error_errno(context, params, r, "Failed to set up PAM session: %m");
4663                 }
4664
4665                 if (ambient_capabilities_supported()) {
4666                         uint64_t ambient_after_pam;
4667
4668                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4669                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4670                         r = capability_get_ambient(&ambient_after_pam);
4671                         if (r < 0) {
4672                                 *exit_status = EXIT_CAPABILITIES;
4673                                 return log_exec_error_errno(context, params, r, "Failed to query ambient caps: %m");
4674                         }
4675
4676                         capability_ambient_set |= ambient_after_pam;
4677                 }
4678
4679                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4680                 if (ngids_after_pam < 0) {
4681                         *exit_status = EXIT_GROUP;
4682                         return log_exec_error_errno(context, params, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4683                 }
4684         }
4685
4686         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4687                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4688                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4689                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4690
4691                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4692                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4693                  * the actual requested operations fail (or silently continue). */
4694                 if (r < 0 && context->private_users) {
4695                         *exit_status = EXIT_USER;
4696                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing for unprivileged user: %m");
4697                 }
4698                 if (r < 0)
4699                         log_exec_info_errno(context, params, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4700                 else
4701                         userns_set_up = true;
4702         }
4703
4704         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4705
4706                 /* Try to enable network namespacing if network namespacing is available and we have
4707                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4708                  * new network namespace. And if we don't have that, then we could only create a network
4709                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4710                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4711                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4712                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4713                                 log_exec_notice_errno(context, params, r,
4714                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4715                         else if (r < 0) {
4716                                 *exit_status = EXIT_NETWORK;
4717                                 return log_exec_error_errno(context, params, r, "Failed to set up network namespacing: %m");
4718                         }
4719                 } else if (context->network_namespace_path) {
4720                         *exit_status = EXIT_NETWORK;
4721                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4722                                                     "NetworkNamespacePath= is not supported, refusing.");
4723                 } else
4724                         log_exec_notice(context, params, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4725         }
4726
4727         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4728
4729                 if (ns_type_supported(NAMESPACE_IPC)) {
4730                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4731                         if (r == -EPERM)
4732                                 log_exec_warning_errno(context, params, r,
4733                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4734                         else if (r < 0) {
4735                                 *exit_status = EXIT_NAMESPACE;
4736                                 return log_exec_error_errno(context, params, r, "Failed to set up IPC namespacing: %m");
4737                         }
4738                 } else if (context->ipc_namespace_path) {
4739                         *exit_status = EXIT_NAMESPACE;
4740                         return log_exec_error_errno(context, params, SYNTHETIC_ERRNO(EOPNOTSUPP),
4741                                                     "IPCNamespacePath= is not supported, refusing.");
4742                 } else
4743                         log_exec_warning(context, params, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4744         }
4745
4746         if (needs_mount_namespace) {
4747                 _cleanup_free_ char *error_path = NULL;
4748
4749                 r = apply_mount_namespace(command->flags,
4750                                           context,
4751                                           params,
4752                                           runtime,
4753                                           memory_pressure_path,
4754                                           needs_sandboxing,
4755                                           &error_path);
4756                 if (r < 0) {
4757                         *exit_status = EXIT_NAMESPACE;
4758                         return log_exec_error_errno(context, params, r, "Failed to set up mount namespacing%s%s: %m",
4759                                                     error_path ? ": " : "", strempty(error_path));
4760                 }
4761         }
4762
4763         if (needs_sandboxing) {
4764                 r = apply_protect_hostname(context, params, exit_status);
4765                 if (r < 0)
4766                         return r;
4767         }
4768
4769         if (context->memory_ksm >= 0)
4770                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4771                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4772                                 log_exec_debug_errno(context,
4773                                                      params,
4774                                                      errno,
4775                                                      "KSM support not available, ignoring.");
4776                         else {
4777                                 *exit_status = EXIT_KSM;
4778                                 return log_exec_error_errno(context, params, errno, "Failed to set KSM: %m");
4779                         }
4780                 }
4781
4782         /* Drop groups as early as possible.
4783          * This needs to be done after PrivateDevices=yes setup as device nodes should be owned by the host's root.
4784          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4785         if (needs_setuid) {
4786                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4787                 int ngids_to_enforce = 0;
4788
4789                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4790                                                    ngids,
4791                                                    gids_after_pam,
4792                                                    ngids_after_pam,
4793                                                    &gids_to_enforce);
4794                 if (ngids_to_enforce < 0) {
4795                         *exit_status = EXIT_GROUP;
4796                         return log_exec_error_errno(context, params,
4797                                                     ngids_to_enforce,
4798                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4799                 }
4800
4801                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4802                 if (r < 0) {
4803                         *exit_status = EXIT_GROUP;
4804                         return log_exec_error_errno(context, params, r, "Changing group credentials failed: %m");
4805                 }
4806         }
4807
4808         /* If the user namespace was not set up above, try to do it now.
4809          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4810          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4811          * case of mount namespaces being less privileged when the mount point list is copied from a
4812          * different user namespace). */
4813
4814         if (needs_sandboxing && context->private_users && !userns_set_up) {
4815                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4816                 if (r < 0) {
4817                         *exit_status = EXIT_USER;
4818                         return log_exec_error_errno(context, params, r, "Failed to set up user namespacing: %m");
4819                 }
4820         }
4821
4822         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4823          * shall execute. */
4824
4825         _cleanup_free_ char *executable = NULL;
4826         _cleanup_close_ int executable_fd = -EBADF;
4827         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4828         if (r < 0) {
4829                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4830                         log_exec_struct_errno(context, params, LOG_INFO, r,
4831                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4832                                               LOG_EXEC_INVOCATION_ID(params),
4833                                               LOG_EXEC_MESSAGE(params,
4834                                                                "Executable %s missing, skipping: %m",
4835                                                                command->path),
4836                                               "EXECUTABLE=%s", command->path);
4837                         *exit_status = EXIT_SUCCESS;
4838                         return 0;
4839                 }
4840
4841                 *exit_status = EXIT_EXEC;
4842                 return log_exec_struct_errno(context, params, LOG_INFO, r,
4843                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4844                                              LOG_EXEC_INVOCATION_ID(params),
4845                                              LOG_EXEC_MESSAGE(params,
4846                                                               "Failed to locate executable %s: %m",
4847                                                               command->path),
4848                                              "EXECUTABLE=%s", command->path);
4849         }
4850
4851         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, &executable_fd);
4852         if (r < 0) {
4853                 *exit_status = EXIT_FDS;
4854                 return log_exec_error_errno(context, params, r, "Failed to collect shifted fd: %m");
4855         }
4856
4857 #if HAVE_SELINUX
4858         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4859                 int fd = -EBADF;
4860
4861                 if (socket_fd >= 0)
4862                         fd = socket_fd;
4863                 else if (params->n_socket_fds == 1)
4864                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4865                          * use context from that fd to compute the label. */
4866                         fd = params->fds[0];
4867
4868                 if (fd >= 0) {
4869                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4870                         if (r < 0) {
4871                                 if (!context->selinux_context_ignore) {
4872                                         *exit_status = EXIT_SELINUX_CONTEXT;
4873                                         return log_exec_error_errno(context,
4874                                                                     params,
4875                                                                     r,
4876                                                                     "Failed to determine SELinux context: %m");
4877                                 }
4878                                 log_exec_debug_errno(context,
4879                                                      params,
4880                                                      r,
4881                                                      "Failed to determine SELinux context, ignoring: %m");
4882                         }
4883                 }
4884         }
4885 #endif
4886
4887         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4888          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4889          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4890          * execve(). But first, close the remaining sockets in the context objects. */
4891
4892         exec_runtime_close(runtime);
4893         exec_params_close(params);
4894
4895         r = close_all_fds(keep_fds, n_keep_fds);
4896         if (r >= 0)
4897                 r = shift_fds(params->fds, n_fds);
4898         if (r >= 0)
4899                 r = flag_fds(params->fds, n_socket_fds, n_fds, context->non_blocking);
4900         if (r < 0) {
4901                 *exit_status = EXIT_FDS;
4902                 return log_exec_error_errno(context, params, r, "Failed to adjust passed file descriptors: %m");
4903         }
4904
4905         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4906          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4907          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4908          * came this far. */
4909
4910         secure_bits = context->secure_bits;
4911
4912         if (needs_sandboxing) {
4913                 uint64_t bset;
4914
4915                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4916                  * (Note this is placed after the general resource limit initialization, see above, in order
4917                  * to take precedence.) */
4918                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4919                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4920                                 *exit_status = EXIT_LIMITS;
4921                                 return log_exec_error_errno(context, params, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4922                         }
4923                 }
4924
4925 #if ENABLE_SMACK
4926                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4927                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4928                 if (use_smack && context->smack_process_label) {
4929                         r = setup_smack(params, context, executable_fd);
4930                         if (r < 0 && !context->smack_process_label_ignore) {
4931                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4932                                 return log_exec_error_errno(context, params, r, "Failed to set SMACK process label: %m");
4933                         }
4934                 }
4935 #endif
4936
4937                 bset = context->capability_bounding_set;
4938                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4939                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4940                  * instead of us doing that */
4941                 if (needs_ambient_hack)
4942                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4943                                 (UINT64_C(1) << CAP_SETUID) |
4944                                 (UINT64_C(1) << CAP_SETGID);
4945
4946 #if HAVE_SECCOMP
4947                 /* If the service has any form of a seccomp filter and it allows dropping privileges, we'll
4948                  * keep the needed privileges to apply it even if we're not root. */
4949                 if (needs_setuid &&
4950                     uid_is_valid(uid) &&
4951                     context_has_seccomp(context) &&
4952                     seccomp_allows_drop_privileges(context)) {
4953                         keep_seccomp_privileges = true;
4954
4955                         if (prctl(PR_SET_KEEPCAPS, 1) < 0) {
4956                                 *exit_status = EXIT_USER;
4957                                 return log_exec_error_errno(context, params, errno, "Failed to enable keep capabilities flag: %m");
4958                         }
4959
4960                         /* Save the current bounding set so we can restore it after applying the seccomp
4961                          * filter */
4962                         saved_bset = bset;
4963                         bset |= (UINT64_C(1) << CAP_SYS_ADMIN) |
4964                                 (UINT64_C(1) << CAP_SETPCAP);
4965                 }
4966 #endif
4967
4968                 if (!cap_test_all(bset)) {
4969                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4970                         if (r < 0) {
4971                                 *exit_status = EXIT_CAPABILITIES;
4972                                 return log_exec_error_errno(context, params, r, "Failed to drop capabilities: %m");
4973                         }
4974                 }
4975
4976                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4977                  * keep-caps set.
4978                  *
4979                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4980                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4981                  * the ambient capabilities can be raised as they are present in the permitted and
4982                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4983                  * without changing the user, so we also set the ambient capabilities here.
4984                  *
4985                  * The requested ambient capabilities are raised in the inheritable set if the second
4986                  * argument is true. */
4987                 if (!needs_ambient_hack) {
4988                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4989                         if (r < 0) {
4990                                 *exit_status = EXIT_CAPABILITIES;
4991                                 return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (before UID change): %m");
4992                         }
4993                 }
4994         }
4995
4996         /* chroot to root directory first, before we lose the ability to chroot */
4997         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4998         if (r < 0)
4999                 return log_exec_error_errno(context, params, r, "Chrooting to the requested root directory failed: %m");
5000
5001         if (needs_setuid) {
5002                 if (uid_is_valid(uid)) {
5003                         r = enforce_user(context, uid, capability_ambient_set);
5004                         if (r < 0) {
5005                                 *exit_status = EXIT_USER;
5006                                 return log_exec_error_errno(context, params, r, "Failed to change UID to " UID_FMT ": %m", uid);
5007                         }
5008
5009                         if (keep_seccomp_privileges) {
5010                                 if (!FLAGS_SET(capability_ambient_set, (UINT64_C(1) << CAP_SETUID))) {
5011                                         r = drop_capability(CAP_SETUID);
5012                                         if (r < 0) {
5013                                                 *exit_status = EXIT_USER;
5014                                                 return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETUID: %m");
5015                                         }
5016                                 }
5017
5018                                 r = keep_capability(CAP_SYS_ADMIN);
5019                                 if (r < 0) {
5020                                         *exit_status = EXIT_USER;
5021                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SYS_ADMIN: %m");
5022                                 }
5023
5024                                 r = keep_capability(CAP_SETPCAP);
5025                                 if (r < 0) {
5026                                         *exit_status = EXIT_USER;
5027                                         return log_exec_error_errno(context, params, r, "Failed to keep CAP_SETPCAP: %m");
5028                                 }
5029                         }
5030
5031                         if (!needs_ambient_hack && capability_ambient_set != 0) {
5032
5033                                 /* Raise the ambient capabilities after user change. */
5034                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5035                                 if (r < 0) {
5036                                         *exit_status = EXIT_CAPABILITIES;
5037                                         return log_exec_error_errno(context, params, r, "Failed to apply ambient capabilities (after UID change): %m");
5038                                 }
5039                         }
5040                 }
5041         }
5042
5043         /* Apply working directory here, because the working directory might be on NFS and only the user
5044          * running this service might have the correct privilege to change to the working directory. Also, it
5045          * is absolutely 💣 crucial 💣 we applied all mount namespacing rearrangements before this, so that
5046          * the cwd cannot be used to pin directories outside of the sandbox. */
5047         r = apply_working_directory(context, params, runtime, home, exit_status);
5048         if (r < 0)
5049                 return log_exec_error_errno(context, params, r, "Changing to the requested working directory failed: %m");
5050
5051         if (needs_sandboxing) {
5052                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5053                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5054                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5055                  * are restricted. */
5056
5057 #if HAVE_SELINUX
5058                 if (use_selinux) {
5059                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5060
5061                         if (exec_context) {
5062                                 r = setexeccon(exec_context);
5063                                 if (r < 0) {
5064                                         if (!context->selinux_context_ignore) {
5065                                                 *exit_status = EXIT_SELINUX_CONTEXT;
5066                                                 return log_exec_error_errno(context, params, r, "Failed to change SELinux context to %s: %m", exec_context);
5067                                         }
5068                                         log_exec_debug_errno(context,
5069                                                              params,
5070                                                              r,
5071                                                              "Failed to change SELinux context to %s, ignoring: %m",
5072                                                              exec_context);
5073                                 }
5074                         }
5075                 }
5076 #endif
5077
5078 #if HAVE_APPARMOR
5079                 if (use_apparmor && context->apparmor_profile) {
5080                         r = aa_change_onexec(context->apparmor_profile);
5081                         if (r < 0 && !context->apparmor_profile_ignore) {
5082                                 *exit_status = EXIT_APPARMOR_PROFILE;
5083                                 return log_exec_error_errno(context,
5084                                                             params,
5085                                                             errno,
5086                                                             "Failed to prepare AppArmor profile change to %s: %m",
5087                                                             context->apparmor_profile);
5088                         }
5089                 }
5090 #endif
5091
5092                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5093                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5094                  * requires CAP_SETPCAP. */
5095                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5096                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5097                          * effective set here.
5098                          *
5099                          * The effective set is overwritten during execve() with the following values:
5100                          *
5101                          * - ambient set (for non-root processes)
5102                          *
5103                          * - (inheritable | bounding) set (for root processes)
5104                          *
5105                          * Hence there is no security impact to raise it in the effective set before execve
5106                          */
5107                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5108                         if (r < 0) {
5109                                 *exit_status = EXIT_CAPABILITIES;
5110                                 return log_exec_error_errno(context, params, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5111                         }
5112                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5113                                 *exit_status = EXIT_SECUREBITS;
5114                                 return log_exec_error_errno(context, params, errno, "Failed to set process secure bits: %m");
5115                         }
5116                 }
5117
5118                 if (context_has_no_new_privileges(context))
5119                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5120                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5121                                 return log_exec_error_errno(context, params, errno, "Failed to disable new privileges: %m");
5122                         }
5123
5124 #if HAVE_SECCOMP
5125                 r = apply_address_families(context, params);
5126                 if (r < 0) {
5127                         *exit_status = EXIT_ADDRESS_FAMILIES;
5128                         return log_exec_error_errno(context, params, r, "Failed to restrict address families: %m");
5129                 }
5130
5131                 r = apply_memory_deny_write_execute(context, params);
5132                 if (r < 0) {
5133                         *exit_status = EXIT_SECCOMP;
5134                         return log_exec_error_errno(context, params, r, "Failed to disable writing to executable memory: %m");
5135                 }
5136
5137                 r = apply_restrict_realtime(context, params);
5138                 if (r < 0) {
5139                         *exit_status = EXIT_SECCOMP;
5140                         return log_exec_error_errno(context, params, r, "Failed to apply realtime restrictions: %m");
5141                 }
5142
5143                 r = apply_restrict_suid_sgid(context, params);
5144                 if (r < 0) {
5145                         *exit_status = EXIT_SECCOMP;
5146                         return log_exec_error_errno(context, params, r, "Failed to apply SUID/SGID restrictions: %m");
5147                 }
5148
5149                 r = apply_restrict_namespaces(context, params);
5150                 if (r < 0) {
5151                         *exit_status = EXIT_SECCOMP;
5152                         return log_exec_error_errno(context, params, r, "Failed to apply namespace restrictions: %m");
5153                 }
5154
5155                 r = apply_protect_sysctl(context, params);
5156                 if (r < 0) {
5157                         *exit_status = EXIT_SECCOMP;
5158                         return log_exec_error_errno(context, params, r, "Failed to apply sysctl restrictions: %m");
5159                 }
5160
5161                 r = apply_protect_kernel_modules(context, params);
5162                 if (r < 0) {
5163                         *exit_status = EXIT_SECCOMP;
5164                         return log_exec_error_errno(context, params, r, "Failed to apply module loading restrictions: %m");
5165                 }
5166
5167                 r = apply_protect_kernel_logs(context, params);
5168                 if (r < 0) {
5169                         *exit_status = EXIT_SECCOMP;
5170                         return log_exec_error_errno(context, params, r, "Failed to apply kernel log restrictions: %m");
5171                 }
5172
5173                 r = apply_protect_clock(context, params);
5174                 if (r < 0) {
5175                         *exit_status = EXIT_SECCOMP;
5176                         return log_exec_error_errno(context, params, r, "Failed to apply clock restrictions: %m");
5177                 }
5178
5179                 r = apply_private_devices(context, params);
5180                 if (r < 0) {
5181                         *exit_status = EXIT_SECCOMP;
5182                         return log_exec_error_errno(context, params, r, "Failed to set up private devices: %m");
5183                 }
5184
5185                 r = apply_syscall_archs(context, params);
5186                 if (r < 0) {
5187                         *exit_status = EXIT_SECCOMP;
5188                         return log_exec_error_errno(context, params, r, "Failed to apply syscall architecture restrictions: %m");
5189                 }
5190
5191                 r = apply_lock_personality(context, params);
5192                 if (r < 0) {
5193                         *exit_status = EXIT_SECCOMP;
5194                         return log_exec_error_errno(context, params, r, "Failed to lock personalities: %m");
5195                 }
5196
5197                 r = apply_syscall_log(context, params);
5198                 if (r < 0) {
5199                         *exit_status = EXIT_SECCOMP;
5200                         return log_exec_error_errno(context, params, r, "Failed to apply system call log filters: %m");
5201                 }
5202 #endif
5203
5204 #if HAVE_LIBBPF
5205                 r = apply_restrict_filesystems(context, params);
5206                 if (r < 0) {
5207                         *exit_status = EXIT_BPF;
5208                         return log_exec_error_errno(context, params, r, "Failed to restrict filesystems: %m");
5209                 }
5210 #endif
5211
5212 #if HAVE_SECCOMP
5213                 /* This really should remain as close to the execve() as possible, to make sure our own code is affected
5214                  * by the filter as little as possible. */
5215                 r = apply_syscall_filter(context, params, needs_ambient_hack);
5216                 if (r < 0) {
5217                         *exit_status = EXIT_SECCOMP;
5218                         return log_exec_error_errno(context, params, r, "Failed to apply system call filters: %m");
5219                 }
5220
5221                 if (keep_seccomp_privileges) {
5222                         /* Restore the capability bounding set with what's expected from the service + the
5223                          * ambient capabilities hack */
5224                         if (!cap_test_all(saved_bset)) {
5225                                 r = capability_bounding_set_drop(saved_bset, /* right_now= */ false);
5226                                 if (r < 0) {
5227                                         *exit_status = EXIT_CAPABILITIES;
5228                                         return log_exec_error_errno(context, params, r, "Failed to drop bset capabilities: %m");
5229                                 }
5230                         }
5231
5232                         /* Only drop CAP_SYS_ADMIN if it's not in the bounding set, otherwise we'll break
5233                          * applications that use it. */
5234                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SYS_ADMIN))) {
5235                                 r = drop_capability(CAP_SYS_ADMIN);
5236                                 if (r < 0) {
5237                                         *exit_status = EXIT_USER;
5238                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SYS_ADMIN: %m");
5239                                 }
5240                         }
5241
5242                         /* Only drop CAP_SETPCAP if it's not in the bounding set, otherwise we'll break
5243                          * applications that use it. */
5244                         if (!FLAGS_SET(saved_bset, (UINT64_C(1) << CAP_SETPCAP))) {
5245                                 r = drop_capability(CAP_SETPCAP);
5246                                 if (r < 0) {
5247                                         *exit_status = EXIT_USER;
5248                                         return log_exec_error_errno(context, params, r, "Failed to drop CAP_SETPCAP: %m");
5249                                 }
5250                         }
5251
5252                         if (prctl(PR_SET_KEEPCAPS, 0) < 0) {
5253                                 *exit_status = EXIT_USER;
5254                                 return log_exec_error_errno(context, params, errno, "Failed to drop keep capabilities flag: %m");
5255                         }
5256                 }
5257 #endif
5258
5259         }
5260
5261         if (!strv_isempty(context->unset_environment)) {
5262                 char **ee = NULL;
5263
5264                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5265                 if (!ee) {
5266                         *exit_status = EXIT_MEMORY;
5267                         return log_oom();
5268                 }
5269
5270                 strv_free_and_replace(accum_env, ee);
5271         }
5272
5273         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5274                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5275
5276                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5277                 if (r < 0) {
5278                         *exit_status = EXIT_MEMORY;
5279                         return log_exec_error_errno(context,
5280                                                     params,
5281                                                     r,
5282                                                     "Failed to replace environment variables: %m");
5283                 }
5284                 final_argv = replaced_argv;
5285
5286                 if (!strv_isempty(unset_variables)) {
5287                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5288                         log_exec_warning(context,
5289                                          params,
5290                                          "Referenced but unset environment variable evaluates to an empty string: %s",
5291                                          strna(ju));
5292                 }
5293
5294                 if (!strv_isempty(bad_variables)) {
5295                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5296                         log_exec_warning(context,
5297                                          params,
5298                                          "Invalid environment variable name evaluates to an empty string: %s",
5299                                          strna(jb));
5300                 }
5301         } else
5302                 final_argv = command->argv;
5303
5304         log_command_line(context, params, "Executing", executable, final_argv);
5305
5306         if (params->exec_fd >= 0) {
5307                 uint8_t hot = 1;
5308
5309                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5310                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5311
5312                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5313                         *exit_status = EXIT_EXEC;
5314                         return log_exec_error_errno(context, params, errno, "Failed to enable exec_fd: %m");
5315                 }
5316         }
5317
5318         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5319
5320         if (params->exec_fd >= 0) {
5321                 uint8_t hot = 0;
5322
5323                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5324                  * that POLLHUP on it no longer means execve() succeeded. */
5325
5326                 if (write(params->exec_fd, &hot, sizeof(hot)) < 0) {
5327                         *exit_status = EXIT_EXEC;
5328                         return log_exec_error_errno(context, params, errno, "Failed to disable exec_fd: %m");
5329                 }
5330         }
5331
5332         *exit_status = EXIT_EXEC;
5333         return log_exec_error_errno(context, params, r, "Failed to execute %s: %m", executable);
5334 }