src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/file.h>
   8 #include <sys/ioctl.h>
   9 #include <sys/mman.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
  19
  20 #if HAVE_PAM
  21 #include <security/pam_appl.h>
  22 #endif
  23
  24 #if HAVE_SELINUX
  25 #include <selinux/selinux.h>
  26 #endif
  27
  28 #if HAVE_APPARMOR
  29 #include <sys/apparmor.h>
  30 #endif
  31
  32 #include "sd-messages.h"
  33
  34 #include "af-list.h"
  35 #include "alloc-util.h"
  36 #if HAVE_APPARMOR
  37 #include "apparmor-util.h"
  38 #endif
  39 #include "argv-util.h"
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "bpf-lsm.h"
  43 #include "btrfs-util.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "chattr-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase.h"
  49 #include "chown-recursive.h"
  50 #include "constants.h"
  51 #include "cpu-set-util.h"
  52 #include "credential.h"
  53 #include "data-fd-util.h"
  54 #include "env-file.h"
  55 #include "env-util.h"
  56 #include "errno-list.h"
  57 #include "escape.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "format-util.h"
  62 #include "glob-util.h"
  63 #include "hexdecoct.h"
  64 #include "io-util.h"
  65 #include "ioprio-util.h"
  66 #include "lock-util.h"
  67 #include "log.h"
  68 #include "macro.h"
  69 #include "manager.h"
  70 #include "manager-dump.h"
  71 #include "memory-util.h"
  72 #include "missing_fs.h"
  73 #include "missing_ioprio.h"
  74 #include "missing_prctl.h"
  75 #include "mkdir-label.h"
  76 #include "namespace.h"
  77 #include "parse-util.h"
  78 #include "path-util.h"
  79 #include "proc-cmdline.h"
  80 #include "process-util.h"
  81 #include "psi-util.h"
  82 #include "rlimit-util.h"
  83 #include "rm-rf.h"
  84 #include "seccomp-util.h"
  85 #include "securebits-util.h"
  86 #include "selinux-util.h"
  87 #include "signal-util.h"
  88 #include "smack-util.h"
  89 #include "socket-util.h"
  90 #include "sort-util.h"
  91 #include "special.h"
  92 #include "stat-util.h"
  93 #include "string-table.h"
  94 #include "string-util.h"
  95 #include "strv.h"
  96 #include "syslog-util.h"
  97 #include "terminal-util.h"
  98 #include "tmpfile-util.h"
  99 #include "umask-util.h"
 100 #include "unit-serialize.h"
 101 #include "user-util.h"
 102 #include "utmp-wtmp.h"
 103
 104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 106
 107 #define SNDBUF_SIZE (8*1024*1024)
 108
 109 static int shift_fds(int fds[], size_t n_fds) {
 110         if (n_fds <= 0)
 111                 return 0;
 112
 113         /* Modifies the fds array! (sorts it) */
 114
 115         assert(fds);
 116
 117         for (int start = 0;;) {
 118                 int restart_from = -1;
 119
 120                 for (int i = start; i < (int) n_fds; i++) {
 121                         int nfd;
 122
 123                         /* Already at right index? */
 124                         if (fds[i] == i+3)
 125                                 continue;
 126
 127                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 128                         if (nfd < 0)
 129                                 return -errno;
 130
 131                         safe_close(fds[i]);
 132                         fds[i] = nfd;
 133
 134                         /* Hmm, the fd we wanted isn't free? Then
 135                          * let's remember that and try again from here */
 136                         if (nfd != i+3 && restart_from < 0)
 137                                 restart_from = i;
 138                 }
 139
 140                 if (restart_from < 0)
 141                         break;
 142
 143                 start = restart_from;
 144         }
 145
 146         return 0;
 147 }
 148
 149 static int flags_fds(
 150                 const int fds[],
 151                 size_t n_socket_fds,
 152                 size_t n_fds,
 153                 bool nonblock) {
 154
 155         int r;
 156
 157         if (n_fds <= 0)
 158                 return 0;
 159
 160         assert(fds);
 161
 162         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 163          * O_NONBLOCK only applies to socket activation though. */
 164
 165         for (size_t i = 0; i < n_fds; i++) {
 166
 167                 if (i < n_socket_fds) {
 168                         r = fd_nonblock(fds[i], nonblock);
 169                         if (r < 0)
 170                                 return r;
 171                 }
 172
 173                 /* We unconditionally drop FD_CLOEXEC from the fds,
 174                  * since after all we want to pass these fds to our
 175                  * children */
 176
 177                 r = fd_cloexec(fds[i], false);
 178                 if (r < 0)
 179                         return r;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static const char *exec_context_tty_path(const ExecContext *context) {
 186         assert(context);
 187
 188         if (context->stdio_as_fds)
 189                 return NULL;
 190
 191         if (context->tty_path)
 192                 return context->tty_path;
 193
 194         return "/dev/console";
 195 }
 196
 197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
 198         unsigned rows, cols;
 199         const char *tty;
 200
 201         assert(context);
 202         assert(ret_rows);
 203         assert(ret_cols);
 204
 205         rows = context->tty_rows;
 206         cols = context->tty_cols;
 207
 208         tty = exec_context_tty_path(context);
 209         if (tty)
 210                 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
 211
 212         *ret_rows = rows;
 213         *ret_cols = cols;
 214
 215         return 0;
 216 }
 217
 218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 219         _cleanup_close_ int fd = -EBADF;
 220         const char *path = exec_context_tty_path(ASSERT_PTR(context));
 221
 222         /* Take a lock around the device for the duration of the setup that we do here.
 223          * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
 224          * We open a new fd that will be closed automatically, and operate on it for convenience.
 225          */
 226
 227         if (p && p->stdin_fd >= 0) {
 228                 fd = xopenat_lock(p->stdin_fd, NULL,
 229                                   O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
 230                 if (fd < 0)
 231                         return;
 232         } else if (path) {
 233                 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
 234                 if (fd < 0)
 235                         return;
 236
 237                 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
 238                         return;
 239         } else
 240                 return;   /* nothing to do */
 241
 242         if (context->tty_vhangup)
 243                 (void) terminal_vhangup_fd(fd);
 244
 245         if (context->tty_reset)
 246                 (void) reset_terminal_fd(fd, true);
 247
 248         if (p && p->stdin_fd >= 0) {
 249                 unsigned rows = context->tty_rows, cols = context->tty_cols;
 250
 251                 (void) exec_context_tty_size(context, &rows, &cols);
 252                 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
 253         }
 254
 255         if (context->tty_vt_disallocate && path)
 256                 (void) vt_disallocate(path);
 257 }
 258
 259 static bool is_terminal_input(ExecInput i) {
 260         return IN_SET(i,
 261                       EXEC_INPUT_TTY,
 262                       EXEC_INPUT_TTY_FORCE,
 263                       EXEC_INPUT_TTY_FAIL);
 264 }
 265
 266 static bool is_terminal_output(ExecOutput o) {
 267         return IN_SET(o,
 268                       EXEC_OUTPUT_TTY,
 269                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 270                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 271 }
 272
 273 static bool is_kmsg_output(ExecOutput o) {
 274         return IN_SET(o,
 275                       EXEC_OUTPUT_KMSG,
 276                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 277 }
 278
 279 static bool exec_context_needs_term(const ExecContext *c) {
 280         assert(c);
 281
 282         /* Return true if the execution context suggests we should set $TERM to something useful. */
 283
 284         if (is_terminal_input(c->std_input))
 285                 return true;
 286
 287         if (is_terminal_output(c->std_output))
 288                 return true;
 289
 290         if (is_terminal_output(c->std_error))
 291                 return true;
 292
 293         return !!c->tty_path;
 294 }
 295
 296 static int open_null_as(int flags, int nfd) {
 297         int fd;
 298
 299         assert(nfd >= 0);
 300
 301         fd = open("/dev/null", flags|O_NOCTTY);
 302         if (fd < 0)
 303                 return -errno;
 304
 305         return move_fd(fd, nfd, false);
 306 }
 307
 308 static int connect_journal_socket(
 309                 int fd,
 310                 const char *log_namespace,
 311                 uid_t uid,
 312                 gid_t gid) {
 313
 314         uid_t olduid = UID_INVALID;
 315         gid_t oldgid = GID_INVALID;
 316         const char *j;
 317         int r;
 318
 319         j = log_namespace ?
 320                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 321                 "/run/systemd/journal/stdout";
 322
 323         if (gid_is_valid(gid)) {
 324                 oldgid = getgid();
 325
 326                 if (setegid(gid) < 0)
 327                         return -errno;
 328         }
 329
 330         if (uid_is_valid(uid)) {
 331                 olduid = getuid();
 332
 333                 if (seteuid(uid) < 0) {
 334                         r = -errno;
 335                         goto restore_gid;
 336                 }
 337         }
 338
 339         r = connect_unix_path(fd, AT_FDCWD, j);
 340
 341         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 342            an LSM interferes. */
 343
 344         if (uid_is_valid(uid))
 345                 (void) seteuid(olduid);
 346
 347  restore_gid:
 348         if (gid_is_valid(gid))
 349                 (void) setegid(oldgid);
 350
 351         return r;
 352 }
 353
 354 static int connect_logger_as(
 355                 const Unit *unit,
 356                 const ExecContext *context,
 357                 const ExecParameters *params,
 358                 ExecOutput output,
 359                 const char *ident,
 360                 int nfd,
 361                 uid_t uid,
 362                 gid_t gid) {
 363
 364         _cleanup_close_ int fd = -EBADF;
 365         int r;
 366
 367         assert(context);
 368         assert(params);
 369         assert(output < _EXEC_OUTPUT_MAX);
 370         assert(ident);
 371         assert(nfd >= 0);
 372
 373         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 374         if (fd < 0)
 375                 return -errno;
 376
 377         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 378         if (r < 0)
 379                 return r;
 380
 381         if (shutdown(fd, SHUT_RD) < 0)
 382                 return -errno;
 383
 384         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 385
 386         if (dprintf(fd,
 387                 "%s\n"
 388                 "%s\n"
 389                 "%i\n"
 390                 "%i\n"
 391                 "%i\n"
 392                 "%i\n"
 393                 "%i\n",
 394                 context->syslog_identifier ?: ident,
 395                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 396                 context->syslog_priority,
 397                 !!context->syslog_level_prefix,
 398                 false,
 399                 is_kmsg_output(output),
 400                 is_terminal_output(output)) < 0)
 401                 return -errno;
 402
 403         return move_fd(TAKE_FD(fd), nfd, false);
 404 }
 405
 406 static int open_terminal_as(const char *path, int flags, int nfd) {
 407         int fd;
 408
 409         assert(path);
 410         assert(nfd >= 0);
 411
 412         fd = open_terminal(path, flags | O_NOCTTY);
 413         if (fd < 0)
 414                 return fd;
 415
 416         return move_fd(fd, nfd, false);
 417 }
 418
 419 static int acquire_path(const char *path, int flags, mode_t mode) {
 420         _cleanup_close_ int fd = -EBADF;
 421         int r;
 422
 423         assert(path);
 424
 425         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 426                 flags |= O_CREAT;
 427
 428         fd = open(path, flags|O_NOCTTY, mode);
 429         if (fd >= 0)
 430                 return TAKE_FD(fd);
 431
 432         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 433                 return -errno;
 434
 435         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 436
 437         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 438         if (fd < 0)
 439                 return -errno;
 440
 441         r = connect_unix_path(fd, AT_FDCWD, path);
 442         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 443                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 444                  * wasn't an AF_UNIX socket after all */
 445                 return -ENXIO;
 446         if (r < 0)
 447                 return r;
 448
 449         if ((flags & O_ACCMODE) == O_RDONLY)
 450                 r = shutdown(fd, SHUT_WR);
 451         else if ((flags & O_ACCMODE) == O_WRONLY)
 452                 r = shutdown(fd, SHUT_RD);
 453         else
 454                 r = 0;
 455         if (r < 0)
 456                 return -errno;
 457
 458         return TAKE_FD(fd);
 459 }
 460
 461 static int fixup_input(
 462                 const ExecContext *context,
 463                 int socket_fd,
 464                 bool apply_tty_stdin) {
 465
 466         ExecInput std_input;
 467
 468         assert(context);
 469
 470         std_input = context->std_input;
 471
 472         if (is_terminal_input(std_input) && !apply_tty_stdin)
 473                 return EXEC_INPUT_NULL;
 474
 475         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 476                 return EXEC_INPUT_NULL;
 477
 478         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 479                 return EXEC_INPUT_NULL;
 480
 481         return std_input;
 482 }
 483
 484 static int fixup_output(ExecOutput output, int socket_fd) {
 485
 486         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 487                 return EXEC_OUTPUT_INHERIT;
 488
 489         return output;
 490 }
 491
 492 static int setup_input(
 493                 const ExecContext *context,
 494                 const ExecParameters *params,
 495                 int socket_fd,
 496                 const int named_iofds[static 3]) {
 497
 498         ExecInput i;
 499         int r;
 500
 501         assert(context);
 502         assert(params);
 503         assert(named_iofds);
 504
 505         if (params->stdin_fd >= 0) {
 506                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 507                         return -errno;
 508
 509                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 510                 if (isatty(STDIN_FILENO)) {
 511                         unsigned rows = context->tty_rows, cols = context->tty_cols;
 512
 513                         (void) exec_context_tty_size(context, &rows, &cols);
 514                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 515                         (void) reset_terminal_fd(STDIN_FILENO, true);
 516                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
 517                 }
 518
 519                 return STDIN_FILENO;
 520         }
 521
 522         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 523
 524         switch (i) {
 525
 526         case EXEC_INPUT_NULL:
 527                 return open_null_as(O_RDONLY, STDIN_FILENO);
 528
 529         case EXEC_INPUT_TTY:
 530         case EXEC_INPUT_TTY_FORCE:
 531         case EXEC_INPUT_TTY_FAIL: {
 532                 unsigned rows, cols;
 533                 int fd;
 534
 535                 fd = acquire_terminal(exec_context_tty_path(context),
 536                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 537                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 538                                                                   ACQUIRE_TERMINAL_WAIT,
 539                                       USEC_INFINITY);
 540                 if (fd < 0)
 541                         return fd;
 542
 543                 r = exec_context_tty_size(context, &rows, &cols);
 544                 if (r < 0)
 545                         return r;
 546
 547                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
 548                 if (r < 0)
 549                         return r;
 550
 551                 return move_fd(fd, STDIN_FILENO, false);
 552         }
 553
 554         case EXEC_INPUT_SOCKET:
 555                 assert(socket_fd >= 0);
 556
 557                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 558
 559         case EXEC_INPUT_NAMED_FD:
 560                 assert(named_iofds[STDIN_FILENO] >= 0);
 561
 562                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 563                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 564
 565         case EXEC_INPUT_DATA: {
 566                 int fd;
 567
 568                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 569                 if (fd < 0)
 570                         return fd;
 571
 572                 return move_fd(fd, STDIN_FILENO, false);
 573         }
 574
 575         case EXEC_INPUT_FILE: {
 576                 bool rw;
 577                 int fd;
 578
 579                 assert(context->stdio_file[STDIN_FILENO]);
 580
 581                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 582                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 583
 584                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 585                 if (fd < 0)
 586                         return fd;
 587
 588                 return move_fd(fd, STDIN_FILENO, false);
 589         }
 590
 591         default:
 592                 assert_not_reached();
 593         }
 594 }
 595
 596 static bool can_inherit_stderr_from_stdout(
 597                 const ExecContext *context,
 598                 ExecOutput o,
 599                 ExecOutput e) {
 600
 601         assert(context);
 602
 603         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 604          * stderr fd */
 605
 606         if (e == EXEC_OUTPUT_INHERIT)
 607                 return true;
 608         if (e != o)
 609                 return false;
 610
 611         if (e == EXEC_OUTPUT_NAMED_FD)
 612                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 613
 614         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 615                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 616
 617         return true;
 618 }
 619
 620 static int setup_output(
 621                 const Unit *unit,
 622                 const ExecContext *context,
 623                 const ExecParameters *params,
 624                 int fileno,
 625                 int socket_fd,
 626                 const int named_iofds[static 3],
 627                 const char *ident,
 628                 uid_t uid,
 629                 gid_t gid,
 630                 dev_t *journal_stream_dev,
 631                 ino_t *journal_stream_ino) {
 632
 633         ExecOutput o;
 634         ExecInput i;
 635         int r;
 636
 637         assert(unit);
 638         assert(context);
 639         assert(params);
 640         assert(ident);
 641         assert(journal_stream_dev);
 642         assert(journal_stream_ino);
 643
 644         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 645
 646                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 647                         return -errno;
 648
 649                 return STDOUT_FILENO;
 650         }
 651
 652         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 653                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 654                         return -errno;
 655
 656                 return STDERR_FILENO;
 657         }
 658
 659         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 660         o = fixup_output(context->std_output, socket_fd);
 661
 662         if (fileno == STDERR_FILENO) {
 663                 ExecOutput e;
 664                 e = fixup_output(context->std_error, socket_fd);
 665
 666                 /* This expects the input and output are already set up */
 667
 668                 /* Don't change the stderr file descriptor if we inherit all
 669                  * the way and are not on a tty */
 670                 if (e == EXEC_OUTPUT_INHERIT &&
 671                     o == EXEC_OUTPUT_INHERIT &&
 672                     i == EXEC_INPUT_NULL &&
 673                     !is_terminal_input(context->std_input) &&
 674                     getppid() != 1)
 675                         return fileno;
 676
 677                 /* Duplicate from stdout if possible */
 678                 if (can_inherit_stderr_from_stdout(context, o, e))
 679                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 680
 681                 o = e;
 682
 683         } else if (o == EXEC_OUTPUT_INHERIT) {
 684                 /* If input got downgraded, inherit the original value */
 685                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 686                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 687
 688                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 689                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 690                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 691
 692                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 693                 if (getppid() != 1)
 694                         return fileno;
 695
 696                 /* We need to open /dev/null here anew, to get the right access mode. */
 697                 return open_null_as(O_WRONLY, fileno);
 698         }
 699
 700         switch (o) {
 701
 702         case EXEC_OUTPUT_NULL:
 703                 return open_null_as(O_WRONLY, fileno);
 704
 705         case EXEC_OUTPUT_TTY:
 706                 if (is_terminal_input(i))
 707                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 708
 709                 /* We don't reset the terminal if this is just about output */
 710                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 711
 712         case EXEC_OUTPUT_KMSG:
 713         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 714         case EXEC_OUTPUT_JOURNAL:
 715         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 716                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 717                 if (r < 0) {
 718                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 719                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 720                         r = open_null_as(O_WRONLY, fileno);
 721                 } else {
 722                         struct stat st;
 723
 724                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 725                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 726                          * services to detect whether they are connected to the journal or not.
 727                          *
 728                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 729                          * about STDERR as that's usually the best way to do logging. */
 730
 731                         if (fstat(fileno, &st) >= 0 &&
 732                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 733                                 *journal_stream_dev = st.st_dev;
 734                                 *journal_stream_ino = st.st_ino;
 735                         }
 736                 }
 737                 return r;
 738
 739         case EXEC_OUTPUT_SOCKET:
 740                 assert(socket_fd >= 0);
 741
 742                 return RET_NERRNO(dup2(socket_fd, fileno));
 743
 744         case EXEC_OUTPUT_NAMED_FD:
 745                 assert(named_iofds[fileno] >= 0);
 746
 747                 (void) fd_nonblock(named_iofds[fileno], false);
 748                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 749
 750         case EXEC_OUTPUT_FILE:
 751         case EXEC_OUTPUT_FILE_APPEND:
 752         case EXEC_OUTPUT_FILE_TRUNCATE: {
 753                 bool rw;
 754                 int fd, flags;
 755
 756                 assert(context->stdio_file[fileno]);
 757
 758                 rw = context->std_input == EXEC_INPUT_FILE &&
 759                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 760
 761                 if (rw)
 762                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 763
 764                 flags = O_WRONLY;
 765                 if (o == EXEC_OUTPUT_FILE_APPEND)
 766                         flags |= O_APPEND;
 767                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 768                         flags |= O_TRUNC;
 769
 770                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 771                 if (fd < 0)
 772                         return fd;
 773
 774                 return move_fd(fd, fileno, 0);
 775         }
 776
 777         default:
 778                 assert_not_reached();
 779         }
 780 }
 781
 782 static int chown_terminal(int fd, uid_t uid) {
 783         int r;
 784
 785         assert(fd >= 0);
 786
 787         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 788         if (isatty(fd) < 1) {
 789                 if (IN_SET(errno, EINVAL, ENOTTY))
 790                         return 0; /* not a tty */
 791
 792                 return -errno;
 793         }
 794
 795         /* This might fail. What matters are the results. */
 796         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 797         if (r < 0)
 798                 return r;
 799
 800         return 1;
 801 }
 802
 803 static int setup_confirm_stdio(
 804                 const ExecContext *context,
 805                 const char *vc,
 806                 int *ret_saved_stdin,
 807                 int *ret_saved_stdout) {
 808
 809         _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
 810         unsigned rows, cols;
 811         int r;
 812
 813         assert(ret_saved_stdin);
 814         assert(ret_saved_stdout);
 815
 816         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 817         if (saved_stdin < 0)
 818                 return -errno;
 819
 820         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 821         if (saved_stdout < 0)
 822                 return -errno;
 823
 824         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 825         if (fd < 0)
 826                 return fd;
 827
 828         r = chown_terminal(fd, getuid());
 829         if (r < 0)
 830                 return r;
 831
 832         r = reset_terminal_fd(fd, true);
 833         if (r < 0)
 834                 return r;
 835
 836         r = exec_context_tty_size(context, &rows, &cols);
 837         if (r < 0)
 838                 return r;
 839
 840         r = terminal_set_size_fd(fd, vc, rows, cols);
 841         if (r < 0)
 842                 return r;
 843
 844         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 845         TAKE_FD(fd);
 846         if (r < 0)
 847                 return r;
 848
 849         *ret_saved_stdin = TAKE_FD(saved_stdin);
 850         *ret_saved_stdout = TAKE_FD(saved_stdout);
 851         return 0;
 852 }
 853
 854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 855         assert(err < 0);
 856
 857         if (err == -ETIMEDOUT)
 858                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 859         else {
 860                 errno = -err;
 861                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 862         }
 863 }
 864
 865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 866         _cleanup_close_ int fd = -EBADF;
 867
 868         assert(vc);
 869
 870         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 871         if (fd < 0)
 872                 return;
 873
 874         write_confirm_error_fd(err, fd, u);
 875 }
 876
 877 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 878         int r = 0;
 879
 880         assert(saved_stdin);
 881         assert(saved_stdout);
 882
 883         release_terminal();
 884
 885         if (*saved_stdin >= 0)
 886                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 887                         r = -errno;
 888
 889         if (*saved_stdout >= 0)
 890                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 891                         r = -errno;
 892
 893         *saved_stdin = safe_close(*saved_stdin);
 894         *saved_stdout = safe_close(*saved_stdout);
 895
 896         return r;
 897 }
 898
 899 enum {
 900         CONFIRM_PRETEND_FAILURE = -1,
 901         CONFIRM_PRETEND_SUCCESS =  0,
 902         CONFIRM_EXECUTE = 1,
 903 };
 904
 905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 906         int saved_stdout = -1, saved_stdin = -1, r;
 907         _cleanup_free_ char *e = NULL;
 908         char c;
 909
 910         /* For any internal errors, assume a positive response. */
 911         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 912         if (r < 0) {
 913                 write_confirm_error(r, vc, u);
 914                 return CONFIRM_EXECUTE;
 915         }
 916
 917         /* confirm_spawn might have been disabled while we were sleeping. */
 918         if (manager_is_confirm_spawn_disabled(u->manager)) {
 919                 r = 1;
 920                 goto restore_stdio;
 921         }
 922
 923         e = ellipsize(cmdline, 60, 100);
 924         if (!e) {
 925                 log_oom();
 926                 r = CONFIRM_EXECUTE;
 927                 goto restore_stdio;
 928         }
 929
 930         for (;;) {
 931                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 932                 if (r < 0) {
 933                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 934                         r = CONFIRM_EXECUTE;
 935                         goto restore_stdio;
 936                 }
 937
 938                 switch (c) {
 939                 case 'c':
 940                         printf("Resuming normal execution.\n");
 941                         manager_disable_confirm_spawn();
 942                         r = 1;
 943                         break;
 944                 case 'D':
 945                         unit_dump(u, stdout, "  ");
 946                         continue; /* ask again */
 947                 case 'f':
 948                         printf("Failing execution.\n");
 949                         r = CONFIRM_PRETEND_FAILURE;
 950                         break;
 951                 case 'h':
 952                         printf("  c - continue, proceed without asking anymore\n"
 953                                "  D - dump, show the state of the unit\n"
 954                                "  f - fail, don't execute the command and pretend it failed\n"
 955                                "  h - help\n"
 956                                "  i - info, show a short summary of the unit\n"
 957                                "  j - jobs, show jobs that are in progress\n"
 958                                "  s - skip, don't execute the command and pretend it succeeded\n"
 959                                "  y - yes, execute the command\n");
 960                         continue; /* ask again */
 961                 case 'i':
 962                         printf("  Description: %s\n"
 963                                "  Unit:        %s\n"
 964                                "  Command:     %s\n",
 965                                u->id, u->description, cmdline);
 966                         continue; /* ask again */
 967                 case 'j':
 968                         manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, "  ");
 969                         continue; /* ask again */
 970                 case 'n':
 971                         /* 'n' was removed in favor of 'f'. */
 972                         printf("Didn't understand 'n', did you mean 'f'?\n");
 973                         continue; /* ask again */
 974                 case 's':
 975                         printf("Skipping execution.\n");
 976                         r = CONFIRM_PRETEND_SUCCESS;
 977                         break;
 978                 case 'y':
 979                         r = CONFIRM_EXECUTE;
 980                         break;
 981                 default:
 982                         assert_not_reached();
 983                 }
 984                 break;
 985         }
 986
 987 restore_stdio:
 988         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 989         return r;
 990 }
 991
 992 static int get_fixed_user(const ExecContext *c, const char **user,
 993                           uid_t *uid, gid_t *gid,
 994                           const char **home, const char **shell) {
 995         int r;
 996         const char *name;
 997
 998         assert(c);
 999
1000         if (!c->user)
1001                 return 0;
1002
1003         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1004          * (i.e. are "/" or "/bin/nologin"). */
1005
1006         name = c->user;
1007         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1008         if (r < 0)
1009                 return r;
1010
1011         *user = name;
1012         return 0;
1013 }
1014
1015 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1016         int r;
1017         const char *name;
1018
1019         assert(c);
1020
1021         if (!c->group)
1022                 return 0;
1023
1024         name = c->group;
1025         r = get_group_creds(&name, gid, 0);
1026         if (r < 0)
1027                 return r;
1028
1029         *group = name;
1030         return 0;
1031 }
1032
1033 static int get_supplementary_groups(const ExecContext *c, const char *user,
1034                                     const char *group, gid_t gid,
1035                                     gid_t **supplementary_gids, int *ngids) {
1036         int r, k = 0;
1037         int ngroups_max;
1038         bool keep_groups = false;
1039         gid_t *groups = NULL;
1040         _cleanup_free_ gid_t *l_gids = NULL;
1041
1042         assert(c);
1043
1044         /*
1045          * If user is given, then lookup GID and supplementary groups list.
1046          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1047          * here and as early as possible so we keep the list of supplementary
1048          * groups of the caller.
1049          */
1050         if (user && gid_is_valid(gid) && gid != 0) {
1051                 /* First step, initialize groups from /etc/groups */
1052                 if (initgroups(user, gid) < 0)
1053                         return -errno;
1054
1055                 keep_groups = true;
1056         }
1057
1058         if (strv_isempty(c->supplementary_groups))
1059                 return 0;
1060
1061         /*
1062          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1063          * be positive, otherwise fail.
1064          */
1065         errno = 0;
1066         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1067         if (ngroups_max <= 0)
1068                 return errno_or_else(EOPNOTSUPP);
1069
1070         l_gids = new(gid_t, ngroups_max);
1071         if (!l_gids)
1072                 return -ENOMEM;
1073
1074         if (keep_groups) {
1075                 /*
1076                  * Lookup the list of groups that the user belongs to, we
1077                  * avoid NSS lookups here too for gid=0.
1078                  */
1079                 k = ngroups_max;
1080                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1081                         return -EINVAL;
1082         } else
1083                 k = 0;
1084
1085         STRV_FOREACH(i, c->supplementary_groups) {
1086                 const char *g;
1087
1088                 if (k >= ngroups_max)
1089                         return -E2BIG;
1090
1091                 g = *i;
1092                 r = get_group_creds(&g, l_gids+k, 0);
1093                 if (r < 0)
1094                         return r;
1095
1096                 k++;
1097         }
1098
1099         /*
1100          * Sets ngids to zero to drop all supplementary groups, happens
1101          * when we are under root and SupplementaryGroups= is empty.
1102          */
1103         if (k == 0) {
1104                 *ngids = 0;
1105                 return 0;
1106         }
1107
1108         /* Otherwise get the final list of supplementary groups */
1109         groups = memdup(l_gids, sizeof(gid_t) * k);
1110         if (!groups)
1111                 return -ENOMEM;
1112
1113         *supplementary_gids = groups;
1114         *ngids = k;
1115
1116         groups = NULL;
1117
1118         return 0;
1119 }
1120
/* Applies the supplementary group list and the primary GID to the calling process.
 *
 * The supplementary groups are only set if 'ngids' is positive, the real/effective/saved GID only if
 * 'gid' is valid. Returns 0 on success, the negative error of maybe_setgroups(), or -errno if
 * setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1139
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the resulting value equals the current one (in which case nothing is written), 1 if
 * the securebits were changed, and -errno if reading or writing them via prctl() fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int cur;

        cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = ((unsigned) cur & ~mask) | bits;
        if (wanted == (unsigned) cur)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1158
1159 static int enforce_user(
1160                 const ExecContext *context,
1161                 uid_t uid,
1162                 uint64_t capability_ambient_set) {
1163         assert(context);
1164         int r;
1165
1166         if (!uid_is_valid(uid))
1167                 return 0;
1168
1169         /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1170          * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1171          * case. */
1172
1173         if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1174
1175                 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1176                  * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1177                 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1178                 if (r < 0)
1179                         return r;
1180         }
1181
1182         /* Second step: actually set the uids */
1183         if (setresuid(uid, uid, uid) < 0)
1184                 return -errno;
1185
1186         /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1187          * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1188          * outside of this call. */
1189         return 0;
1190 }
1191
1192 #if HAVE_PAM
1193
1194 static int null_conv(
1195                 int num_msg,
1196                 const struct pam_message **msg,
1197                 struct pam_response **resp,
1198                 void *appdata_ptr) {
1199
1200         /* We don't support conversations */
1201
1202         return PAM_CONV_ERR;
1203 }
1204
1205 #endif
1206
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process ("(sd-pam)")
 * that takes care of closing the session again.
 *
 * 'tty' is registered as PAM_TTY; if it is NULL and stdin turns out to be a TTY, that one is used
 * instead. The entries of *env are exported into the PAM environment before the session is opened. On
 * success *env is replaced by the environment list reported by PAM. 'fds'/'n_fds' are the file
 * descriptors to close in the helper process; 'uid'/'gid' are the credentials the helper drops to.
 *
 * Returns the result of strv_free_and_replace() on success (expected to be 0) and a negative
 * errno-style value on failure; failures reported by PAM itself are all mapped to -EPERM. If built
 * without PAM support this is a NOP that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child was created, hence undo the SIGTERM blocking from above before bailing
                 * out, so that we don't return with a modified signal mask. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with an errno-style error in r. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1423
1424 static void rename_process_from_path(const char *path) {
1425         _cleanup_free_ char *buf = NULL;
1426         const char *p;
1427
1428         assert(path);
1429
1430         /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1431          * /bin/ps */
1432
1433         if (path_extract_filename(path, &buf) < 0) {
1434                 rename_process("(...)");
1435                 return;
1436         }
1437
1438         size_t l = strlen(buf);
1439         if (l > 8) {
1440                 /* The end of the process name is usually more interesting, since the first bit might just be
1441                  * "systemd-" */
1442                 p = buf + l - 8;
1443                 l = 8;
1444         } else
1445                 p = buf;
1446
1447         char process_name[11];
1448         process_name[0] = '(';
1449         memcpy(process_name+1, p, l);
1450         process_name[1+l] = ')';
1451         process_name[1+l+1] = 0;
1452
1453         rename_process(process_name);
1454 }
1455
1456 static bool context_has_address_families(const ExecContext *c) {
1457         assert(c);
1458
1459         return c->address_families_allow_list ||
1460                 !set_isempty(c->address_families);
1461 }
1462
1463 static bool context_has_syscall_filters(const ExecContext *c) {
1464         assert(c);
1465
1466         return c->syscall_allow_list ||
1467                 !hashmap_isempty(c->syscall_filter);
1468 }
1469
1470 static bool context_has_syscall_logs(const ExecContext *c) {
1471         assert(c);
1472
1473         return c->syscall_log_allow_list ||
1474                 !hashmap_isempty(c->syscall_log);
1475 }
1476
1477 static bool context_has_no_new_privileges(const ExecContext *c) {
1478         assert(c);
1479
1480         if (c->no_new_privileges)
1481                 return true;
1482
1483         if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1484                 return false;
1485
1486         /* We need NNP if we have any form of seccomp and are unprivileged */
1487         return c->lock_personality ||
1488                 c->memory_deny_write_execute ||
1489                 c->private_devices ||
1490                 c->protect_clock ||
1491                 c->protect_hostname ||
1492                 c->protect_kernel_tunables ||
1493                 c->protect_kernel_modules ||
1494                 c->protect_kernel_logs ||
1495                 context_has_address_families(c) ||
1496                 exec_context_restrict_namespaces_set(c) ||
1497                 c->restrict_realtime ||
1498                 c->restrict_suid_sgid ||
1499                 !set_isempty(c->syscall_archs) ||
1500                 context_has_syscall_filters(c) ||
1501                 context_has_syscall_logs(c);
1502 }
1503
1504 #if HAVE_SECCOMP
1505
1506 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1507
1508         if (is_seccomp_available())
1509                 return false;
1510
1511         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1512         return true;
1513 }
1514
1515 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1516         uint32_t negative_action, default_action, action;
1517         int r;
1518
1519         assert(u);
1520         assert(c);
1521
1522         if (!context_has_syscall_filters(c))
1523                 return 0;
1524
1525         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1526                 return 0;
1527
1528         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1529
1530         if (c->syscall_allow_list) {
1531                 default_action = negative_action;
1532                 action = SCMP_ACT_ALLOW;
1533         } else {
1534                 default_action = SCMP_ACT_ALLOW;
1535                 action = negative_action;
1536         }
1537
1538         if (needs_ambient_hack) {
1539                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1540                 if (r < 0)
1541                         return r;
1542         }
1543
1544         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1545 }
1546
1547 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1548 #ifdef SCMP_ACT_LOG
1549         uint32_t default_action, action;
1550 #endif
1551
1552         assert(u);
1553         assert(c);
1554
1555         if (!context_has_syscall_logs(c))
1556                 return 0;
1557
1558 #ifdef SCMP_ACT_LOG
1559         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1560                 return 0;
1561
1562         if (c->syscall_log_allow_list) {
1563                 /* Log nothing but the ones listed */
1564                 default_action = SCMP_ACT_ALLOW;
1565                 action = SCMP_ACT_LOG;
1566         } else {
1567                 /* Log everything but the ones listed */
1568                 default_action = SCMP_ACT_LOG;
1569                 action = SCMP_ACT_ALLOW;
1570         }
1571
1572         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1573 #else
1574         /* old libseccomp */
1575         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1576         return 0;
1577 #endif
1578 }
1579
1580 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1581         assert(u);
1582         assert(c);
1583
1584         if (set_isempty(c->syscall_archs))
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1588                 return 0;
1589
1590         return seccomp_restrict_archs(c->syscall_archs);
1591 }
1592
1593 static int apply_address_families(const Unit* u, const ExecContext *c) {
1594         assert(u);
1595         assert(c);
1596
1597         if (!context_has_address_families(c))
1598                 return 0;
1599
1600         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1601                 return 0;
1602
1603         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1604 }
1605
1606 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1607         int r;
1608
1609         assert(u);
1610         assert(c);
1611
1612         if (!c->memory_deny_write_execute)
1613                 return 0;
1614
1615         /* use prctl() if kernel supports it (6.3) */
1616         r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1617         if (r == 0) {
1618                 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1619                 return 0;
1620         }
1621         if (r < 0 && errno != EINVAL)
1622                 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1623         /* else use seccomp */
1624         log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1625
1626         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1627                 return 0;
1628
1629         return seccomp_memory_deny_write_execute();
1630 }
1631
1632 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1633         assert(u);
1634         assert(c);
1635
1636         if (!c->restrict_realtime)
1637                 return 0;
1638
1639         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1640                 return 0;
1641
1642         return seccomp_restrict_realtime();
1643 }
1644
1645 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1646         assert(u);
1647         assert(c);
1648
1649         if (!c->restrict_suid_sgid)
1650                 return 0;
1651
1652         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1653                 return 0;
1654
1655         return seccomp_restrict_suid_sgid();
1656 }
1657
1658 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1659         assert(u);
1660         assert(c);
1661
1662         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1663          * let's protect even those systems where this is left on in the kernel. */
1664
1665         if (!c->protect_kernel_tunables)
1666                 return 0;
1667
1668         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1669                 return 0;
1670
1671         return seccomp_protect_sysctl();
1672 }
1673
1674 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1675         assert(u);
1676         assert(c);
1677
1678         /* Turn off module syscalls on ProtectKernelModules=yes */
1679
1680         if (!c->protect_kernel_modules)
1681                 return 0;
1682
1683         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1684                 return 0;
1685
1686         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1687 }
1688
1689 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1690         assert(u);
1691         assert(c);
1692
1693         if (!c->protect_kernel_logs)
1694                 return 0;
1695
1696         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1697                 return 0;
1698
1699         return seccomp_protect_syslog();
1700 }
1701
1702 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1703         assert(u);
1704         assert(c);
1705
1706         if (!c->protect_clock)
1707                 return 0;
1708
1709         if (skip_seccomp_unavailable(u, "ProtectClock="))
1710                 return 0;
1711
1712         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1713 }
1714
1715 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1716         assert(u);
1717         assert(c);
1718
1719         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1720
1721         if (!c->private_devices)
1722                 return 0;
1723
1724         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1725                 return 0;
1726
1727         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1728 }
1729
1730 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1731         assert(u);
1732         assert(c);
1733
1734         if (!exec_context_restrict_namespaces_set(c))
1735                 return 0;
1736
1737         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1738                 return 0;
1739
1740         return seccomp_restrict_namespaces(c->restrict_namespaces);
1741 }
1742
1743 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1744         unsigned long personality;
1745         int r;
1746
1747         assert(u);
1748         assert(c);
1749
1750         if (!c->lock_personality)
1751                 return 0;
1752
1753         if (skip_seccomp_unavailable(u, "LockPersonality="))
1754                 return 0;
1755
1756         personality = c->personality;
1757
1758         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1759         if (personality == PERSONALITY_INVALID) {
1760
1761                 r = opinionated_personality(&personality);
1762                 if (r < 0)
1763                         return r;
1764         }
1765
1766         return seccomp_lock_personality(personality);
1767 }
1768
1769 #endif
1770
1771 #if HAVE_LIBBPF
1772 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1773         assert(u);
1774         assert(c);
1775
1776         if (!exec_context_restrict_filesystems_set(c))
1777                 return 0;
1778
1779         if (!u->manager->restrict_fs) {
1780                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1781                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1782                 return 0;
1783         }
1784
1785         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1786 }
1787 #endif
1788
1789 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1790         assert(u);
1791         assert(c);
1792
1793         if (!c->protect_hostname)
1794                 return 0;
1795
1796         if (ns_type_supported(NAMESPACE_UTS)) {
1797                 if (unshare(CLONE_NEWUTS) < 0) {
1798                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1799                                 *ret_exit_status = EXIT_NAMESPACE;
1800                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1801                         }
1802
1803                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1804                 }
1805         } else
1806                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1807
1808 #if HAVE_SECCOMP
1809         int r;
1810
1811         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1812                 return 0;
1813
1814         r = seccomp_protect_hostname();
1815         if (r < 0) {
1816                 *ret_exit_status = EXIT_SECCOMP;
1817                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1818         }
1819 #endif
1820
1821         return 0;
1822 }
1823
1824 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1825         assert(idle_pipe);
1826
1827         idle_pipe[1] = safe_close(idle_pipe[1]);
1828         idle_pipe[2] = safe_close(idle_pipe[2]);
1829
1830         if (idle_pipe[0] >= 0) {
1831                 int r;
1832
1833                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1834
1835                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1836                         ssize_t n;
1837
1838                         /* Signal systemd that we are bored and want to continue. */
1839                         n = write(idle_pipe[3], "x", 1);
1840                         if (n > 0)
1841                                 /* Wait for systemd to react to the signal above. */
1842                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1843                 }
1844
1845                 idle_pipe[0] = safe_close(idle_pipe[0]);
1846
1847         }
1848
1849         idle_pipe[3] = safe_close(idle_pipe[3]);
1850 }
1851
1852 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1853
1854 static int build_environment(
1855                 const Unit *u,
1856                 const ExecContext *c,
1857                 const ExecParameters *p,
1858                 const CGroupContext *cgroup_context,
1859                 size_t n_fds,
1860                 char **fdnames,
1861                 const char *home,
1862                 const char *username,
1863                 const char *shell,
1864                 dev_t journal_stream_dev,
1865                 ino_t journal_stream_ino,
1866                 const char *memory_pressure_path,
1867                 char ***ret) {
1868
1869         _cleanup_strv_free_ char **our_env = NULL;
1870         size_t n_env = 0;
1871         char *x;
1872         int r;
1873
1874         assert(u);
1875         assert(c);
1876         assert(p);
1877         assert(ret);
1878
1879 #define N_ENV_VARS 19
1880         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1881         if (!our_env)
1882                 return -ENOMEM;
1883
1884         if (n_fds > 0) {
1885                 _cleanup_free_ char *joined = NULL;
1886
1887                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1888                         return -ENOMEM;
1889                 our_env[n_env++] = x;
1890
1891                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1892                         return -ENOMEM;
1893                 our_env[n_env++] = x;
1894
1895                 joined = strv_join(fdnames, ":");
1896                 if (!joined)
1897                         return -ENOMEM;
1898
1899                 x = strjoin("LISTEN_FDNAMES=", joined);
1900                 if (!x)
1901                         return -ENOMEM;
1902                 our_env[n_env++] = x;
1903         }
1904
1905         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1906                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1907                         return -ENOMEM;
1908                 our_env[n_env++] = x;
1909
1910                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1911                         return -ENOMEM;
1912                 our_env[n_env++] = x;
1913         }
1914
1915         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1916          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1917          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1918         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1919                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1920                 if (!x)
1921                         return -ENOMEM;
1922                 our_env[n_env++] = x;
1923         }
1924
1925         if (home) {
1926                 x = strjoin("HOME=", home);
1927                 if (!x)
1928                         return -ENOMEM;
1929
1930                 path_simplify(x + 5);
1931                 our_env[n_env++] = x;
1932         }
1933
1934         if (username) {
1935                 x = strjoin("LOGNAME=", username);
1936                 if (!x)
1937                         return -ENOMEM;
1938                 our_env[n_env++] = x;
1939
1940                 x = strjoin("USER=", username);
1941                 if (!x)
1942                         return -ENOMEM;
1943                 our_env[n_env++] = x;
1944         }
1945
1946         if (shell) {
1947                 x = strjoin("SHELL=", shell);
1948                 if (!x)
1949                         return -ENOMEM;
1950
1951                 path_simplify(x + 6);
1952                 our_env[n_env++] = x;
1953         }
1954
1955         if (!sd_id128_is_null(u->invocation_id)) {
1956                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1957                         return -ENOMEM;
1958
1959                 our_env[n_env++] = x;
1960         }
1961
1962         if (exec_context_needs_term(c)) {
1963                 _cleanup_free_ char *cmdline = NULL;
1964                 const char *tty_path, *term = NULL;
1965
1966                 tty_path = exec_context_tty_path(c);
1967
1968                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1969                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1970                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1971
1972                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1973                         term = getenv("TERM");
1974                 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1975                         _cleanup_free_ char *key = NULL;
1976
1977                         key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1978                         if (!key)
1979                                 return -ENOMEM;
1980
1981                         r = proc_cmdline_get_key(key, 0, &cmdline);
1982                         if (r < 0)
1983                                 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1984                         else if (r > 0)
1985                                 term = cmdline;
1986                 }
1987
1988                 if (!term)
1989                         term = default_term_for_tty(tty_path);
1990
1991                 x = strjoin("TERM=", term);
1992                 if (!x)
1993                         return -ENOMEM;
1994                 our_env[n_env++] = x;
1995         }
1996
1997         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1998                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1999                         return -ENOMEM;
2000
2001                 our_env[n_env++] = x;
2002         }
2003
2004         if (c->log_namespace) {
2005                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2006                 if (!x)
2007                         return -ENOMEM;
2008
2009                 our_env[n_env++] = x;
2010         }
2011
2012         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2013                 _cleanup_free_ char *joined = NULL;
2014                 const char *n;
2015
2016                 if (!p->prefix[t])
2017                         continue;
2018
2019                 if (c->directories[t].n_items == 0)
2020                         continue;
2021
2022                 n = exec_directory_env_name_to_string(t);
2023                 if (!n)
2024                         continue;
2025
2026                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2027                         _cleanup_free_ char *prefixed = NULL;
2028
2029                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2030                         if (!prefixed)
2031                                 return -ENOMEM;
2032
2033                         if (!strextend_with_separator(&joined, ":", prefixed))
2034                                 return -ENOMEM;
2035                 }
2036
2037                 x = strjoin(n, "=", joined);
2038                 if (!x)
2039                         return -ENOMEM;
2040
2041                 our_env[n_env++] = x;
2042         }
2043
2044         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2045                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2046                 if (!x)
2047                         return -ENOMEM;
2048
2049                 our_env[n_env++] = x;
2050         }
2051
2052         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2053                 return -ENOMEM;
2054
2055         our_env[n_env++] = x;
2056
2057         if (memory_pressure_path) {
2058                 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2059                 if (!x)
2060                         return -ENOMEM;
2061
2062                 our_env[n_env++] = x;
2063
2064                 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2065                         _cleanup_free_ char *b = NULL, *e = NULL;
2066
2067                         if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2068                                      MEMORY_PRESSURE_DEFAULT_TYPE,
2069                                      cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2070                                      CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2071                                      MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2072                                 return -ENOMEM;
2073
2074                         if (base64mem(b, strlen(b) + 1, &e) < 0)
2075                                 return -ENOMEM;
2076
2077                         x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2078                         if (!x)
2079                                 return -ENOMEM;
2080
2081                         our_env[n_env++] = x;
2082                 }
2083         }
2084
2085         assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2086 #undef N_ENV_VARS
2087
2088         *ret = TAKE_PTR(our_env);
2089
2090         return 0;
2091 }
2092
2093 static int build_pass_environment(const ExecContext *c, char ***ret) {
2094         _cleanup_strv_free_ char **pass_env = NULL;
2095         size_t n_env = 0;
2096
2097         STRV_FOREACH(i, c->pass_environment) {
2098                 _cleanup_free_ char *x = NULL;
2099                 char *v;
2100
2101                 v = getenv(*i);
2102                 if (!v)
2103                         continue;
2104                 x = strjoin(*i, "=", v);
2105                 if (!x)
2106                         return -ENOMEM;
2107
2108                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2109                         return -ENOMEM;
2110
2111                 pass_env[n_env++] = TAKE_PTR(x);
2112                 pass_env[n_env] = NULL;
2113         }
2114
2115         *ret = TAKE_PTR(pass_env);
2116
2117         return 0;
2118 }
2119
2120 bool exec_needs_network_namespace(const ExecContext *context) {
2121         assert(context);
2122
2123         return context->private_network || context->network_namespace_path;
2124 }
2125
2126 static bool exec_needs_ephemeral(const ExecContext *context) {
2127         return (context->root_image || context->root_directory) && context->root_ephemeral;
2128 }
2129
2130 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2131         assert(context);
2132
2133         return context->private_ipc || context->ipc_namespace_path;
2134 }
2135
2136 bool exec_needs_mount_namespace(
2137                 const ExecContext *context,
2138                 const ExecParameters *params,
2139                 const ExecRuntime *runtime) {
2140
2141         assert(context);
2142
2143         if (context->root_image)
2144                 return true;
2145
2146         if (!strv_isempty(context->read_write_paths) ||
2147             !strv_isempty(context->read_only_paths) ||
2148             !strv_isempty(context->inaccessible_paths) ||
2149             !strv_isempty(context->exec_paths) ||
2150             !strv_isempty(context->no_exec_paths))
2151                 return true;
2152
2153         if (context->n_bind_mounts > 0)
2154                 return true;
2155
2156         if (context->n_temporary_filesystems > 0)
2157                 return true;
2158
2159         if (context->n_mount_images > 0)
2160                 return true;
2161
2162         if (context->n_extension_images > 0)
2163                 return true;
2164
2165         if (!strv_isempty(context->extension_directories))
2166                 return true;
2167
2168         if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2169                 return true;
2170
2171         if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2172                 return true;
2173
2174         if (context->private_devices ||
2175             context->private_mounts > 0 ||
2176             (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2177             context->protect_system != PROTECT_SYSTEM_NO ||
2178             context->protect_home != PROTECT_HOME_NO ||
2179             context->protect_kernel_tunables ||
2180             context->protect_kernel_modules ||
2181             context->protect_kernel_logs ||
2182             context->protect_control_groups ||
2183             context->protect_proc != PROTECT_PROC_DEFAULT ||
2184             context->proc_subset != PROC_SUBSET_ALL ||
2185             exec_needs_ipc_namespace(context))
2186                 return true;
2187
2188         if (context->root_directory) {
2189                 if (exec_context_get_effective_mount_apivfs(context))
2190                         return true;
2191
2192                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2193                         if (params && !params->prefix[t])
2194                                 continue;
2195
2196                         if (context->directories[t].n_items > 0)
2197                                 return true;
2198                 }
2199         }
2200
2201         if (context->dynamic_user &&
2202             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2203              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2204              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2205                 return true;
2206
2207         if (context->log_namespace)
2208                 return true;
2209
2210         return false;
2211 }
2212
2213 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2214         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2215         _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2216         _cleanup_close_ int unshare_ready_fd = -EBADF;
2217         _cleanup_(sigkill_waitp) pid_t pid = 0;
2218         uint64_t c = 1;
2219         ssize_t n;
2220         int r;
2221
2222         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2223          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2224          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2225          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2226          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2227          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2228          * continues execution normally.
2229          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2230          * does not need CAP_SETUID to write the single line mapping to itself. */
2231
2232         /* Can only set up multiple mappings with CAP_SETUID. */
2233         if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2234                 r = asprintf(&uid_map,
2235                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2236                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2237                              ouid, ouid, uid, uid);
2238         else
2239                 r = asprintf(&uid_map,
2240                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2241                              ouid, ouid);
2242
2243         if (r < 0)
2244                 return -ENOMEM;
2245
2246         /* Can only set up multiple mappings with CAP_SETGID. */
2247         if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2248                 r = asprintf(&gid_map,
2249                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2250                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2251                              ogid, ogid, gid, gid);
2252         else
2253                 r = asprintf(&gid_map,
2254                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2255                              ogid, ogid);
2256
2257         if (r < 0)
2258                 return -ENOMEM;
2259
2260         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2261          * namespace. */
2262         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2263         if (unshare_ready_fd < 0)
2264                 return -errno;
2265
2266         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2267          * failed. */
2268         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2269                 return -errno;
2270
2271         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2272         if (r < 0)
2273                 return r;
2274         if (r == 0) {
2275                 _cleanup_close_ int fd = -EBADF;
2276                 const char *a;
2277                 pid_t ppid;
2278
2279                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2280                  * here, after the parent opened its own user namespace. */
2281
2282                 ppid = getppid();
2283                 errno_pipe[0] = safe_close(errno_pipe[0]);
2284
2285                 /* Wait until the parent unshared the user namespace */
2286                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2287                         r = -errno;
2288                         goto child_fail;
2289                 }
2290
2291                 /* Disable the setgroups() system call in the child user namespace, for good. */
2292                 a = procfs_file_alloca(ppid, "setgroups");
2293                 fd = open(a, O_WRONLY|O_CLOEXEC);
2294                 if (fd < 0) {
2295                         if (errno != ENOENT) {
2296                                 r = -errno;
2297                                 goto child_fail;
2298                         }
2299
2300                         /* If the file is missing the kernel is too old, let's continue anyway. */
2301                 } else {
2302                         if (write(fd, "deny\n", 5) < 0) {
2303                                 r = -errno;
2304                                 goto child_fail;
2305                         }
2306
2307                         fd = safe_close(fd);
2308                 }
2309
2310                 /* First write the GID map */
2311                 a = procfs_file_alloca(ppid, "gid_map");
2312                 fd = open(a, O_WRONLY|O_CLOEXEC);
2313                 if (fd < 0) {
2314                         r = -errno;
2315                         goto child_fail;
2316                 }
2317                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2318                         r = -errno;
2319                         goto child_fail;
2320                 }
2321                 fd = safe_close(fd);
2322
2323                 /* The write the UID map */
2324                 a = procfs_file_alloca(ppid, "uid_map");
2325                 fd = open(a, O_WRONLY|O_CLOEXEC);
2326                 if (fd < 0) {
2327                         r = -errno;
2328                         goto child_fail;
2329                 }
2330                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2331                         r = -errno;
2332                         goto child_fail;
2333                 }
2334
2335                 _exit(EXIT_SUCCESS);
2336
2337         child_fail:
2338                 (void) write(errno_pipe[1], &r, sizeof(r));
2339                 _exit(EXIT_FAILURE);
2340         }
2341
2342         errno_pipe[1] = safe_close(errno_pipe[1]);
2343
2344         if (unshare(CLONE_NEWUSER) < 0)
2345                 return -errno;
2346
2347         /* Let the child know that the namespace is ready now */
2348         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2349                 return -errno;
2350
2351         /* Try to read an error code from the child */
2352         n = read(errno_pipe[0], &r, sizeof(r));
2353         if (n < 0)
2354                 return -errno;
2355         if (n == sizeof(r)) { /* an error code was sent to us */
2356                 if (r < 0)
2357                         return r;
2358                 return -EIO;
2359         }
2360         if (n != 0) /* on success we should have read 0 bytes */
2361                 return -EIO;
2362
2363         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2364         if (r < 0)
2365                 return r;
2366         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2367                 return -EIO;
2368
2369         return 0;
2370 }
2371
2372 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2373         assert(context);
2374
2375         if (!context->dynamic_user)
2376                 return false;
2377
2378         if (type == EXEC_DIRECTORY_CONFIGURATION)
2379                 return false;
2380
2381         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2382                 return false;
2383
2384         return true;
2385 }
2386
2387 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2388         _cleanup_free_ char *src_abs = NULL;
2389         int r;
2390
2391         assert(source);
2392
2393         src_abs = path_join(root, source);
2394         if (!src_abs)
2395                 return -ENOMEM;
2396
2397         STRV_FOREACH(dst, symlinks) {
2398                 _cleanup_free_ char *dst_abs = NULL;
2399
2400                 dst_abs = path_join(root, *dst);
2401                 if (!dst_abs)
2402                         return -ENOMEM;
2403
2404                 r = mkdir_parents_label(dst_abs, 0755);
2405                 if (r < 0)
2406                         return r;
2407
2408                 r = symlink_idempotent(src_abs, dst_abs, true);
2409                 if (r < 0)
2410                         return r;
2411         }
2412
2413         return 0;
2414 }
2415
2416 static int setup_exec_directory(
2417                 Unit *u,
2418                 const ExecContext *context,
2419                 const ExecParameters *params,
2420                 uid_t uid,
2421                 gid_t gid,
2422                 ExecDirectoryType type,
2423                 bool needs_mount_namespace,
2424                 int *exit_status) {
2425
2426         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2427                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2428                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2429                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2430                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2431                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2432         };
2433         int r;
2434
2435         assert(context);
2436         assert(params);
2437         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2438         assert(exit_status);
2439
2440         if (!params->prefix[type])
2441                 return 0;
2442
2443         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2444                 if (!uid_is_valid(uid))
2445                         uid = 0;
2446                 if (!gid_is_valid(gid))
2447                         gid = 0;
2448         }
2449
2450         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2451                 _cleanup_free_ char *p = NULL, *pp = NULL;
2452
2453                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2454                 if (!p) {
2455                         r = -ENOMEM;
2456                         goto fail;
2457                 }
2458
2459                 r = mkdir_parents_label(p, 0755);
2460                 if (r < 0)
2461                         goto fail;
2462
2463                 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2464
2465                         /* If we are in user mode, and a configuration directory exists but a state directory
2466                          * doesn't exist, then we likely are upgrading from an older systemd version that
2467                          * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2468                          * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
                         * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
2470                          * separated. If a service has both dirs configured but only the configuration dir
2471                          * exists and the state dir does not, we assume we are looking at an update
2472                          * situation. Hence, create a compatibility symlink, so that all expectations are
2473                          * met.
2474                          *
2475                          * (We also do something similar with the log directory, which still doesn't exist in
2476                          * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2477
2478                         /* this assumes the state dir is always created before the configuration dir */
2479                         assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2480                         assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2481
2482                         r = laccess(p, F_OK);
2483                         if (r == -ENOENT) {
2484                                 _cleanup_free_ char *q = NULL;
2485
2486                                 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2487                                  * under the configuration hierarchy. */
2488
2489                                 if (type == EXEC_DIRECTORY_STATE)
2490                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2491                                 else if (type == EXEC_DIRECTORY_LOGS)
2492                                         q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2493                                 else
2494                                         assert_not_reached();
2495                                 if (!q) {
2496                                         r = -ENOMEM;
2497                                         goto fail;
2498                                 }
2499
2500                                 r = laccess(q, F_OK);
2501                                 if (r >= 0) {
2502                                         /* It does exist! This hence looks like an update. Symlink the
2503                                          * configuration directory into the state directory. */
2504
2505                                         r = symlink_idempotent(q, p, /* make_relative= */ true);
2506                                         if (r < 0)
2507                                                 goto fail;
2508
2509                                         log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2510                                         continue;
2511                                 } else if (r != -ENOENT)
2512                                         log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2513
2514                         } else if (r < 0)
2515                                 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2516                 }
2517
2518                 if (exec_directory_is_private(context, type)) {
2519                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2520                          * case we want to avoid leaving a directory around fully accessible that is owned by
2521                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2522                          * trick used by container managers to prohibit host users to get access to files of
2523                          * the same UID in containers: we place everything inside a directory that has an
2524                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2525                          * for unprivileged host code. We then use fs namespacing to make this directory
2526                          * permeable for the service itself.
2527                          *
2528                          * Specifically: for a service which wants a special directory "foo/" we first create
2529                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2530                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2531                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2532                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2533                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2534                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2535                          * for the service and making sure it only gets access to the dirs it needs but no
2536                          * others. Tricky? Yes, absolutely, but it works!
2537                          *
2538                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2539                          * to be owned by the service itself.
2540                          *
2541                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2542                          * for sharing files or sockets with other services. */
2543
2544                         pp = path_join(params->prefix[type], "private");
2545                         if (!pp) {
2546                                 r = -ENOMEM;
2547                                 goto fail;
2548                         }
2549
2550                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2551                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2552                         if (r < 0)
2553                                 goto fail;
2554
2555                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2556                                 r = -ENOMEM;
2557                                 goto fail;
2558                         }
2559
2560                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2561                         r = mkdir_parents_label(pp, 0755);
2562                         if (r < 0)
2563                                 goto fail;
2564
2565                         if (is_dir(p, false) > 0 &&
2566                             (laccess(pp, F_OK) == -ENOENT)) {
2567
2568                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2569                                  * it over. Most likely the service has been upgraded from one that didn't use
2570                                  * DynamicUser=1, to one that does. */
2571
2572                                 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2573                                               "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2574                                               exec_directory_type_to_string(type), p, pp);
2575
2576                                 r = RET_NERRNO(rename(p, pp));
2577                                 if (r < 0)
2578                                         goto fail;
2579                         } else {
2580                                 /* Otherwise, create the actual directory for the service */
2581
2582                                 r = mkdir_label(pp, context->directories[type].mode);
2583                                 if (r < 0 && r != -EEXIST)
2584                                         goto fail;
2585                         }
2586
2587                         if (!context->directories[type].items[i].only_create) {
2588                                 /* And link it up from the original place.
2589                                  * Notes
2590                                  * 1) If a mount namespace is going to be used, then this symlink remains on
2591                                  *    the host, and a new one for the child namespace will be created later.
2592                                  * 2) It is not necessary to create this symlink when one of its parent
2593                                  *    directories is specified and already created. E.g.
2594                                  *        StateDirectory=foo foo/bar
2595                                  *    In that case, the inode points to pp and p for "foo/bar" are the same:
2596                                  *        pp = "/var/lib/private/foo/bar"
2597                                  *        p = "/var/lib/foo/bar"
2598                                  *    and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2599                                  *    we do not need to create the symlink, but we cannot create the symlink.
2600                                  *    See issue #24783. */
2601                                 r = symlink_idempotent(pp, p, true);
2602                                 if (r < 0)
2603                                         goto fail;
2604                         }
2605
2606                 } else {
2607                         _cleanup_free_ char *target = NULL;
2608
2609                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2610                             readlink_and_make_absolute(p, &target) >= 0) {
2611                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2612
2613                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2614                                  * by DynamicUser=1 (see above)?
2615                                  *
2616                                  * We do this for all directory types except for ConfigurationDirectory=,
2617                                  * since they all support the private/ symlink logic at least in some
2618                                  * configurations, see above. */
2619
2620                                 r = chase(target, NULL, 0, &target_resolved, NULL);
2621                                 if (r < 0)
2622                                         goto fail;
2623
2624                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2625                                 if (!q) {
2626                                         r = -ENOMEM;
2627                                         goto fail;
2628                                 }
2629
2630                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2631                                 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2632                                 if (r < 0)
2633                                         goto fail;
2634
2635                                 if (path_equal(q_resolved, target_resolved)) {
2636
2637                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2638                                          * but is no longer. Let's move the directory back up. */
2639
2640                                         log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2641                                                       "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2642                                                       exec_directory_type_to_string(type), q, p);
2643
2644                                         r = RET_NERRNO(unlink(p));
2645                                         if (r < 0)
2646                                                 goto fail;
2647
2648                                         r = RET_NERRNO(rename(q, p));
2649                                         if (r < 0)
2650                                                 goto fail;
2651                                 }
2652                         }
2653
2654                         r = mkdir_label(p, context->directories[type].mode);
2655                         if (r < 0) {
2656                                 if (r != -EEXIST)
2657                                         goto fail;
2658
2659                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2660                                         struct stat st;
2661
2662                                         /* Don't change the owner/access mode of the configuration directory,
2663                                          * as in the common case it is not written to by a service, and shall
2664                                          * not be writable. */
2665
2666                                         r = RET_NERRNO(stat(p, &st));
2667                                         if (r < 0)
2668                                                 goto fail;
2669
2670                                         /* Still complain if the access mode doesn't match */
2671                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2672                                                 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2673                                                                  "(File system: %o %sMode: %o)",
2674                                                                  exec_directory_type_to_string(type), context->directories[type].items[i].path,
2675                                                                  st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2676
2677                                         continue;
2678                                 }
2679                         }
2680                 }
2681
2682                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2683                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2684                  * current UID/GID ownership.) */
2685                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2686                 if (r < 0)
2687                         goto fail;
2688
2689                 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2690                  * available to user code anyway */
2691                 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2692                         continue;
2693
2694                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2695                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2696                  * assignments to exist. */
2697                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2698                 if (r < 0)
2699                         goto fail;
2700         }
2701
2702         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2703          * they are set up later, to allow configuring empty var/run/etc. */
2704         if (!needs_mount_namespace)
2705                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2706                         r = create_many_symlinks(params->prefix[type],
2707                                                  context->directories[type].items[i].path,
2708                                                  context->directories[type].items[i].symlinks);
2709                         if (r < 0)
2710                                 goto fail;
2711                 }
2712
2713         return 0;
2714
2715 fail:
2716         *exit_status = exit_status_table[type];
2717         return r;
2718 }
2719
2720 #if ENABLE_SMACK
2721 static int setup_smack(
2722                 const Manager *manager,
2723                 const ExecContext *context,
2724                 int executable_fd) {
2725         int r;
2726
2727         assert(context);
2728         assert(executable_fd >= 0);
2729
2730         if (context->smack_process_label) {
2731                 r = mac_smack_apply_pid(0, context->smack_process_label);
2732                 if (r < 0)
2733                         return r;
2734         } else if (manager->default_smack_process_label) {
2735                 _cleanup_free_ char *exec_label = NULL;
2736
2737                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
2738                 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
2739                         return r;
2740
2741                 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
2742                 if (r < 0)
2743                         return r;
2744         }
2745
2746         return 0;
2747 }
2748 #endif
2749
2750 static int compile_bind_mounts(
2751                 const ExecContext *context,
2752                 const ExecParameters *params,
2753                 BindMount **ret_bind_mounts,
2754                 size_t *ret_n_bind_mounts,
2755                 char ***ret_empty_directories) {
2756
2757         _cleanup_strv_free_ char **empty_directories = NULL;
2758         BindMount *bind_mounts = NULL;
2759         size_t n, h = 0;
2760         int r;
2761
2762         assert(context);
2763         assert(params);
2764         assert(ret_bind_mounts);
2765         assert(ret_n_bind_mounts);
2766         assert(ret_empty_directories);
2767
2768         CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2769
2770         n = context->n_bind_mounts;
2771         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2772                 if (!params->prefix[t])
2773                         continue;
2774
2775                 for (size_t i = 0; i < context->directories[t].n_items; i++)
2776                         n += !context->directories[t].items[i].only_create;
2777         }
2778
2779         if (n <= 0) {
2780                 *ret_bind_mounts = NULL;
2781                 *ret_n_bind_mounts = 0;
2782                 *ret_empty_directories = NULL;
2783                 return 0;
2784         }
2785
2786         bind_mounts = new(BindMount, n);
2787         if (!bind_mounts)
2788                 return -ENOMEM;
2789
2790         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2791                 BindMount *item = context->bind_mounts + i;
2792                 _cleanup_free_ char *s = NULL, *d = NULL;
2793
2794                 s = strdup(item->source);
2795                 if (!s)
2796                         return -ENOMEM;
2797
2798                 d = strdup(item->destination);
2799                 if (!d)
2800                         return -ENOMEM;
2801
2802                 bind_mounts[h++] = (BindMount) {
2803                         .source = TAKE_PTR(s),
2804                         .destination = TAKE_PTR(d),
2805                         .read_only = item->read_only,
2806                         .recursive = item->recursive,
2807                         .ignore_enoent = item->ignore_enoent,
2808                 };
2809         }
2810
2811         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2812                 if (!params->prefix[t])
2813                         continue;
2814
2815                 if (context->directories[t].n_items == 0)
2816                         continue;
2817
2818                 if (exec_directory_is_private(context, t) &&
2819                     !exec_context_with_rootfs(context)) {
2820                         char *private_root;
2821
2822                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2823                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2824                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2825
2826                         private_root = path_join(params->prefix[t], "private");
2827                         if (!private_root)
2828                                 return -ENOMEM;
2829
2830                         r = strv_consume(&empty_directories, private_root);
2831                         if (r < 0)
2832                                 return r;
2833                 }
2834
2835                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2836                         _cleanup_free_ char *s = NULL, *d = NULL;
2837
2838                         /* When one of the parent directories is in the list, we cannot create the symlink
2839                          * for the child directory. See also the comments in setup_exec_directory(). */
2840                         if (context->directories[t].items[i].only_create)
2841                                 continue;
2842
2843                         if (exec_directory_is_private(context, t))
2844                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2845                         else
2846                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2847                         if (!s)
2848                                 return -ENOMEM;
2849
2850                         if (exec_directory_is_private(context, t) &&
2851                             exec_context_with_rootfs(context))
2852                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2853                                  * directory is not created on the root directory. So, let's bind-mount the directory
2854                                  * on the 'non-private' place. */
2855                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2856                         else
2857                                 d = strdup(s);
2858                         if (!d)
2859                                 return -ENOMEM;
2860
2861                         bind_mounts[h++] = (BindMount) {
2862                                 .source = TAKE_PTR(s),
2863                                 .destination = TAKE_PTR(d),
2864                                 .read_only = false,
2865                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2866                                 .recursive = true,
2867                                 .ignore_enoent = false,
2868                         };
2869                 }
2870         }
2871
2872         assert(h == n);
2873
2874         *ret_bind_mounts = TAKE_PTR(bind_mounts);
2875         *ret_n_bind_mounts = n;
2876         *ret_empty_directories = TAKE_PTR(empty_directories);
2877
2878         return (int) n;
2879 }
2880
2881 /* ret_symlinks will contain a list of pairs src:dest that describes
2882  * the symlinks to create later on. For example, the symlinks needed
2883  * to safely give private directories to DynamicUser=1 users. */
2884 static int compile_symlinks(
2885                 const ExecContext *context,
2886                 const ExecParameters *params,
2887                 bool setup_os_release_symlink,
2888                 char ***ret_symlinks) {
2889
2890         _cleanup_strv_free_ char **symlinks = NULL;
2891         int r;
2892
2893         assert(context);
2894         assert(params);
2895         assert(ret_symlinks);
2896
2897         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2898                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2899                         _cleanup_free_ char *private_path = NULL, *path = NULL;
2900
2901                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2902                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2903
2904                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2905                                 dst_abs = path_join(params->prefix[dt], *symlink);
2906                                 if (!src_abs || !dst_abs)
2907                                         return -ENOMEM;
2908
2909                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2910                                 if (r < 0)
2911                                         return r;
2912                         }
2913
2914                         if (!exec_directory_is_private(context, dt) ||
2915                             exec_context_with_rootfs(context) ||
2916                             context->directories[dt].items[i].only_create)
2917                                 continue;
2918
2919                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2920                         if (!private_path)
2921                                 return -ENOMEM;
2922
2923                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924                         if (!path)
2925                                 return -ENOMEM;
2926
2927                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2928                         if (r < 0)
2929                                 return r;
2930                 }
2931         }
2932
2933         /* We make the host's os-release available via a symlink, so that we can copy it atomically
2934          * and readers will never get a half-written version. Note that, while the paths specified here are
2935          * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2936          * 'os-release -> .os-release-stage/os-release' is what will be created. */
2937         if (setup_os_release_symlink) {
2938                 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2939                 if (r < 0)
2940                         return r;
2941
2942                 r = strv_extend(&symlinks, "/run/host/os-release");
2943                 if (r < 0)
2944                         return r;
2945         }
2946
2947         *ret_symlinks = TAKE_PTR(symlinks);
2948
2949         return 0;
2950 }
2951
2952 static bool insist_on_sandboxing(
2953                 const ExecContext *context,
2954                 const char *root_dir,
2955                 const char *root_image,
2956                 const BindMount *bind_mounts,
2957                 size_t n_bind_mounts) {
2958
2959         assert(context);
2960         assert(n_bind_mounts == 0 || bind_mounts);
2961
2962         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2963          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2964          * rearrange stuff in a way we cannot ignore gracefully. */
2965
2966         if (context->n_temporary_filesystems > 0)
2967                 return true;
2968
2969         if (root_dir || root_image)
2970                 return true;
2971
2972         if (context->n_mount_images > 0)
2973                 return true;
2974
2975         if (context->dynamic_user)
2976                 return true;
2977
2978         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2979                 return true;
2980
2981         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2982          * essential. */
2983         for (size_t i = 0; i < n_bind_mounts; i++)
2984                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2985                         return true;
2986
2987         if (context->log_namespace)
2988                 return true;
2989
2990         return false;
2991 }
2992
2993 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2994         _cleanup_close_ int fd = -EBADF;
2995         int r;
2996
2997         if (!runtime || !runtime->ephemeral_copy)
2998                 return 0;
2999
3000         r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3001         if (r < 0)
3002                 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3003
3004         CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3005
3006         fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3007         if (fd >= 0)
3008                 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3009                 return 0;
3010
3011         if (fd != -EAGAIN)
3012                 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3013
3014         log_debug("Making ephemeral snapshot of %s to %s",
3015                   context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3016
3017         if (context->root_image)
3018                 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3019                                COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3020         else
3021                 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3022                                               AT_FDCWD, runtime->ephemeral_copy,
3023                                               BTRFS_SNAPSHOT_FALLBACK_COPY |
3024                                               BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3025                                               BTRFS_SNAPSHOT_RECURSIVE |
3026                                               BTRFS_SNAPSHOT_LOCK_BSD);
3027         if (fd < 0)
3028                 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3029                                        context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3030
3031         if (context->root_image) {
3032                 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3033                  * which tends to not perform well in combination with lots of random writes.
3034                  *
3035                  * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3036                  * copy, but we at least want to make the intention clear.
3037                  */
3038                 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3039                 if (r < 0)
3040                         log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3041         }
3042
3043         r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3044         if (r < 0)
3045                 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3046
3047         return 1;
3048 }
3049
3050 static int verity_settings_prepare(
3051                 VeritySettings *verity,
3052                 const char *root_image,
3053                 const void *root_hash,
3054                 size_t root_hash_size,
3055                 const char *root_hash_path,
3056                 const void *root_hash_sig,
3057                 size_t root_hash_sig_size,
3058                 const char *root_hash_sig_path,
3059                 const char *verity_data_path) {
3060
3061         int r;
3062
3063         assert(verity);
3064
3065         if (root_hash) {
3066                 void *d;
3067
3068                 d = memdup(root_hash, root_hash_size);
3069                 if (!d)
3070                         return -ENOMEM;
3071
3072                 free_and_replace(verity->root_hash, d);
3073                 verity->root_hash_size = root_hash_size;
3074                 verity->designator = PARTITION_ROOT;
3075         }
3076
3077         if (root_hash_sig) {
3078                 void *d;
3079
3080                 d = memdup(root_hash_sig, root_hash_sig_size);
3081                 if (!d)
3082                         return -ENOMEM;
3083
3084                 free_and_replace(verity->root_hash_sig, d);
3085                 verity->root_hash_sig_size = root_hash_sig_size;
3086                 verity->designator = PARTITION_ROOT;
3087         }
3088
3089         if (verity_data_path) {
3090                 r = free_and_strdup(&verity->data_path, verity_data_path);
3091                 if (r < 0)
3092                         return r;
3093         }
3094
3095         r = verity_settings_load(
3096                         verity,
3097                         root_image,
3098                         root_hash_path,
3099                         root_hash_sig_path);
3100         if (r < 0)
3101                 return log_debug_errno(r, "Failed to load root hash: %m");
3102
3103         return 0;
3104 }
3105
3106 static int apply_mount_namespace(
3107                 const Unit *u,
3108                 ExecCommandFlags command_flags,
3109                 const ExecContext *context,
3110                 const ExecParameters *params,
3111                 ExecRuntime *runtime,
3112                 const char *memory_pressure_path,
3113                 char **error_path) {
3114
3115         _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3116         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3117                         **read_write_paths_cleanup = NULL;
3118         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3119                         *extension_dir = NULL, *host_os_release_stage = NULL;
3120         const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3121         char **read_write_paths;
3122         NamespaceInfo ns_info;
3123         bool needs_sandboxing, setup_os_release_symlink;
3124         BindMount *bind_mounts = NULL;
3125         size_t n_bind_mounts = 0;
3126         int r;
3127
3128         assert(context);
3129
3130         CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3131
3132         if (params->flags & EXEC_APPLY_CHROOT) {
3133                 r = setup_ephemeral(context, runtime);
3134                 if (r < 0)
3135                         return r;
3136
3137                 if (context->root_image)
3138                         root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3139                 else
3140                         root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3141         }
3142
3143         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3144         if (r < 0)
3145                 return r;
3146
3147         /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3148          * service will need to write to it in order to start the notifications. */
3149         if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3150                 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3151                 if (!read_write_paths_cleanup)
3152                         return -ENOMEM;
3153
3154                 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3155                 if (r < 0)
3156                         return r;
3157
3158                 read_write_paths = read_write_paths_cleanup;
3159         } else
3160                 read_write_paths = context->read_write_paths;
3161
3162         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3163         if (needs_sandboxing) {
3164                 /* The runtime struct only contains the parent of the private /tmp,
3165                  * which is non-accessible to world users. Inside of it there's a /tmp
3166                  * that is sticky, and that's the one we want to use here.
3167                  * This does not apply when we are using /run/systemd/empty as fallback. */
3168
3169                 if (context->private_tmp && runtime && runtime->shared) {
3170                         if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3171                                 tmp_dir = runtime->shared->tmp_dir;
3172                         else if (runtime->shared->tmp_dir)
3173                                 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3174
3175                         if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3176                                 var_tmp_dir = runtime->shared->var_tmp_dir;
3177                         else if (runtime->shared->var_tmp_dir)
3178                                 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3179                 }
3180
3181                 ns_info = (NamespaceInfo) {
3182                         .ignore_protect_paths = false,
3183                         .private_dev = context->private_devices,
3184                         .protect_control_groups = context->protect_control_groups,
3185                         .protect_kernel_tunables = context->protect_kernel_tunables,
3186                         .protect_kernel_modules = context->protect_kernel_modules,
3187                         .protect_kernel_logs = context->protect_kernel_logs,
3188                         .protect_hostname = context->protect_hostname,
3189                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3190                         .protect_home = context->protect_home,
3191                         .protect_system = context->protect_system,
3192                         .protect_proc = context->protect_proc,
3193                         .proc_subset = context->proc_subset,
3194                         .private_network = exec_needs_network_namespace(context),
3195                         .private_ipc = exec_needs_ipc_namespace(context),
3196                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3197                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3198                 };
3199         } else if (!context->dynamic_user && root_dir)
                /*
                 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
                 * sandbox info, otherwise enforce it, don't ignore protected paths and
                 * fail if we are unable to apply the sandbox inside the mount namespace.
                 */
3205                 ns_info = (NamespaceInfo) {
3206                         .ignore_protect_paths = true,
3207                 };
3208         else
3209                 ns_info = (NamespaceInfo) {};
3210
3211         /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3212         setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
3213         r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3214         if (r < 0)
3215                 return r;
3216
3217         if (context->mount_propagation_flag == MS_SHARED)
3218                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3219
3220         if (exec_context_has_credentials(context) &&
3221             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3222             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3223                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3224                 if (!creds_path)
3225                         return -ENOMEM;
3226         }
3227
3228         if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3229                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3230                 if (!propagate_dir)
3231                         return -ENOMEM;
3232
3233                 incoming_dir = strdup("/run/systemd/incoming");
3234                 if (!incoming_dir)
3235                         return -ENOMEM;
3236
3237                 extension_dir = strdup("/run/systemd/unit-extensions");
3238                 if (!extension_dir)
3239                         return -ENOMEM;
3240
3241                 /* If running under a different root filesystem, propagate the host's os-release. We make a
3242                  * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3243                 if (setup_os_release_symlink) {
3244                         host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3245                         if (!host_os_release_stage)
3246                                 return -ENOMEM;
3247                 }
3248         } else {
3249                 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3250
3251                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3252                         return -ENOMEM;
3253
3254                 if (setup_os_release_symlink) {
3255                         if (asprintf(&host_os_release_stage,
3256                                      "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3257                                      geteuid()) < 0)
3258                                 return -ENOMEM;
3259                 }
3260         }
3261
3262         if (root_image) {
3263                 r = verity_settings_prepare(
3264                         &verity,
3265                         root_image,
3266                         context->root_hash, context->root_hash_size, context->root_hash_path,
3267                         context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3268                         context->root_verity);
3269                 if (r < 0)
3270                         return r;
3271         }
3272
3273         r = setup_namespace(
3274                         root_dir,
3275                         root_image,
3276                         context->root_image_options,
3277                         context->root_image_policy ?: &image_policy_service,
3278                         &ns_info,
3279                         read_write_paths,
3280                         needs_sandboxing ? context->read_only_paths : NULL,
3281                         needs_sandboxing ? context->inaccessible_paths : NULL,
3282                         needs_sandboxing ? context->exec_paths : NULL,
3283                         needs_sandboxing ? context->no_exec_paths : NULL,
3284                         empty_directories,
3285                         symlinks,
3286                         bind_mounts,
3287                         n_bind_mounts,
3288                         context->temporary_filesystems,
3289                         context->n_temporary_filesystems,
3290                         context->mount_images,
3291                         context->n_mount_images,
3292                         context->mount_image_policy ?: &image_policy_service,
3293                         tmp_dir,
3294                         var_tmp_dir,
3295                         creds_path,
3296                         context->log_namespace,
3297                         context->mount_propagation_flag,
3298                         &verity,
3299                         context->extension_images,
3300                         context->n_extension_images,
3301                         context->extension_image_policy ?: &image_policy_sysext,
3302                         context->extension_directories,
3303                         propagate_dir,
3304                         incoming_dir,
3305                         extension_dir,
3306                         root_dir || root_image ? params->notify_socket : NULL,
3307                         host_os_release_stage,
3308                         error_path);
3309
3310         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3311          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3312          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3313          * completely different execution environment. */
3314         if (r == -ENOANO) {
3315                 if (insist_on_sandboxing(
3316                                     context,
3317                                     root_dir, root_image,
3318                                     bind_mounts,
3319                                     n_bind_mounts))
3320                         return log_unit_debug_errno(u,
3321                                                     SYNTHETIC_ERRNO(EOPNOTSUPP),
3322                                                     "Failed to set up namespace, and refusing to continue since "
3323                                                     "the selected namespacing options alter mount environment non-trivially.\n"
3324                                                     "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3325                                                     n_bind_mounts,
3326                                                     context->n_temporary_filesystems,
3327                                                     yes_no(root_dir),
3328                                                     yes_no(root_image),
3329                                                     yes_no(context->dynamic_user));
3330
3331                 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3332                 return 0;
3333         }
3334
3335         return r;
3336 }
3337
3338 static int apply_working_directory(
3339                 const ExecContext *context,
3340                 const ExecParameters *params,
3341                 ExecRuntime *runtime,
3342                 const char *home,
3343                 int *exit_status) {
3344
3345         const char *d, *wd;
3346
3347         assert(context);
3348         assert(exit_status);
3349
3350         if (context->working_directory_home) {
3351
3352                 if (!home) {
3353                         *exit_status = EXIT_CHDIR;
3354                         return -ENXIO;
3355                 }
3356
3357                 wd = home;
3358
3359         } else
3360                 wd = empty_to_root(context->working_directory);
3361
3362         if (params->flags & EXEC_APPLY_CHROOT)
3363                 d = wd;
3364         else
3365                 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3366
3367         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3368                 *exit_status = EXIT_CHDIR;
3369                 return -errno;
3370         }
3371
3372         return 0;
3373 }
3374
3375 static int apply_root_directory(
3376                 const ExecContext *context,
3377                 const ExecParameters *params,
3378                 ExecRuntime *runtime,
3379                 const bool needs_mount_ns,
3380                 int *exit_status) {
3381
3382         assert(context);
3383         assert(exit_status);
3384
3385         if (params->flags & EXEC_APPLY_CHROOT)
3386                 if (!needs_mount_ns && context->root_directory)
3387                         if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3388                                 *exit_status = EXIT_CHROOT;
3389                                 return -errno;
3390                         }
3391
3392         return 0;
3393 }
3394
3395 static int setup_keyring(
3396                 const Unit *u,
3397                 const ExecContext *context,
3398                 const ExecParameters *p,
3399                 uid_t uid, gid_t gid) {
3400
3401         key_serial_t keyring;
3402         int r = 0;
3403         uid_t saved_uid;
3404         gid_t saved_gid;
3405
3406         assert(u);
3407         assert(context);
3408         assert(p);
3409
3410         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3411          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3412          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3413          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3414          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3415          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3416
3417         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3418                 return 0;
3419
3420         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3421          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3422          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3423          * & group is just as nasty as acquiring a reference to the user keyring. */
3424
3425         saved_uid = getuid();
3426         saved_gid = getgid();
3427
3428         if (gid_is_valid(gid) && gid != saved_gid) {
3429                 if (setregid(gid, -1) < 0)
3430                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3431         }
3432
3433         if (uid_is_valid(uid) && uid != saved_uid) {
3434                 if (setreuid(uid, -1) < 0) {
3435                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3436                         goto out;
3437                 }
3438         }
3439
3440         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3441         if (keyring == -1) {
3442                 if (errno == ENOSYS)
3443                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3444                 else if (ERRNO_IS_PRIVILEGE(errno))
3445                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3446                 else if (errno == EDQUOT)
3447                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3448                 else
3449                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3450
3451                 goto out;
3452         }
3453
3454         /* When requested link the user keyring into the session keyring. */
3455         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3456
3457                 if (keyctl(KEYCTL_LINK,
3458                            KEY_SPEC_USER_KEYRING,
3459                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3460                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3461                         goto out;
3462                 }
3463         }
3464
3465         /* Restore uid/gid back */
3466         if (uid_is_valid(uid) && uid != saved_uid) {
3467                 if (setreuid(saved_uid, -1) < 0) {
3468                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3469                         goto out;
3470                 }
3471         }
3472
3473         if (gid_is_valid(gid) && gid != saved_gid) {
3474                 if (setregid(saved_gid, -1) < 0)
3475                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3476         }
3477
3478         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3479         if (!sd_id128_is_null(u->invocation_id)) {
3480                 key_serial_t key;
3481
3482                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3483                 if (key == -1)
3484                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3485                 else {
3486                         if (keyctl(KEYCTL_SETPERM, key,
3487                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3488                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3489                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3490                 }
3491         }
3492
3493 out:
3494         /* Revert back uid & gid for the last time, and exit */
3495         /* no extra logging, as only the first already reported error matters */
3496         if (getuid() != saved_uid)
3497                 (void) setreuid(saved_uid, -1);
3498
3499         if (getgid() != saved_gid)
3500                 (void) setregid(saved_gid, -1);
3501
3502         return r;
3503 }
3504
/* Appends each non-negative element of the two-element 'pair' to 'array', in order, advancing the
 * element counter *n for every value stored. Negative elements are skipped. The caller must make sure
 * 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                *n += 1;
        }
}
3515
3516 static int close_remaining_fds(
3517                 const ExecParameters *params,
3518                 const ExecRuntime *runtime,
3519                 int user_lookup_fd,
3520                 int socket_fd,
3521                 const int *fds, size_t n_fds) {
3522
3523         size_t n_dont_close = 0;
3524         int dont_close[n_fds + 14];
3525
3526         assert(params);
3527
3528         if (params->stdin_fd >= 0)
3529                 dont_close[n_dont_close++] = params->stdin_fd;
3530         if (params->stdout_fd >= 0)
3531                 dont_close[n_dont_close++] = params->stdout_fd;
3532         if (params->stderr_fd >= 0)
3533                 dont_close[n_dont_close++] = params->stderr_fd;
3534
3535         if (socket_fd >= 0)
3536                 dont_close[n_dont_close++] = socket_fd;
3537         if (n_fds > 0) {
3538                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3539                 n_dont_close += n_fds;
3540         }
3541
3542         if (runtime)
3543                 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3544
3545         if (runtime && runtime->shared) {
3546                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3547                 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3548         }
3549
3550         if (runtime && runtime->dynamic_creds) {
3551                 if (runtime->dynamic_creds->user)
3552                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3553                 if (runtime->dynamic_creds->group)
3554                         append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3555         }
3556
3557         if (user_lookup_fd >= 0)
3558                 dont_close[n_dont_close++] = user_lookup_fd;
3559
3560         return close_all_fds(dont_close, n_dont_close);
3561 }
3562
3563 static int send_user_lookup(
3564                 Unit *unit,
3565                 int user_lookup_fd,
3566                 uid_t uid,
3567                 gid_t gid) {
3568
3569         assert(unit);
3570
3571         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3572          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3573          * specified. */
3574
3575         if (user_lookup_fd < 0)
3576                 return 0;
3577
3578         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3579                 return 0;
3580
3581         if (writev(user_lookup_fd,
3582                (struct iovec[]) {
3583                            IOVEC_MAKE(&uid, sizeof(uid)),
3584                            IOVEC_MAKE(&gid, sizeof(gid)),
3585                            IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3586                 return -errno;
3587
3588         return 0;
3589 }
3590
3591 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3592         int r;
3593
3594         assert(c);
3595         assert(home);
3596         assert(buf);
3597
3598         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3599
3600         if (*home)
3601                 return 0;
3602
3603         if (!c->working_directory_home)
3604                 return 0;
3605
3606         r = get_home_dir(buf);
3607         if (r < 0)
3608                 return r;
3609
3610         *home = *buf;
3611         return 1;
3612 }
3613
3614 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3615         _cleanup_strv_free_ char ** list = NULL;
3616         int r;
3617
3618         assert(c);
3619         assert(p);
3620         assert(ret);
3621
3622         assert(c->dynamic_user);
3623
3624         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3625          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3626          * directories. */
3627
3628         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3629                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3630                         continue;
3631
3632                 if (!p->prefix[t])
3633                         continue;
3634
3635                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3636                         char *e;
3637
3638                         if (exec_directory_is_private(c, t))
3639                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3640                         else
3641                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3642                         if (!e)
3643                                 return -ENOMEM;
3644
3645                         r = strv_consume(&list, e);
3646                         if (r < 0)
3647                                 return r;
3648                 }
3649         }
3650
3651         *ret = TAKE_PTR(list);
3652
3653         return 0;
3654 }
3655
3656 static int exec_parameters_get_cgroup_path(
3657                 const ExecParameters *params,
3658                 const CGroupContext *c,
3659                 char **ret) {
3660
3661         const char *subgroup = NULL;
3662         char *p;
3663
3664         assert(params);
3665         assert(ret);
3666
3667         if (!params->cgroup_path)
3668                 return -EINVAL;
3669
3670         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3671          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3672          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3673          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3674          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3675          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3676          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3677          * flag, which is only passed for the former statements, not for the latter. */
3678
3679         if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3680                 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3681                         subgroup = ".control";
3682                 else
3683                         subgroup = c->delegate_subgroup;
3684         }
3685
3686         if (subgroup)
3687                 p = path_join(params->cgroup_path, subgroup);
3688         else
3689                 p = strdup(params->cgroup_path);
3690         if (!p)
3691                 return -ENOMEM;
3692
3693         *ret = p;
3694         return !!subgroup;
3695 }
3696
3697 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3698         _cleanup_(cpu_set_reset) CPUSet s = {};
3699         int r;
3700
3701         assert(c);
3702         assert(ret);
3703
3704         if (!c->numa_policy.nodes.set) {
3705                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3706                 return 0;
3707         }
3708
3709         r = numa_to_cpu_set(&c->numa_policy, &s);
3710         if (r < 0)
3711                 return r;
3712
3713         cpu_set_reset(ret);
3714
3715         return cpu_set_add_all(ret, &s);
3716 }
3717
3718 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3719         assert(c);
3720
3721         return c->cpu_affinity_from_numa;
3722 }
3723
3724 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3725         int r;
3726
3727         assert(fds);
3728         assert(n_fds);
3729         assert(*n_fds < fds_size);
3730         assert(ret_fd);
3731
3732         if (fd < 0) {
3733                 *ret_fd = -EBADF;
3734                 return 0;
3735         }
3736
3737         if (fd < 3 + (int) *n_fds) {
3738                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3739                  * the fds we pass to the process (or which are closed only during execve). */
3740
3741                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3742                 if (r < 0)
3743                         return -errno;
3744
3745                 close_and_replace(fd, r);
3746         }
3747
3748         *ret_fd = fds[*n_fds] = fd;
3749         (*n_fds) ++;
3750         return 1;
3751 }
3752
3753 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3754         union sockaddr_union addr = {
3755                 .un.sun_family = AF_UNIX,
3756         };
3757         socklen_t sa_len;
3758         static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3759         int r;
3760
3761         assert(u);
3762         assert(of);
3763         assert(ofd >= 0);
3764
3765         r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3766         if (r < 0)
3767                 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3768
3769         sa_len = r;
3770
3771         for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3772                 _cleanup_close_ int fd = -EBADF;
3773
3774                 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3775                 if (fd < 0)
3776                         return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3777
3778                 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3779                 if (r == -EPROTOTYPE)
3780                         continue;
3781                 if (r < 0)
3782                         return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3783
3784                 return TAKE_FD(fd);
3785         }
3786
3787         return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3788 }
3789
3790 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3791         struct stat st;
3792         _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3793
3794         assert(u);
3795         assert(of);
3796
3797         ofd = open(of->path, O_PATH | O_CLOEXEC);
3798         if (ofd < 0)
3799                 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3800
3801         if (fstat(ofd, &st) < 0)
3802                 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3803
3804         if (S_ISSOCK(st.st_mode)) {
3805                 fd = connect_unix_harder(u, of, ofd);
3806                 if (fd < 0)
3807                         return fd;
3808
3809                 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3810                         return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3811                                                     of->path);
3812
3813                 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3814         } else {
3815                 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3816                 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3817                         flags |= O_APPEND;
3818                 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3819                         flags |= O_TRUNC;
3820
3821                 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3822                 if (fd < 0)
3823                         return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3824
3825                 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3826         }
3827
3828         return TAKE_FD(fd);
3829 }
3830
3831 static int collect_open_file_fds(
3832                 Unit *u,
3833                 OpenFile* open_files,
3834                 int **fds,
3835                 char ***fdnames,
3836                 size_t *n_fds) {
3837         int r;
3838
3839         assert(u);
3840         assert(fds);
3841         assert(fdnames);
3842         assert(n_fds);
3843
3844         LIST_FOREACH(open_files, of, open_files) {
3845                 _cleanup_close_ int fd = -EBADF;
3846
3847                 fd = get_open_file_fd(u, of);
3848                 if (fd < 0) {
3849                         if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3850                                 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3851                                 continue;
3852                         }
3853
3854                         return fd;
3855                 }
3856
3857                 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3858                         return -ENOMEM;
3859
3860                 r = strv_extend(fdnames, of->fdname);
3861                 if (r < 0)
3862                         return r;
3863
3864                 (*fds)[*n_fds] = TAKE_FD(fd);
3865
3866                 (*n_fds)++;
3867         }
3868
3869         return 0;
3870 }
3871
3872 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3873         assert(unit);
3874         assert(msg);
3875         assert(executable);
3876
3877         if (!DEBUG_LOGGING)
3878                 return;
3879
3880         _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3881
3882         log_unit_struct(unit, LOG_DEBUG,
3883                         "EXECUTABLE=%s", executable,
3884                         LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3885                         LOG_UNIT_INVOCATION_ID(unit));
3886 }
3887
3888 static bool exec_context_need_unprivileged_private_users(
3889                 const ExecContext *context,
3890                 const ExecParameters *params) {
3891
3892         assert(context);
3893         assert(params);
3894
3895         /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3896          * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3897          * (system manager) then we have privileges and don't need this. */
3898         if (params->runtime_scope != RUNTIME_SCOPE_USER)
3899                 return false;
3900
3901         return context->private_users ||
3902                context->private_tmp ||
3903                context->private_devices ||
3904                context->private_network ||
3905                context->network_namespace_path ||
3906                context->private_ipc ||
3907                context->ipc_namespace_path ||
3908                context->private_mounts > 0 ||
3909                context->mount_apivfs ||
3910                context->n_bind_mounts > 0 ||
3911                context->n_temporary_filesystems > 0 ||
3912                context->root_directory ||
3913                !strv_isempty(context->extension_directories) ||
3914                context->protect_system != PROTECT_SYSTEM_NO ||
3915                context->protect_home != PROTECT_HOME_NO ||
3916                context->protect_kernel_tunables ||
3917                context->protect_kernel_modules ||
3918                context->protect_kernel_logs ||
3919                context->protect_control_groups ||
3920                context->protect_clock ||
3921                context->protect_hostname ||
3922                !strv_isempty(context->read_write_paths) ||
3923                !strv_isempty(context->read_only_paths) ||
3924                !strv_isempty(context->inaccessible_paths) ||
3925                !strv_isempty(context->exec_paths) ||
3926                !strv_isempty(context->no_exec_paths);
3927 }
3928
3929 static int exec_child(
3930                 Unit *unit,
3931                 const ExecCommand *command,
3932                 const ExecContext *context,
3933                 const ExecParameters *params,
3934                 ExecRuntime *runtime,
3935                 const CGroupContext *cgroup_context,
3936                 int socket_fd,
3937                 const int named_iofds[static 3],
3938                 int *params_fds,
3939                 size_t n_socket_fds,
3940                 size_t n_storage_fds,
3941                 char **files_env,
3942                 int user_lookup_fd,
3943                 int *exit_status) {
3944
3945         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3946         int r, ngids = 0, exec_fd;
3947         _cleanup_free_ gid_t *supplementary_gids = NULL;
3948         const char *username = NULL, *groupname = NULL;
3949         _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3950         const char *home = NULL, *shell = NULL;
3951         char **final_argv = NULL;
3952         dev_t journal_stream_dev = 0;
3953         ino_t journal_stream_ino = 0;
3954         bool userns_set_up = false;
3955         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3956                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3957                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3958                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3959 #if HAVE_SELINUX
3960         _cleanup_free_ char *mac_selinux_context_net = NULL;
3961         bool use_selinux = false;
3962 #endif
3963 #if ENABLE_SMACK
3964         bool use_smack = false;
3965 #endif
3966 #if HAVE_APPARMOR
3967         bool use_apparmor = false;
3968 #endif
3969         uid_t saved_uid = getuid();
3970         gid_t saved_gid = getgid();
3971         uid_t uid = UID_INVALID;
3972         gid_t gid = GID_INVALID;
3973         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3974                n_keep_fds; /* total number of fds not to close */
3975         int secure_bits;
3976         _cleanup_free_ gid_t *gids_after_pam = NULL;
3977         int ngids_after_pam = 0;
3978         _cleanup_free_ int *fds = NULL;
3979         _cleanup_strv_free_ char **fdnames = NULL;
3980
3981         assert(unit);
3982         assert(command);
3983         assert(context);
3984         assert(params);
3985         assert(exit_status);
3986
3987         /* Explicitly test for CVE-2021-4034 inspired invocations */
3988         assert(command->path);
3989         assert(!strv_isempty(command->argv));
3990
3991         rename_process_from_path(command->path);
3992
3993         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3994          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3995          * both of which will be demoted to SIG_DFL. */
3996         (void) default_signals(SIGNALS_CRASH_HANDLER,
3997                                SIGNALS_IGNORE);
3998
3999         if (context->ignore_sigpipe)
4000                 (void) ignore_signals(SIGPIPE);
4001
4002         r = reset_signal_mask();
4003         if (r < 0) {
4004                 *exit_status = EXIT_SIGNAL_MASK;
4005                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4006         }
4007
4008         if (params->idle_pipe)
4009                 do_idle_pipe_dance(params->idle_pipe);
4010
4011         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4012          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4013          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4014          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4015
4016         log_forget_fds();
4017         log_set_open_when_needed(true);
4018         log_settle_target();
4019
4020         /* In case anything used libc syslog(), close this here, too */
4021         closelog();
4022
4023         fds = newdup(int, params_fds, n_fds);
4024         if (!fds) {
4025                 *exit_status = EXIT_MEMORY;
4026                 return log_oom();
4027         }
4028
4029         fdnames = strv_copy((char**) params->fd_names);
4030         if (!fdnames) {
4031                 *exit_status = EXIT_MEMORY;
4032                 return log_oom();
4033         }
4034
4035         r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4036         if (r < 0) {
4037                 *exit_status = EXIT_FDS;
4038                 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4039         }
4040
4041         int keep_fds[n_fds + 3];
4042         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4043         n_keep_fds = n_fds;
4044
4045         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4046         if (r < 0) {
4047                 *exit_status = EXIT_FDS;
4048                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4049         }
4050
4051 #if HAVE_LIBBPF
4052         if (unit->manager->restrict_fs) {
4053                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4054                 if (bpf_map_fd < 0) {
4055                         *exit_status = EXIT_FDS;
4056                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4057                 }
4058
4059                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4060                 if (r < 0) {
4061                         *exit_status = EXIT_FDS;
4062                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4063                 }
4064         }
4065 #endif
4066
4067         r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4068         if (r < 0) {
4069                 *exit_status = EXIT_FDS;
4070                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4071         }
4072
4073         if (!context->same_pgrp &&
4074             setsid() < 0) {
4075                 *exit_status = EXIT_SETSID;
4076                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4077         }
4078
4079         exec_context_tty_reset(context, params);
4080
4081         if (unit_shall_confirm_spawn(unit)) {
4082                 _cleanup_free_ char *cmdline = NULL;
4083
4084                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4085                 if (!cmdline) {
4086                         *exit_status = EXIT_MEMORY;
4087                         return log_oom();
4088                 }
4089
4090                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4091                 if (r != CONFIRM_EXECUTE) {
4092                         if (r == CONFIRM_PRETEND_SUCCESS) {
4093                                 *exit_status = EXIT_SUCCESS;
4094                                 return 0;
4095                         }
4096
4097                         *exit_status = EXIT_CONFIRM;
4098                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4099                                                     "Execution cancelled by the user");
4100                 }
4101         }
4102
4103         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4104          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4105          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4106          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4107          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4108         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4109             setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4110                 *exit_status = EXIT_MEMORY;
4111                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4112         }
4113
4114         if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4115                 _cleanup_strv_free_ char **suggested_paths = NULL;
4116
4117                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4118                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4119                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4120                         *exit_status = EXIT_USER;
4121                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4122                 }
4123
4124                 r = compile_suggested_paths(context, params, &suggested_paths);
4125                 if (r < 0) {
4126                         *exit_status = EXIT_MEMORY;
4127                         return log_oom();
4128                 }
4129
4130                 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4131                 if (r < 0) {
4132                         *exit_status = EXIT_USER;
4133                         if (r == -EILSEQ)
4134                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4135                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4136                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4137                 }
4138
4139                 if (!uid_is_valid(uid)) {
4140                         *exit_status = EXIT_USER;
4141                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4142                 }
4143
4144                 if (!gid_is_valid(gid)) {
4145                         *exit_status = EXIT_USER;
4146                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4147                 }
4148
4149                 if (runtime->dynamic_creds->user)
4150                         username = runtime->dynamic_creds->user->name;
4151
4152         } else {
4153                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4154                 if (r < 0) {
4155                         *exit_status = EXIT_USER;
4156                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4157                 }
4158
4159                 r = get_fixed_group(context, &groupname, &gid);
4160                 if (r < 0) {
4161                         *exit_status = EXIT_GROUP;
4162                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4163                 }
4164         }
4165
4166         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4167         r = get_supplementary_groups(context, username, groupname, gid,
4168                                      &supplementary_gids, &ngids);
4169         if (r < 0) {
4170                 *exit_status = EXIT_GROUP;
4171                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4172         }
4173
4174         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4175         if (r < 0) {
4176                 *exit_status = EXIT_USER;
4177                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4178         }
4179
4180         user_lookup_fd = safe_close(user_lookup_fd);
4181
4182         r = acquire_home(context, uid, &home, &home_buffer);
4183         if (r < 0) {
4184                 *exit_status = EXIT_CHDIR;
4185                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4186         }
4187
4188         /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4189         if (socket_fd >= 0)
4190                 (void) fd_nonblock(socket_fd, false);
4191
4192         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4193          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4194         if (params->cgroup_path) {
4195                 _cleanup_free_ char *p = NULL;
4196
4197                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4198                 if (r < 0) {
4199                         *exit_status = EXIT_CGROUP;
4200                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4201                 }
4202
4203                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4204                 if (r == -EUCLEAN) {
4205                         *exit_status = EXIT_CGROUP;
4206                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4207                                                     "because the cgroup or one of its parents or "
4208                                                     "siblings is in the threaded mode: %m", p);
4209                 }
4210                 if (r < 0) {
4211                         *exit_status = EXIT_CGROUP;
4212                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4213                 }
4214         }
4215
4216         if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4217                 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4218                 if (r < 0) {
4219                         *exit_status = EXIT_NETWORK;
4220                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4221                 }
4222         }
4223
4224         if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4225                 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4226                 if (r < 0) {
4227                         *exit_status = EXIT_NAMESPACE;
4228                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4229                 }
4230         }
4231
4232         r = setup_input(context, params, socket_fd, named_iofds);
4233         if (r < 0) {
4234                 *exit_status = EXIT_STDIN;
4235                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4236         }
4237
4238         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4239         if (r < 0) {
4240                 *exit_status = EXIT_STDOUT;
4241                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4242         }
4243
4244         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4245         if (r < 0) {
4246                 *exit_status = EXIT_STDERR;
4247                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4248         }
4249
4250         if (context->oom_score_adjust_set) {
4251                 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4252                  * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4253                 r = set_oom_score_adjust(context->oom_score_adjust);
4254                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4255                         log_unit_debug_errno(unit, r,
4256                                              "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4257                 else if (r < 0) {
4258                         *exit_status = EXIT_OOM_ADJUST;
4259                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4260                 }
4261         }
4262
4263         if (context->coredump_filter_set) {
4264                 r = set_coredump_filter(context->coredump_filter);
4265                 if (ERRNO_IS_NEG_PRIVILEGE(r))
4266                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4267                 else if (r < 0) {
4268                         *exit_status = EXIT_LIMITS;
4269                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4270                 }
4271         }
4272
4273         if (context->nice_set) {
4274                 r = setpriority_closest(context->nice);
4275                 if (r < 0) {
4276                         *exit_status = EXIT_NICE;
4277                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4278                 }
4279         }
4280
4281         if (context->cpu_sched_set) {
4282                 struct sched_param param = {
4283                         .sched_priority = context->cpu_sched_priority,
4284                 };
4285
4286                 r = sched_setscheduler(0,
4287                                        context->cpu_sched_policy |
4288                                        (context->cpu_sched_reset_on_fork ?
4289                                         SCHED_RESET_ON_FORK : 0),
4290                                        &param);
4291                 if (r < 0) {
4292                         *exit_status = EXIT_SETSCHEDULER;
4293                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4294                 }
4295         }
4296
4297         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4298                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4299                 const CPUSet *cpu_set;
4300
4301                 if (context->cpu_affinity_from_numa) {
4302                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4303                         if (r < 0) {
4304                                 *exit_status = EXIT_CPUAFFINITY;
4305                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4306                         }
4307
4308                         cpu_set = &converted_cpu_set;
4309                 } else
4310                         cpu_set = &context->cpu_set;
4311
4312                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4313                         *exit_status = EXIT_CPUAFFINITY;
4314                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4315                 }
4316         }
4317
4318         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4319                 r = apply_numa_policy(&context->numa_policy);
4320                 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4321                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4322                 else if (r < 0) {
4323                         *exit_status = EXIT_NUMA_POLICY;
4324                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4325                 }
4326         }
4327
4328         if (context->ioprio_set)
4329                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4330                         *exit_status = EXIT_IOPRIO;
4331                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4332                 }
4333
4334         if (context->timer_slack_nsec != NSEC_INFINITY)
4335                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4336                         *exit_status = EXIT_TIMERSLACK;
4337                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4338                 }
4339
4340         if (context->personality != PERSONALITY_INVALID) {
4341                 r = safe_personality(context->personality);
4342                 if (r < 0) {
4343                         *exit_status = EXIT_PERSONALITY;
4344                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4345                 }
4346         }
4347
4348         if (context->utmp_id) {
4349                 const char *line = context->tty_path ?
4350                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4351                         NULL;
4352                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4353                                       line,
4354                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4355                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4356                                       USER_PROCESS,
4357                                       username);
4358         }
4359
4360         if (uid_is_valid(uid)) {
4361                 r = chown_terminal(STDIN_FILENO, uid);
4362                 if (r < 0) {
4363                         *exit_status = EXIT_STDIN;
4364                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4365                 }
4366         }
4367
4368         if (params->cgroup_path) {
4369                 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4370                  * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4371                  * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4372                  * touch a single hierarchy too. */
4373
4374                 if (params->flags & EXEC_CGROUP_DELEGATE) {
4375                         _cleanup_free_ char *p = NULL;
4376
4377                         r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4378                         if (r < 0) {
4379                                 *exit_status = EXIT_CGROUP;
4380                                 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4381                         }
4382
4383                         r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4384                         if (r < 0) {
4385                                 *exit_status = EXIT_CGROUP;
4386                                 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4387                         }
4388                         if (r > 0) {
4389                                 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4390                                 if (r < 0) {
4391                                         *exit_status = EXIT_CGROUP;
4392                                         return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4393                                 }
4394                         }
4395                 }
4396
4397                 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4398                         if (cgroup_context_want_memory_pressure(cgroup_context)) {
4399                                 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4400                                 if (r < 0) {
4401                                         *exit_status = EXIT_MEMORY;
4402                                         return log_oom();
4403                                 }
4404
4405                                 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4406                                 if (r < 0) {
4407                                         log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4408                                                             "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4409                                         memory_pressure_path = mfree(memory_pressure_path);
4410                                 }
4411                         } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4412                                 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4413                                 if (!memory_pressure_path) {
4414                                         *exit_status = EXIT_MEMORY;
4415                                         return log_oom();
4416                                 }
4417                         }
4418                 }
4419         }
4420
4421         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4422
4423         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4424                 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4425                 if (r < 0)
4426                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4427         }
4428
4429         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4430                 r = setup_credentials(context, params, unit->id, uid, gid);
4431                 if (r < 0) {
4432                         *exit_status = EXIT_CREDENTIALS;
4433                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4434                 }
4435         }
4436
4437         r = build_environment(
4438                         unit,
4439                         context,
4440                         params,
4441                         cgroup_context,
4442                         n_fds,
4443                         fdnames,
4444                         home,
4445                         username,
4446                         shell,
4447                         journal_stream_dev,
4448                         journal_stream_ino,
4449                         memory_pressure_path,
4450                         &our_env);
4451         if (r < 0) {
4452                 *exit_status = EXIT_MEMORY;
4453                 return log_oom();
4454         }
4455
4456         r = build_pass_environment(context, &pass_env);
4457         if (r < 0) {
4458                 *exit_status = EXIT_MEMORY;
4459                 return log_oom();
4460         }
4461
4462         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4463          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4464          * not specify PATH but the unit has ExecSearchPath. */
4465         if (!strv_isempty(context->exec_search_path)) {
4466                 _cleanup_free_ char *joined = NULL;
4467
4468                 joined = strv_join(context->exec_search_path, ":");
4469                 if (!joined) {
4470                         *exit_status = EXIT_MEMORY;
4471                         return log_oom();
4472                 }
4473
4474                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4475                 if (r < 0) {
4476                         *exit_status = EXIT_MEMORY;
4477                         return log_oom();
4478                 }
4479         }
4480
4481         accum_env = strv_env_merge(params->environment,
4482                                    our_env,
4483                                    joined_exec_search_path,
4484                                    pass_env,
4485                                    context->environment,
4486                                    files_env);
4487         if (!accum_env) {
4488                 *exit_status = EXIT_MEMORY;
4489                 return log_oom();
4490         }
4491         accum_env = strv_env_clean(accum_env);
4492
4493         (void) umask(context->umask);
4494
4495         r = setup_keyring(unit, context, params, uid, gid);
4496         if (r < 0) {
4497                 *exit_status = EXIT_KEYRING;
4498                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4499         }
4500
4501         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4502          * from it. */
4503         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4504
4505         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4506          * for it, and the kernel doesn't actually support ambient caps. */
4507         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4508
4509         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4510          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4511          * desired. */
4512         if (needs_ambient_hack)
4513                 needs_setuid = false;
4514         else
4515                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4516
4517         uint64_t capability_ambient_set = context->capability_ambient_set;
4518
4519         if (needs_sandboxing) {
4520                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4521                  * /sys being present. The actual MAC context application will happen later, as late as
4522                  * possible, to avoid impacting our own code paths. */
4523
4524 #if HAVE_SELINUX
4525                 use_selinux = mac_selinux_use();
4526 #endif
4527 #if ENABLE_SMACK
4528                 use_smack = mac_smack_use();
4529 #endif
4530 #if HAVE_APPARMOR
4531                 use_apparmor = mac_apparmor_use();
4532 #endif
4533         }
4534
4535         if (needs_sandboxing) {
4536                 int which_failed;
4537
4538                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4539                  * is set here. (See below.) */
4540
4541                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4542                 if (r < 0) {
4543                         *exit_status = EXIT_LIMITS;
4544                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4545                 }
4546         }
4547
4548         if (needs_setuid && context->pam_name && username) {
4549                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4550                  * wins here. (See above.) */
4551
4552                 /* All fds passed in the fds array will be closed in the pam child process. */
4553                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4554                 if (r < 0) {
4555                         *exit_status = EXIT_PAM;
4556                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4557                 }
4558
4559                 if (ambient_capabilities_supported()) {
4560                         uint64_t ambient_after_pam;
4561
4562                         /* PAM modules might have set some ambient caps. Query them here and merge them into
4563                          * the caps we want to set in the end, so that we don't end up unsetting them. */
4564                         r = capability_get_ambient(&ambient_after_pam);
4565                         if (r < 0) {
4566                                 *exit_status = EXIT_CAPABILITIES;
4567                                 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4568                         }
4569
4570                         capability_ambient_set |= ambient_after_pam;
4571                 }
4572
4573                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4574                 if (ngids_after_pam < 0) {
4575                         *exit_status = EXIT_MEMORY;
4576                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4577                 }
4578         }
4579
4580         if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4581                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4582                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4583                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4584
4585                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4586                 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4587                  * the actual requested operations fail (or silently continue). */
4588                 if (r < 0 && context->private_users) {
4589                         *exit_status = EXIT_USER;
4590                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4591                 }
4592                 if (r < 0)
4593                         log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4594                 else
4595                         userns_set_up = true;
4596         }
4597
4598         if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4599
4600                 /* Try to enable network namespacing if network namespacing is available and we have
4601                  * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4602                  * new network namespace. And if we don't have that, then we could only create a network
4603                  * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4604                 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4605                         r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4606                         if (ERRNO_IS_NEG_PRIVILEGE(r))
4607                                 log_unit_notice_errno(unit, r,
4608                                                       "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4609                         else if (r < 0) {
4610                                 *exit_status = EXIT_NETWORK;
4611                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4612                         }
4613                 } else if (context->network_namespace_path) {
4614                         *exit_status = EXIT_NETWORK;
4615                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4616                                                     "NetworkNamespacePath= is not supported, refusing.");
4617                 } else
4618                         log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4619         }
4620
4621         if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4622
4623                 if (ns_type_supported(NAMESPACE_IPC)) {
4624                         r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4625                         if (r == -EPERM)
4626                                 log_unit_warning_errno(unit, r,
4627                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4628                         else if (r < 0) {
4629                                 *exit_status = EXIT_NAMESPACE;
4630                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4631                         }
4632                 } else if (context->ipc_namespace_path) {
4633                         *exit_status = EXIT_NAMESPACE;
4634                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4635                                                     "IPCNamespacePath= is not supported, refusing.");
4636                 } else
4637                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4638         }
4639
4640         if (needs_mount_namespace) {
4641                 _cleanup_free_ char *error_path = NULL;
4642
4643                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4644                 if (r < 0) {
4645                         *exit_status = EXIT_NAMESPACE;
4646                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4647                                                     error_path ? ": " : "", strempty(error_path));
4648                 }
4649         }
4650
4651         if (needs_sandboxing) {
4652                 r = apply_protect_hostname(unit, context, exit_status);
4653                 if (r < 0)
4654                         return r;
4655         }
4656
4657         if (context->memory_ksm >= 0)
4658                 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4659                         if (ERRNO_IS_NOT_SUPPORTED(errno))
4660                                 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4661                         else {
4662                                 *exit_status = EXIT_KSM;
4663                                 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4664                         }
4665                 }
4666
4667         /* Drop groups as early as possible.
4668          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4669          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4670         if (needs_setuid) {
4671                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4672                 int ngids_to_enforce = 0;
4673
4674                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4675                                                    ngids,
4676                                                    gids_after_pam,
4677                                                    ngids_after_pam,
4678                                                    &gids_to_enforce);
4679                 if (ngids_to_enforce < 0) {
4680                         *exit_status = EXIT_MEMORY;
4681                         return log_unit_error_errno(unit,
4682                                                     ngids_to_enforce,
4683                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4684                 }
4685
4686                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4687                 if (r < 0) {
4688                         *exit_status = EXIT_GROUP;
4689                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4690                 }
4691         }
4692
4693         /* If the user namespace was not set up above, try to do it now.
4694          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4695          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4696          * case of mount namespaces being less privileged when the mount point list is copied from a
4697          * different user namespace). */
4698
4699         if (needs_sandboxing && context->private_users && !userns_set_up) {
4700                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4701                 if (r < 0) {
4702                         *exit_status = EXIT_USER;
4703                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4704                 }
4705         }
4706
4707         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4708          * shall execute. */
4709
4710         _cleanup_free_ char *executable = NULL;
4711         _cleanup_close_ int executable_fd = -EBADF;
4712         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4713         if (r < 0) {
4714                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4715                         log_unit_struct_errno(unit, LOG_INFO, r,
4716                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4717                                               LOG_UNIT_INVOCATION_ID(unit),
4718                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4719                                                                command->path),
4720                                               "EXECUTABLE=%s", command->path);
4721                         *exit_status = EXIT_SUCCESS;
4722                         return 0;
4723                 }
4724
4725                 *exit_status = EXIT_EXEC;
4726                 return log_unit_struct_errno(unit, LOG_INFO, r,
4727                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4728                                              LOG_UNIT_INVOCATION_ID(unit),
4729                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4730                                                               command->path),
4731                                              "EXECUTABLE=%s", command->path);
4732         }
4733
4734         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4735         if (r < 0) {
4736                 *exit_status = EXIT_FDS;
4737                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4738         }
4739
4740 #if HAVE_SELINUX
4741         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4742                 int fd = -EBADF;
4743
4744                 if (socket_fd >= 0)
4745                         fd = socket_fd;
4746                 else if (params->n_socket_fds == 1)
4747                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4748                          * use context from that fd to compute the label. */
4749                         fd = params->fds[0];
4750
4751                 if (fd >= 0) {
4752                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4753                         if (r < 0) {
4754                                 if (!context->selinux_context_ignore) {
4755                                         *exit_status = EXIT_SELINUX_CONTEXT;
4756                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4757                                 }
4758                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4759                         }
4760                 }
4761         }
4762 #endif
4763
4764         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4765          * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4766          * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4767          * execve(). */
4768
4769         r = close_all_fds(keep_fds, n_keep_fds);
4770         if (r >= 0)
4771                 r = shift_fds(fds, n_fds);
4772         if (r >= 0)
4773                 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4774         if (r < 0) {
4775                 *exit_status = EXIT_FDS;
4776                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4777         }
4778
4779         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4780          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4781          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4782          * came this far. */
4783
4784         secure_bits = context->secure_bits;
4785
4786         if (needs_sandboxing) {
4787                 uint64_t bset;
4788
4789                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4790                  * (Note this is placed after the general resource limit initialization, see above, in order
4791                  * to take precedence.) */
4792                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4793                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4794                                 *exit_status = EXIT_LIMITS;
4795                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4796                         }
4797                 }
4798
4799 #if ENABLE_SMACK
4800                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4801                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4802                 if (use_smack) {
4803                         r = setup_smack(unit->manager, context, executable_fd);
4804                         if (r < 0 && !context->smack_process_label_ignore) {
4805                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4806                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4807                         }
4808                 }
4809 #endif
4810
4811                 bset = context->capability_bounding_set;
4812                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4813                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4814                  * instead of us doing that */
4815                 if (needs_ambient_hack)
4816                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4817                                 (UINT64_C(1) << CAP_SETUID) |
4818                                 (UINT64_C(1) << CAP_SETGID);
4819
4820                 if (!cap_test_all(bset)) {
4821                         r = capability_bounding_set_drop(bset, /* right_now= */ false);
4822                         if (r < 0) {
4823                                 *exit_status = EXIT_CAPABILITIES;
4824                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4825                         }
4826                 }
4827
4828                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4829                  * keep-caps set.
4830                  *
4831                  * To be able to raise the ambient capabilities after setresuid() they have to be added to
4832                  * the inherited set and keep caps has to be set (done in enforce_user()).  After setresuid()
4833                  * the ambient capabilities can be raised as they are present in the permitted and
4834                  * inheritable set. However it is possible that someone wants to set ambient capabilities
4835                  * without changing the user, so we also set the ambient capabilities here.
4836                  *
4837                  * The requested ambient capabilities are raised in the inheritable set if the second
4838                  * argument is true. */
4839                 if (!needs_ambient_hack) {
4840                         r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4841                         if (r < 0) {
4842                                 *exit_status = EXIT_CAPABILITIES;
4843                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4844                         }
4845                 }
4846         }
4847
4848         /* chroot to root directory first, before we lose the ability to chroot */
4849         r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4850         if (r < 0)
4851                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4852
4853         if (needs_setuid) {
4854                 if (uid_is_valid(uid)) {
4855                         r = enforce_user(context, uid, capability_ambient_set);
4856                         if (r < 0) {
4857                                 *exit_status = EXIT_USER;
4858                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4859                         }
4860
4861                         if (!needs_ambient_hack && capability_ambient_set != 0) {
4862
4863                                 /* Raise the ambient capabilities after user change. */
4864                                 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4865                                 if (r < 0) {
4866                                         *exit_status = EXIT_CAPABILITIES;
4867                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4868                                 }
4869                         }
4870                 }
4871         }
4872
4873         /* Apply working directory here, because the working directory might be on NFS and only the user running
4874          * this service might have the correct privilege to change to the working directory */
4875         r = apply_working_directory(context, params, runtime, home, exit_status);
4876         if (r < 0)
4877                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4878
4879         if (needs_sandboxing) {
4880                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4881                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4882                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4883                  * are restricted. */
4884
4885 #if HAVE_SELINUX
4886                 if (use_selinux) {
4887                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4888
4889                         if (exec_context) {
4890                                 r = setexeccon(exec_context);
4891                                 if (r < 0) {
4892                                         if (!context->selinux_context_ignore) {
4893                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4894                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4895                                         }
4896                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4897                                 }
4898                         }
4899                 }
4900 #endif
4901
4902 #if HAVE_APPARMOR
4903                 if (use_apparmor && context->apparmor_profile) {
4904                         r = aa_change_onexec(context->apparmor_profile);
4905                         if (r < 0 && !context->apparmor_profile_ignore) {
4906                                 *exit_status = EXIT_APPARMOR_PROFILE;
4907                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4908                         }
4909                 }
4910 #endif
4911
4912                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4913                  * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4914                  * requires CAP_SETPCAP. */
4915                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4916                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4917                          * effective set here.
4918                          *
4919                          * The effective set is overwritten during execve() with the following values:
4920                          *
4921                          * - ambient set (for non-root processes)
4922                          *
4923                          * - (inheritable | bounding) set (for root processes)
4924                          *
4925                          * Hence there is no security impact to raise it in the effective set before execve
4926                          */
4927                         r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4928                         if (r < 0) {
4929                                 *exit_status = EXIT_CAPABILITIES;
4930                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4931                         }
4932                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4933                                 *exit_status = EXIT_SECUREBITS;
4934                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4935                         }
4936                 }
4937
4938                 if (context_has_no_new_privileges(context))
4939                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4940                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4941                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4942                         }
4943
4944 #if HAVE_SECCOMP
4945                 r = apply_address_families(unit, context);
4946                 if (r < 0) {
4947                         *exit_status = EXIT_ADDRESS_FAMILIES;
4948                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4949                 }
4950
4951                 r = apply_memory_deny_write_execute(unit, context);
4952                 if (r < 0) {
4953                         *exit_status = EXIT_SECCOMP;
4954                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4955                 }
4956
4957                 r = apply_restrict_realtime(unit, context);
4958                 if (r < 0) {
4959                         *exit_status = EXIT_SECCOMP;
4960                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4961                 }
4962
4963                 r = apply_restrict_suid_sgid(unit, context);
4964                 if (r < 0) {
4965                         *exit_status = EXIT_SECCOMP;
4966                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4967                 }
4968
4969                 r = apply_restrict_namespaces(unit, context);
4970                 if (r < 0) {
4971                         *exit_status = EXIT_SECCOMP;
4972                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4973                 }
4974
4975                 r = apply_protect_sysctl(unit, context);
4976                 if (r < 0) {
4977                         *exit_status = EXIT_SECCOMP;
4978                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4979                 }
4980
4981                 r = apply_protect_kernel_modules(unit, context);
4982                 if (r < 0) {
4983                         *exit_status = EXIT_SECCOMP;
4984                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4985                 }
4986
4987                 r = apply_protect_kernel_logs(unit, context);
4988                 if (r < 0) {
4989                         *exit_status = EXIT_SECCOMP;
4990                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4991                 }
4992
4993                 r = apply_protect_clock(unit, context);
4994                 if (r < 0) {
4995                         *exit_status = EXIT_SECCOMP;
4996                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4997                 }
4998
4999                 r = apply_private_devices(unit, context);
5000                 if (r < 0) {
5001                         *exit_status = EXIT_SECCOMP;
5002                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5003                 }
5004
5005                 r = apply_syscall_archs(unit, context);
5006                 if (r < 0) {
5007                         *exit_status = EXIT_SECCOMP;
5008                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5009                 }
5010
5011                 r = apply_lock_personality(unit, context);
5012                 if (r < 0) {
5013                         *exit_status = EXIT_SECCOMP;
5014                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5015                 }
5016
5017                 r = apply_syscall_log(unit, context);
5018                 if (r < 0) {
5019                         *exit_status = EXIT_SECCOMP;
5020                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5021                 }
5022
5023                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5024                  * by the filter as little as possible. */
5025                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5026                 if (r < 0) {
5027                         *exit_status = EXIT_SECCOMP;
5028                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5029                 }
5030 #endif
5031
5032 #if HAVE_LIBBPF
5033                 r = apply_restrict_filesystems(unit, context);
5034                 if (r < 0) {
5035                         *exit_status = EXIT_BPF;
5036                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5037                 }
5038 #endif
5039
5040         }
5041
5042         if (!strv_isempty(context->unset_environment)) {
5043                 char **ee = NULL;
5044
5045                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5046                 if (!ee) {
5047                         *exit_status = EXIT_MEMORY;
5048                         return log_oom();
5049                 }
5050
5051                 strv_free_and_replace(accum_env, ee);
5052         }
5053
5054         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5055                 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5056
5057                 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5058                 if (r < 0) {
5059                         *exit_status = EXIT_MEMORY;
5060                         return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5061                 }
5062                 final_argv = replaced_argv;
5063
5064                 if (!strv_isempty(unset_variables)) {
5065                         _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5066                         log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5067                 }
5068
5069                 if (!strv_isempty(bad_variables)) {
5070                         _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5071                         log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5072                 }
5073         } else
5074                 final_argv = command->argv;
5075
5076         log_command_line(unit, "Executing", executable, final_argv);
5077
5078         if (exec_fd >= 0) {
5079                 uint8_t hot = 1;
5080
5081                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5082                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5083
5084                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5085                         *exit_status = EXIT_EXEC;
5086                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5087                 }
5088         }
5089
5090         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5091
5092         if (exec_fd >= 0) {
5093                 uint8_t hot = 0;
5094
5095                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5096                  * that POLLHUP on it no longer means execve() succeeded. */
5097
5098                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5099                         *exit_status = EXIT_EXEC;
5100                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5101                 }
5102         }
5103
5104         *exit_status = EXIT_EXEC;
5105         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5106 }
5107
5108 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5109 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5110
5111 int exec_spawn(Unit *unit,
5112                ExecCommand *command,
5113                const ExecContext *context,
5114                const ExecParameters *params,
5115                ExecRuntime *runtime,
5116                const CGroupContext *cgroup_context,
5117                pid_t *ret) {
5118
5119         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5120         _cleanup_free_ char *subcgroup_path = NULL;
5121         _cleanup_strv_free_ char **files_env = NULL;
5122         size_t n_storage_fds = 0, n_socket_fds = 0;
5123         pid_t pid;
5124
5125         assert(unit);
5126         assert(command);
5127         assert(context);
5128         assert(ret);
5129         assert(params);
5130         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5131
5132         LOG_CONTEXT_PUSH_UNIT(unit);
5133
5134         if (context->std_input == EXEC_INPUT_SOCKET ||
5135             context->std_output == EXEC_OUTPUT_SOCKET ||
5136             context->std_error == EXEC_OUTPUT_SOCKET) {
5137
5138                 if (params->n_socket_fds > 1)
5139                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5140
5141                 if (params->n_socket_fds == 0)
5142                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5143
5144                 socket_fd = params->fds[0];
5145         } else {
5146                 socket_fd = -EBADF;
5147                 fds = params->fds;
5148                 n_socket_fds = params->n_socket_fds;
5149                 n_storage_fds = params->n_storage_fds;
5150         }
5151
5152         r = exec_context_named_iofds(context, params, named_iofds);
5153         if (r < 0)
5154                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5155
5156         r = exec_context_load_environment(unit, context, &files_env);
5157         if (r < 0)
5158                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5159
5160         /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5161            and, until the next SELinux policy changes, we save further reloads in future children. */
5162         mac_selinux_maybe_reload();
5163
5164         /* We won't know the real executable path until we create the mount namespace in the child, but we
5165            want to log from the parent, so we use the possibly inaccurate path here. */
5166         log_command_line(unit, "About to execute", command->path, command->argv);
5167
5168         if (params->cgroup_path) {
5169                 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5170                 if (r < 0)
5171                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5172                 if (r > 0) {
5173                         /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5174                          * realized by the unit logic) */
5175
5176                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5177                         if (r < 0)
5178                                 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5179                 }
5180         }
5181
5182         pid = fork();
5183         if (pid < 0)
5184                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5185
5186         if (pid == 0) {
5187                 int exit_status;
5188
5189                 r = exec_child(unit,
5190                                command,
5191                                context,
5192                                params,
5193                                runtime,
5194                                cgroup_context,
5195                                socket_fd,
5196                                named_iofds,
5197                                fds,
5198                                n_socket_fds,
5199                                n_storage_fds,
5200                                files_env,
5201                                unit->manager->user_lookup_fds[1],
5202                                &exit_status);
5203
5204                 if (r < 0) {
5205                         const char *status = ASSERT_PTR(
5206                                         exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
5207
5208                         log_unit_struct_errno(unit, LOG_ERR, r,
5209                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5210                                               LOG_UNIT_INVOCATION_ID(unit),
5211                                               LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5212                                                                status, command->path),
5213                                               "EXECUTABLE=%s", command->path);
5214                 } else
5215                         assert(exit_status == EXIT_SUCCESS);
5216
5217                 _exit(exit_status);
5218         }
5219
5220         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5221
5222         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5223          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5224          * process will be killed too). */
5225         if (subcgroup_path)
5226                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5227
5228         exec_status_start(&command->exec_status, pid);
5229
5230         *ret = pid;
5231         return 0;
5232 }
5233
5234 void exec_context_init(ExecContext *c) {
5235         assert(c);
5236
5237         c->umask = 0022;
5238         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5239         c->cpu_sched_policy = SCHED_OTHER;
5240         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5241         c->syslog_level_prefix = true;
5242         c->ignore_sigpipe = true;
5243         c->timer_slack_nsec = NSEC_INFINITY;
5244         c->personality = PERSONALITY_INVALID;
5245         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5246                 c->directories[t].mode = 0755;
5247         c->timeout_clean_usec = USEC_INFINITY;
5248         c->capability_bounding_set = CAP_MASK_UNSET;
5249         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5250         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5251         c->log_level_max = -1;
5252 #if HAVE_SECCOMP
5253         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5254 #endif
5255         c->tty_rows = UINT_MAX;
5256         c->tty_cols = UINT_MAX;
5257         numa_policy_reset(&c->numa_policy);
5258         c->private_mounts = -1;
5259         c->memory_ksm = -1;
5260 }
5261
5262 void exec_context_done(ExecContext *c) {
5263         assert(c);
5264
5265         c->environment = strv_free(c->environment);
5266         c->environment_files = strv_free(c->environment_files);
5267         c->pass_environment = strv_free(c->pass_environment);
5268         c->unset_environment = strv_free(c->unset_environment);
5269
5270         rlimit_free_all(c->rlimit);
5271
5272         for (size_t l = 0; l < 3; l++) {
5273                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5274                 c->stdio_file[l] = mfree(c->stdio_file[l]);
5275         }
5276
5277         c->working_directory = mfree(c->working_directory);
5278         c->root_directory = mfree(c->root_directory);
5279         c->root_image = mfree(c->root_image);
5280         c->root_image_options = mount_options_free_all(c->root_image_options);
5281         c->root_hash = mfree(c->root_hash);
5282         c->root_hash_size = 0;
5283         c->root_hash_path = mfree(c->root_hash_path);
5284         c->root_hash_sig = mfree(c->root_hash_sig);
5285         c->root_hash_sig_size = 0;
5286         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5287         c->root_verity = mfree(c->root_verity);
5288         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5289         c->extension_directories = strv_free(c->extension_directories);
5290         c->tty_path = mfree(c->tty_path);
5291         c->syslog_identifier = mfree(c->syslog_identifier);
5292         c->user = mfree(c->user);
5293         c->group = mfree(c->group);
5294
5295         c->supplementary_groups = strv_free(c->supplementary_groups);
5296
5297         c->pam_name = mfree(c->pam_name);
5298
5299         c->read_only_paths = strv_free(c->read_only_paths);
5300         c->read_write_paths = strv_free(c->read_write_paths);
5301         c->inaccessible_paths = strv_free(c->inaccessible_paths);
5302         c->exec_paths = strv_free(c->exec_paths);
5303         c->no_exec_paths = strv_free(c->no_exec_paths);
5304         c->exec_search_path = strv_free(c->exec_search_path);
5305
5306         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5307         c->bind_mounts = NULL;
5308         c->n_bind_mounts = 0;
5309         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5310         c->temporary_filesystems = NULL;
5311         c->n_temporary_filesystems = 0;
5312         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5313
5314         cpu_set_reset(&c->cpu_set);
5315         numa_policy_reset(&c->numa_policy);
5316
5317         c->utmp_id = mfree(c->utmp_id);
5318         c->selinux_context = mfree(c->selinux_context);
5319         c->apparmor_profile = mfree(c->apparmor_profile);
5320         c->smack_process_label = mfree(c->smack_process_label);
5321
5322         c->restrict_filesystems = set_free_free(c->restrict_filesystems);
5323
5324         c->syscall_filter = hashmap_free(c->syscall_filter);
5325         c->syscall_archs = set_free(c->syscall_archs);
5326         c->address_families = set_free(c->address_families);
5327
5328         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5329                 exec_directory_done(&c->directories[t]);
5330
5331         c->log_level_max = -1;
5332
5333         exec_context_free_log_extra_fields(c);
5334         c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5335         c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
5336
5337         c->log_ratelimit_interval_usec = 0;
5338         c->log_ratelimit_burst = 0;
5339
5340         c->stdin_data = mfree(c->stdin_data);
5341         c->stdin_data_size = 0;
5342
5343         c->network_namespace_path = mfree(c->network_namespace_path);
5344         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5345
5346         c->log_namespace = mfree(c->log_namespace);
5347
5348         c->load_credentials = hashmap_free(c->load_credentials);
5349         c->set_credentials = hashmap_free(c->set_credentials);
5350         c->import_credentials = set_free_free(c->import_credentials);
5351
5352         c->root_image_policy = image_policy_free(c->root_image_policy);
5353         c->mount_image_policy = image_policy_free(c->mount_image_policy);
5354         c->extension_image_policy = image_policy_free(c->extension_image_policy);
5355 }
5356
5357 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5358         assert(c);
5359
5360         if (!runtime_prefix)
5361                 return 0;
5362
5363         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5364                 _cleanup_free_ char *p = NULL;
5365
5366                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5367                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5368                 else
5369                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5370                 if (!p)
5371                         return -ENOMEM;
5372
5373                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5374                  * service next. */
5375                 (void) rm_rf(p, REMOVE_ROOT);
5376
5377                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5378                         _cleanup_free_ char *symlink_abs = NULL;
5379
5380                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5381                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5382                         else
5383                                 symlink_abs = path_join(runtime_prefix, *symlink);
5384                         if (!symlink_abs)
5385                                 return -ENOMEM;
5386
5387                         (void) unlink(symlink_abs);
5388                 }
5389         }
5390
5391         return 0;
5392 }
5393
5394 int exec_context_destroy_mount_ns_dir(Unit *u) {
5395         _cleanup_free_ char *p = NULL;
5396
5397         if (!u || !MANAGER_IS_SYSTEM(u->manager))
5398                 return 0;
5399
5400         p = path_join("/run/systemd/propagate/", u->id);
5401         if (!p)
5402                 return -ENOMEM;
5403
5404         /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5405         if (rmdir(p) < 0 && errno != ENOENT)
5406                 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5407
5408         return 0;
5409 }
5410
5411 static void exec_command_done(ExecCommand *c) {
5412         assert(c);
5413
5414         c->path = mfree(c->path);
5415         c->argv = strv_free(c->argv);
5416 }
5417
5418 void exec_command_done_array(ExecCommand *c, size_t n) {
5419         for (size_t i = 0; i < n; i++)
5420                 exec_command_done(c+i);
5421 }
5422
5423 ExecCommand* exec_command_free_list(ExecCommand *c) {
5424         ExecCommand *i;
5425
5426         while ((i = LIST_POP(command, c))) {
5427                 exec_command_done(i);
5428                 free(i);
5429         }
5430
5431         return NULL;
5432 }
5433
5434 void exec_command_free_array(ExecCommand **c, size_t n) {
5435         for (size_t i = 0; i < n; i++)
5436                 c[i] = exec_command_free_list(c[i]);
5437 }
5438
5439 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5440         for (size_t i = 0; i < n; i++)
5441                 exec_status_reset(&c[i].exec_status);
5442 }
5443
5444 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5445         for (size_t i = 0; i < n; i++)
5446                 LIST_FOREACH(command, z, c[i])
5447                         exec_status_reset(&z->exec_status);
5448 }
5449
/* Context handed as userdata to invalid_env(), so that a rejected environment assignment can be logged
 * together with the unit and the file it came from. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file that contained the assignment */
} InvalidEnvInfo;
5454
5455 static void invalid_env(const char *p, void *userdata) {
5456         InvalidEnvInfo *info = userdata;
5457
5458         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5459 }
5460
5461 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5462         assert(c);
5463
5464         switch (fd_index) {
5465
5466         case STDIN_FILENO:
5467                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5468                         return NULL;
5469
5470                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5471
5472         case STDOUT_FILENO:
5473                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5474                         return NULL;
5475
5476                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5477
5478         case STDERR_FILENO:
5479                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5480                         return NULL;
5481
5482                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5483
5484         default:
5485                 return NULL;
5486         }
5487 }
5488
5489 static int exec_context_named_iofds(
5490                 const ExecContext *c,
5491                 const ExecParameters *p,
5492                 int named_iofds[static 3]) {
5493
5494         size_t targets;
5495         const char* stdio_fdname[3];
5496         size_t n_fds;
5497
5498         assert(c);
5499         assert(p);
5500         assert(named_iofds);
5501
5502         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5503                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5504                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5505
5506         for (size_t i = 0; i < 3; i++)
5507                 stdio_fdname[i] = exec_context_fdname(c, i);
5508
5509         n_fds = p->n_storage_fds + p->n_socket_fds;
5510
5511         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5512                 if (named_iofds[STDIN_FILENO] < 0 &&
5513                     c->std_input == EXEC_INPUT_NAMED_FD &&
5514                     stdio_fdname[STDIN_FILENO] &&
5515                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5516
5517                         named_iofds[STDIN_FILENO] = p->fds[i];
5518                         targets--;
5519
5520                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5521                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5522                            stdio_fdname[STDOUT_FILENO] &&
5523                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5524
5525                         named_iofds[STDOUT_FILENO] = p->fds[i];
5526                         targets--;
5527
5528                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5529                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5530                            stdio_fdname[STDERR_FILENO] &&
5531                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5532
5533                         named_iofds[STDERR_FILENO] = p->fds[i];
5534                         targets--;
5535                 }
5536
5537         return targets == 0 ? 0 : -ENOENT;
5538 }
5539
5540 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5541         _cleanup_strv_free_ char **v = NULL;
5542         int r;
5543
5544         assert(c);
5545         assert(ret);
5546
5547         STRV_FOREACH(i, c->environment_files) {
5548                 _cleanup_globfree_ glob_t pglob = {};
5549                 bool ignore = false;
5550                 char *fn = *i;
5551
5552                 if (fn[0] == '-') {
5553                         ignore = true;
5554                         fn++;
5555                 }
5556
5557                 if (!path_is_absolute(fn)) {
5558                         if (ignore)
5559                                 continue;
5560                         return -EINVAL;
5561                 }
5562
5563                 /* Filename supports globbing, take all matching files */
5564                 r = safe_glob(fn, 0, &pglob);
5565                 if (r < 0) {
5566                         if (ignore)
5567                                 continue;
5568                         return r;
5569                 }
5570
5571                 /* When we don't match anything, -ENOENT should be returned */
5572                 assert(pglob.gl_pathc > 0);
5573
5574                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5575                         _cleanup_strv_free_ char **p = NULL;
5576
5577                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5578                         if (r < 0) {
5579                                 if (ignore)
5580                                         continue;
5581                                 return r;
5582                         }
5583
5584                         /* Log invalid environment variables with filename */
5585                         if (p) {
5586                                 InvalidEnvInfo info = {
5587                                         .unit = unit,
5588                                         .path = pglob.gl_pathv[n]
5589                                 };
5590
5591                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5592                         }
5593
5594                         if (!v)
5595                                 v = TAKE_PTR(p);
5596                         else {
5597                                 char **m = strv_env_merge(v, p);
5598                                 if (!m)
5599                                         return -ENOMEM;
5600
5601                                 strv_free_and_replace(v, m);
5602                         }
5603                 }
5604         }
5605
5606         *ret = TAKE_PTR(v);
5607
5608         return 0;
5609 }
5610
5611 static bool tty_may_match_dev_console(const char *tty) {
5612         _cleanup_free_ char *resolved = NULL;
5613
5614         if (!tty)
5615                 return true;
5616
5617         tty = skip_dev_prefix(tty);
5618
5619         /* trivial identity? */
5620         if (streq(tty, "console"))
5621                 return true;
5622
5623         if (resolve_dev_console(&resolved) < 0)
5624                 return true; /* if we could not resolve, assume it may */
5625
5626         /* "tty0" means the active VC, so it may be the same sometimes */
5627         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5628 }
5629
5630 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5631         assert(ec);
5632
5633         return ec->tty_reset ||
5634                 ec->tty_vhangup ||
5635                 ec->tty_vt_disallocate ||
5636                 is_terminal_input(ec->std_input) ||
5637                 is_terminal_output(ec->std_output) ||
5638                 is_terminal_output(ec->std_error);
5639 }
5640
5641 bool exec_context_may_touch_console(const ExecContext *ec) {
5642
5643         return exec_context_may_touch_tty(ec) &&
5644                tty_may_match_dev_console(exec_context_tty_path(ec));
5645 }
5646
5647 static void strv_fprintf(FILE *f, char **l) {
5648         assert(f);
5649
5650         STRV_FOREACH(g, l)
5651                 fprintf(f, " %s", *g);
5652 }
5653
/* Writes a line of the form "<prefix><name>: <s1> <s2> ..." to 'f'. Nothing is written if the string
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5665
5666 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5667         int r;
5668
5669         assert(c);
5670         assert(f);
5671
5672         prefix = strempty(prefix);
5673
5674         fprintf(f,
5675                 "%sUMask: %04o\n"
5676                 "%sWorkingDirectory: %s\n"
5677                 "%sRootDirectory: %s\n"
5678                 "%sRootEphemeral: %s\n"
5679                 "%sNonBlocking: %s\n"
5680                 "%sPrivateTmp: %s\n"
5681                 "%sPrivateDevices: %s\n"
5682                 "%sProtectKernelTunables: %s\n"
5683                 "%sProtectKernelModules: %s\n"
5684                 "%sProtectKernelLogs: %s\n"
5685                 "%sProtectClock: %s\n"
5686                 "%sProtectControlGroups: %s\n"
5687                 "%sPrivateNetwork: %s\n"
5688                 "%sPrivateUsers: %s\n"
5689                 "%sProtectHome: %s\n"
5690                 "%sProtectSystem: %s\n"
5691                 "%sMountAPIVFS: %s\n"
5692                 "%sIgnoreSIGPIPE: %s\n"
5693                 "%sMemoryDenyWriteExecute: %s\n"
5694                 "%sRestrictRealtime: %s\n"
5695                 "%sRestrictSUIDSGID: %s\n"
5696                 "%sKeyringMode: %s\n"
5697                 "%sProtectHostname: %s\n"
5698                 "%sProtectProc: %s\n"
5699                 "%sProcSubset: %s\n",
5700                 prefix, c->umask,
5701                 prefix, empty_to_root(c->working_directory),
5702                 prefix, empty_to_root(c->root_directory),
5703                 prefix, yes_no(c->root_ephemeral),
5704                 prefix, yes_no(c->non_blocking),
5705                 prefix, yes_no(c->private_tmp),
5706                 prefix, yes_no(c->private_devices),
5707                 prefix, yes_no(c->protect_kernel_tunables),
5708                 prefix, yes_no(c->protect_kernel_modules),
5709                 prefix, yes_no(c->protect_kernel_logs),
5710                 prefix, yes_no(c->protect_clock),
5711                 prefix, yes_no(c->protect_control_groups),
5712                 prefix, yes_no(c->private_network),
5713                 prefix, yes_no(c->private_users),
5714                 prefix, protect_home_to_string(c->protect_home),
5715                 prefix, protect_system_to_string(c->protect_system),
5716                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5717                 prefix, yes_no(c->ignore_sigpipe),
5718                 prefix, yes_no(c->memory_deny_write_execute),
5719                 prefix, yes_no(c->restrict_realtime),
5720                 prefix, yes_no(c->restrict_suid_sgid),
5721                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5722                 prefix, yes_no(c->protect_hostname),
5723                 prefix, protect_proc_to_string(c->protect_proc),
5724                 prefix, proc_subset_to_string(c->proc_subset));
5725
5726         if (c->root_image)
5727                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5728
5729         if (c->root_image_options) {
5730                 fprintf(f, "%sRootImageOptions:", prefix);
5731                 LIST_FOREACH(mount_options, o, c->root_image_options)
5732                         if (!isempty(o->options))
5733                                 fprintf(f, " %s:%s",
5734                                         partition_designator_to_string(o->partition_designator),
5735                                         o->options);
5736                 fprintf(f, "\n");
5737         }
5738
5739         if (c->root_hash) {
5740                 _cleanup_free_ char *encoded = NULL;
5741                 encoded = hexmem(c->root_hash, c->root_hash_size);
5742                 if (encoded)
5743                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5744         }
5745
5746         if (c->root_hash_path)
5747                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5748
5749         if (c->root_hash_sig) {
5750                 _cleanup_free_ char *encoded = NULL;
5751                 ssize_t len;
5752                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5753                 if (len)
5754                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5755         }
5756
5757         if (c->root_hash_sig_path)
5758                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5759
5760         if (c->root_verity)
5761                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5762
5763         STRV_FOREACH(e, c->environment)
5764                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5765
5766         STRV_FOREACH(e, c->environment_files)
5767                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5768
5769         STRV_FOREACH(e, c->pass_environment)
5770                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5771
5772         STRV_FOREACH(e, c->unset_environment)
5773                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5774
5775         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5776
5777         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5778                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5779
5780                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5781                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5782
5783                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5784                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5785                 }
5786         }
5787
5788         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5789
5790         if (c->nice_set)
5791                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5792
5793         if (c->oom_score_adjust_set)
5794                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5795
5796         if (c->coredump_filter_set)
5797                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5798
5799         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5800                 if (c->rlimit[i]) {
5801                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5802                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5803                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5804                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5805                 }
5806
5807         if (c->ioprio_set) {
5808                 _cleanup_free_ char *class_str = NULL;
5809
5810                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5811                 if (r >= 0)
5812                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5813
5814                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5815         }
5816
5817         if (c->cpu_sched_set) {
5818                 _cleanup_free_ char *policy_str = NULL;
5819
5820                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5821                 if (r >= 0)
5822                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5823
5824                 fprintf(f,
5825                         "%sCPUSchedulingPriority: %i\n"
5826                         "%sCPUSchedulingResetOnFork: %s\n",
5827                         prefix, c->cpu_sched_priority,
5828                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5829         }
5830
5831         if (c->cpu_set.set) {
5832                 _cleanup_free_ char *affinity = NULL;
5833
5834                 affinity = cpu_set_to_range_string(&c->cpu_set);
5835                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5836         }
5837
5838         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5839                 _cleanup_free_ char *nodes = NULL;
5840
5841                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5842                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5843                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5844         }
5845
5846         if (c->timer_slack_nsec != NSEC_INFINITY)
5847                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5848
5849         fprintf(f,
5850                 "%sStandardInput: %s\n"
5851                 "%sStandardOutput: %s\n"
5852                 "%sStandardError: %s\n",
5853                 prefix, exec_input_to_string(c->std_input),
5854                 prefix, exec_output_to_string(c->std_output),
5855                 prefix, exec_output_to_string(c->std_error));
5856
5857         if (c->std_input == EXEC_INPUT_NAMED_FD)
5858                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5859         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5860                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5861         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5862                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5863
5864         if (c->std_input == EXEC_INPUT_FILE)
5865                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5866         if (c->std_output == EXEC_OUTPUT_FILE)
5867                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5868         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5869                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5870         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5871                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5872         if (c->std_error == EXEC_OUTPUT_FILE)
5873                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5874         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5875                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5876         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5877                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5878
5879         if (c->tty_path)
5880                 fprintf(f,
5881                         "%sTTYPath: %s\n"
5882                         "%sTTYReset: %s\n"
5883                         "%sTTYVHangup: %s\n"
5884                         "%sTTYVTDisallocate: %s\n"
5885                         "%sTTYRows: %u\n"
5886                         "%sTTYColumns: %u\n",
5887                         prefix, c->tty_path,
5888                         prefix, yes_no(c->tty_reset),
5889                         prefix, yes_no(c->tty_vhangup),
5890                         prefix, yes_no(c->tty_vt_disallocate),
5891                         prefix, c->tty_rows,
5892                         prefix, c->tty_cols);
5893
5894         if (IN_SET(c->std_output,
5895                    EXEC_OUTPUT_KMSG,
5896                    EXEC_OUTPUT_JOURNAL,
5897                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5898                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5899             IN_SET(c->std_error,
5900                    EXEC_OUTPUT_KMSG,
5901                    EXEC_OUTPUT_JOURNAL,
5902                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5903                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5904
5905                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5906
5907                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5908                 if (r >= 0)
5909                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5910
5911                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5912                 if (r >= 0)
5913                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5914         }
5915
5916         if (c->log_level_max >= 0) {
5917                 _cleanup_free_ char *t = NULL;
5918
5919                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5920
5921                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5922         }
5923
5924         if (c->log_ratelimit_interval_usec > 0)
5925                 fprintf(f,
5926                         "%sLogRateLimitIntervalSec: %s\n",
5927                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5928
5929         if (c->log_ratelimit_burst > 0)
5930                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5931
5932         if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5933                 fprintf(f, "%sLogFilterPatterns:", prefix);
5934
5935                 char *pattern;
5936                 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5937                         fprintf(f, " %s", pattern);
5938                 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5939                         fprintf(f, " ~%s", pattern);
5940                 fputc('\n', f);
5941         }
5942
5943         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5944                 fprintf(f, "%sLogExtraFields: ", prefix);
5945                 fwrite(c->log_extra_fields[j].iov_base,
5946                        1, c->log_extra_fields[j].iov_len,
5947                        f);
5948                 fputc('\n', f);
5949         }
5950
5951         if (c->log_namespace)
5952                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5953
5954         if (c->secure_bits) {
5955                 _cleanup_free_ char *str = NULL;
5956
5957                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5958                 if (r >= 0)
5959                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5960         }
5961
5962         if (c->capability_bounding_set != CAP_MASK_UNSET) {
5963                 _cleanup_free_ char *str = NULL;
5964
5965                 r = capability_set_to_string(c->capability_bounding_set, &str);
5966                 if (r >= 0)
5967                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5968         }
5969
5970         if (c->capability_ambient_set != 0) {
5971                 _cleanup_free_ char *str = NULL;
5972
5973                 r = capability_set_to_string(c->capability_ambient_set, &str);
5974                 if (r >= 0)
5975                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5976         }
5977
5978         if (c->user)
5979                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5980         if (c->group)
5981                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5982
5983         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5984
5985         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5986
5987         if (c->pam_name)
5988                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5989
5990         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5991         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5992         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5993         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5994         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5995         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
5996
5997         for (size_t i = 0; i < c->n_bind_mounts; i++)
5998                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5999                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6000                         c->bind_mounts[i].ignore_enoent ? "-": "",
6001                         c->bind_mounts[i].source,
6002                         c->bind_mounts[i].destination,
6003                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6004
6005         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6006                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6007
6008                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6009                         t->path,
6010                         isempty(t->options) ? "" : ":",
6011                         strempty(t->options));
6012         }
6013
6014         if (c->utmp_id)
6015                 fprintf(f,
6016                         "%sUtmpIdentifier: %s\n",
6017                         prefix, c->utmp_id);
6018
6019         if (c->selinux_context)
6020                 fprintf(f,
6021                         "%sSELinuxContext: %s%s\n",
6022                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6023
6024         if (c->apparmor_profile)
6025                 fprintf(f,
6026                         "%sAppArmorProfile: %s%s\n",
6027                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6028
6029         if (c->smack_process_label)
6030                 fprintf(f,
6031                         "%sSmackProcessLabel: %s%s\n",
6032                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6033
6034         if (c->personality != PERSONALITY_INVALID)
6035                 fprintf(f,
6036                         "%sPersonality: %s\n",
6037                         prefix, strna(personality_to_string(c->personality)));
6038
6039         fprintf(f,
6040                 "%sLockPersonality: %s\n",
6041                 prefix, yes_no(c->lock_personality));
6042
6043         if (c->syscall_filter) {
6044                 fprintf(f,
6045                         "%sSystemCallFilter: ",
6046                         prefix);
6047
6048                 if (!c->syscall_allow_list)
6049                         fputc('~', f);
6050
6051 #if HAVE_SECCOMP
6052                 void *id, *val;
6053                 bool first = true;
6054                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6055                         _cleanup_free_ char *name = NULL;
6056                         const char *errno_name = NULL;
6057                         int num = PTR_TO_INT(val);
6058
6059                         if (first)
6060                                 first = false;
6061                         else
6062                                 fputc(' ', f);
6063
6064                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6065                         fputs(strna(name), f);
6066
6067                         if (num >= 0) {
6068                                 errno_name = seccomp_errno_or_action_to_string(num);
6069                                 if (errno_name)
6070                                         fprintf(f, ":%s", errno_name);
6071                                 else
6072                                         fprintf(f, ":%d", num);
6073                         }
6074                 }
6075 #endif
6076
6077                 fputc('\n', f);
6078         }
6079
6080         if (c->syscall_archs) {
6081                 fprintf(f,
6082                         "%sSystemCallArchitectures:",
6083                         prefix);
6084
6085 #if HAVE_SECCOMP
6086                 void *id;
6087                 SET_FOREACH(id, c->syscall_archs)
6088                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6089 #endif
6090                 fputc('\n', f);
6091         }
6092
6093         if (exec_context_restrict_namespaces_set(c)) {
6094                 _cleanup_free_ char *s = NULL;
6095
6096                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6097                 if (r >= 0)
6098                         fprintf(f, "%sRestrictNamespaces: %s\n",
6099                                 prefix, strna(s));
6100         }
6101
6102 #if HAVE_LIBBPF
6103         if (exec_context_restrict_filesystems_set(c)) {
6104                 char *fs;
6105                 SET_FOREACH(fs, c->restrict_filesystems)
6106                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6107         }
6108 #endif
6109
6110         if (c->network_namespace_path)
6111                 fprintf(f,
6112                         "%sNetworkNamespacePath: %s\n",
6113                         prefix, c->network_namespace_path);
6114
6115         if (c->syscall_errno > 0) {
6116                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6117
6118 #if HAVE_SECCOMP
6119                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6120                 if (errno_name)
6121                         fputs(errno_name, f);
6122                 else
6123                         fprintf(f, "%d", c->syscall_errno);
6124 #endif
6125                 fputc('\n', f);
6126         }
6127
6128         for (size_t i = 0; i < c->n_mount_images; i++) {
6129                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6130                         c->mount_images[i].ignore_enoent ? "-": "",
6131                         c->mount_images[i].source,
6132                         c->mount_images[i].destination);
6133                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6134                         fprintf(f, ":%s:%s",
6135                                 partition_designator_to_string(o->partition_designator),
6136                                 strempty(o->options));
6137                 fprintf(f, "\n");
6138         }
6139
6140         for (size_t i = 0; i < c->n_extension_images; i++) {
6141                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6142                         c->extension_images[i].ignore_enoent ? "-": "",
6143                         c->extension_images[i].source);
6144                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6145                         fprintf(f, ":%s:%s",
6146                                 partition_designator_to_string(o->partition_designator),
6147                                 strempty(o->options));
6148                 fprintf(f, "\n");
6149         }
6150
6151         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6152 }
6153
6154 bool exec_context_maintains_privileges(const ExecContext *c) {
6155         assert(c);
6156
6157         /* Returns true if the process forked off would run under
6158          * an unchanged UID or as root. */
6159
6160         if (!c->user)
6161                 return true;
6162
6163         if (streq(c->user, "root") || streq(c->user, "0"))
6164                 return true;
6165
6166         return false;
6167 }
6168
6169 int exec_context_get_effective_ioprio(const ExecContext *c) {
6170         int p;
6171
6172         assert(c);
6173
6174         if (c->ioprio_set)
6175                 return c->ioprio;
6176
6177         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6178         if (p < 0)
6179                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6180
6181         return ioprio_normalize(p);
6182 }
6183
6184 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6185         assert(c);
6186
6187         /* Explicit setting wins */
6188         if (c->mount_apivfs_set)
6189                 return c->mount_apivfs;
6190
6191         /* Default to "yes" if root directory or image are specified */
6192         if (exec_context_with_rootfs(c))
6193                 return true;
6194
6195         return false;
6196 }
6197
6198 void exec_context_free_log_extra_fields(ExecContext *c) {
6199         assert(c);
6200
6201         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6202                 free(c->log_extra_fields[l].iov_base);
6203         c->log_extra_fields = mfree(c->log_extra_fields);
6204         c->n_log_extra_fields = 0;
6205 }
6206
6207 void exec_context_revert_tty(ExecContext *c) {
6208         _cleanup_close_ int fd = -EBADF;
6209         const char *path;
6210         struct stat st;
6211         int r;
6212
6213         assert(c);
6214
6215         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6216         exec_context_tty_reset(c, NULL);
6217
6218         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6219          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6220          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6221         if (!exec_context_may_touch_tty(c))
6222                 return;
6223
6224         path = exec_context_tty_path(c);
6225         if (!path)
6226                 return;
6227
6228         fd = open(path, O_PATH|O_CLOEXEC);
6229         if (fd < 0)
6230                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6231                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6232                                              path);
6233
6234         if (fstat(fd, &st) < 0)
6235                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6236
6237         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6238          * if things are a character device, since a proper check either means we'd have to open the TTY and
6239          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6240          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6241          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6242         if (!S_ISCHR(st.st_mode))
6243                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6244
6245         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6246         if (r < 0)
6247                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6248 }
6249
6250 int exec_context_get_clean_directories(
6251                 ExecContext *c,
6252                 char **prefix,
6253                 ExecCleanMask mask,
6254                 char ***ret) {
6255
6256         _cleanup_strv_free_ char **l = NULL;
6257         int r;
6258
6259         assert(c);
6260         assert(prefix);
6261         assert(ret);
6262
6263         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6264                 if (!FLAGS_SET(mask, 1U << t))
6265                         continue;
6266
6267                 if (!prefix[t])
6268                         continue;
6269
6270                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6271                         char *j;
6272
6273                         j = path_join(prefix[t], c->directories[t].items[i].path);
6274                         if (!j)
6275                                 return -ENOMEM;
6276
6277                         r = strv_consume(&l, j);
6278                         if (r < 0)
6279                                 return r;
6280
6281                         /* Also remove private directories unconditionally. */
6282                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6283                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6284                                 if (!j)
6285                                         return -ENOMEM;
6286
6287                                 r = strv_consume(&l, j);
6288                                 if (r < 0)
6289                                         return r;
6290                         }
6291
6292                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6293                                 j = path_join(prefix[t], *symlink);
6294                                 if (!j)
6295                                         return -ENOMEM;
6296
6297                                 r = strv_consume(&l, j);
6298                                 if (r < 0)
6299                                         return r;
6300                         }
6301                 }
6302         }
6303
6304         *ret = TAKE_PTR(l);
6305         return 0;
6306 }
6307
6308 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6309         ExecCleanMask mask = 0;
6310
6311         assert(c);
6312         assert(ret);
6313
6314         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6315                 if (c->directories[t].n_items > 0)
6316                         mask |= 1U << t;
6317
6318         *ret = mask;
6319         return 0;
6320 }
6321
6322 void exec_status_start(ExecStatus *s, pid_t pid) {
6323         assert(s);
6324
6325         *s = (ExecStatus) {
6326                 .pid = pid,
6327         };
6328
6329         dual_timestamp_get(&s->start_timestamp);
6330 }
6331
6332 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6333         assert(s);
6334
6335         if (s->pid != pid)
6336                 *s = (ExecStatus) {
6337                         .pid = pid,
6338                 };
6339
6340         dual_timestamp_get(&s->exit_timestamp);
6341
6342         s->code = code;
6343         s->status = status;
6344
6345         if (context && context->utmp_id)
6346                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6347 }
6348
6349 void exec_status_reset(ExecStatus *s) {
6350         assert(s);
6351
6352         *s = (ExecStatus) {};
6353 }
6354
6355 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6356         assert(s);
6357         assert(f);
6358
6359         if (s->pid <= 0)
6360                 return;
6361
6362         prefix = strempty(prefix);
6363
6364         fprintf(f,
6365                 "%sPID: "PID_FMT"\n",
6366                 prefix, s->pid);
6367
6368         if (dual_timestamp_is_set(&s->start_timestamp))
6369                 fprintf(f,
6370                         "%sStart Timestamp: %s\n",
6371                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6372
6373         if (dual_timestamp_is_set(&s->exit_timestamp))
6374                 fprintf(f,
6375                         "%sExit Timestamp: %s\n"
6376                         "%sExit Code: %s\n"
6377                         "%sExit Status: %i\n",
6378                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6379                         prefix, sigchld_code_to_string(s->code),
6380                         prefix, s->status);
6381 }
6382
6383 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6384         _cleanup_free_ char *cmd = NULL;
6385         const char *prefix2;
6386
6387         assert(c);
6388         assert(f);
6389
6390         prefix = strempty(prefix);
6391         prefix2 = strjoina(prefix, "\t");
6392
6393         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6394
6395         fprintf(f,
6396                 "%sCommand Line: %s\n",
6397                 prefix, strnull(cmd));
6398
6399         exec_status_dump(&c->exec_status, f, prefix2);
6400 }
6401
6402 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6403         assert(f);
6404
6405         prefix = strempty(prefix);
6406
6407         LIST_FOREACH(command, i, c)
6408                 exec_command_dump(i, f, prefix);
6409 }
6410
6411 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6412         ExecCommand *end;
6413
6414         assert(l);
6415         assert(e);
6416
6417         if (*l) {
6418                 /* It's kind of important, that we keep the order here */
6419                 end = LIST_FIND_TAIL(command, *l);
6420                 LIST_INSERT_AFTER(command, *l, end, e);
6421         } else
6422                 *l = e;
6423 }
6424
6425 int exec_command_set(ExecCommand *c, const char *path, ...) {
6426         va_list ap;
6427         char **l, *p;
6428
6429         assert(c);
6430         assert(path);
6431
6432         va_start(ap, path);
6433         l = strv_new_ap(path, ap);
6434         va_end(ap);
6435
6436         if (!l)
6437                 return -ENOMEM;
6438
6439         p = strdup(path);
6440         if (!p) {
6441                 strv_free(l);
6442                 return -ENOMEM;
6443         }
6444
6445         free_and_replace(c->path, p);
6446
6447         return strv_free_and_replace(c->argv, l);
6448 }
6449
6450 int exec_command_append(ExecCommand *c, const char *path, ...) {
6451         _cleanup_strv_free_ char **l = NULL;
6452         va_list ap;
6453         int r;
6454
6455         assert(c);
6456         assert(path);
6457
6458         va_start(ap, path);
6459         l = strv_new_ap(path, ap);
6460         va_end(ap);
6461
6462         if (!l)
6463                 return -ENOMEM;
6464
6465         r = strv_extend_strv(&c->argv, l, false);
6466         if (r < 0)
6467                 return r;
6468
6469         return 0;
6470 }
6471
6472 static char *destroy_tree(char *path) {
6473         if (!path)
6474                 return NULL;
6475
6476         if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6477                 log_debug("Spawning process to nuke '%s'", path);
6478
6479                 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6480         }
6481
6482         return mfree(path);
6483 }
6484
6485 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6486         if (!rt)
6487                 return NULL;
6488
6489         if (rt->manager)
6490                 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6491
6492         rt->id = mfree(rt->id);
6493         rt->tmp_dir = mfree(rt->tmp_dir);
6494         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6495         safe_close_pair(rt->netns_storage_socket);
6496         safe_close_pair(rt->ipcns_storage_socket);
6497         return mfree(rt);
6498 }
6499
6500 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6501 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6502
6503 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6504         if (!rt)
6505                 return NULL;
6506
6507         assert(rt->n_ref > 0);
6508         rt->n_ref--;
6509
6510         if (rt->n_ref > 0)
6511                 return NULL;
6512
6513         rt->tmp_dir = destroy_tree(rt->tmp_dir);
6514         rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6515
6516         return exec_shared_runtime_free(rt);
6517 }
6518
6519 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6520         _cleanup_free_ char *id_copy = NULL;
6521         ExecSharedRuntime *n;
6522
6523         assert(ret);
6524
6525         id_copy = strdup(id);
6526         if (!id_copy)
6527                 return -ENOMEM;
6528
6529         n = new(ExecSharedRuntime, 1);
6530         if (!n)
6531                 return -ENOMEM;
6532
6533         *n = (ExecSharedRuntime) {
6534                 .id = TAKE_PTR(id_copy),
6535                 .netns_storage_socket = PIPE_EBADF,
6536                 .ipcns_storage_socket = PIPE_EBADF,
6537         };
6538
6539         *ret = n;
6540         return 0;
6541 }
6542
6543 static int exec_shared_runtime_add(
6544                 Manager *m,
6545                 const char *id,
6546                 char **tmp_dir,
6547                 char **var_tmp_dir,
6548                 int netns_storage_socket[2],
6549                 int ipcns_storage_socket[2],
6550                 ExecSharedRuntime **ret) {
6551
6552         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6553         int r;
6554
6555         assert(m);
6556         assert(id);
6557
6558         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6559
6560         r = exec_shared_runtime_allocate(&rt, id);
6561         if (r < 0)
6562                 return r;
6563
6564         r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6565         if (r < 0)
6566                 return r;
6567
6568         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6569         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6570         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6571
6572         if (netns_storage_socket) {
6573                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6574                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6575         }
6576
6577         if (ipcns_storage_socket) {
6578                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6579                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6580         }
6581
6582         rt->manager = m;
6583
6584         if (ret)
6585                 *ret = rt;
6586         /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6587         TAKE_PTR(rt);
6588         return 0;
6589 }
6590
6591 static int exec_shared_runtime_make(
6592                 Manager *m,
6593                 const ExecContext *c,
6594                 const char *id,
6595                 ExecSharedRuntime **ret) {
6596
6597         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6598         _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6599         int r;
6600
6601         assert(m);
6602         assert(c);
6603         assert(id);
6604
6605         /* It is not necessary to create ExecSharedRuntime object. */
6606         if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6607                 *ret = NULL;
6608                 return 0;
6609         }
6610
6611         if (c->private_tmp &&
6612             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6613               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6614                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6615                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6616                 if (r < 0)
6617                         return r;
6618         }
6619
6620         if (exec_needs_network_namespace(c)) {
6621                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6622                         return -errno;
6623         }
6624
6625         if (exec_needs_ipc_namespace(c)) {
6626                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6627                         return -errno;
6628         }
6629
6630         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6631         if (r < 0)
6632                 return r;
6633
6634         return 1;
6635 }
6636
6637 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6638         ExecSharedRuntime *rt;
6639         int r;
6640
6641         assert(m);
6642         assert(id);
6643         assert(ret);
6644
6645         rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6646         if (rt)
6647                 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6648                 goto ref;
6649
6650         if (!create) {
6651                 *ret = NULL;
6652                 return 0;
6653         }
6654
6655         /* If not found, then create a new object. */
6656         r = exec_shared_runtime_make(m, c, id, &rt);
6657         if (r < 0)
6658                 return r;
6659         if (r == 0) {
6660                 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6661                 *ret = NULL;
6662                 return 0;
6663         }
6664
6665 ref:
6666         /* increment reference counter. */
6667         rt->n_ref++;
6668         *ret = rt;
6669         return 1;
6670 }
6671
6672 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6673         ExecSharedRuntime *rt;
6674
6675         assert(m);
6676         assert(f);
6677         assert(fds);
6678
6679         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6680                 fprintf(f, "exec-runtime=%s", rt->id);
6681
6682                 if (rt->tmp_dir)
6683                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6684
6685                 if (rt->var_tmp_dir)
6686                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6687
6688                 if (rt->netns_storage_socket[0] >= 0) {
6689                         int copy;
6690
6691                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6692                         if (copy < 0)
6693                                 return copy;
6694
6695                         fprintf(f, " netns-socket-0=%i", copy);
6696                 }
6697
6698                 if (rt->netns_storage_socket[1] >= 0) {
6699                         int copy;
6700
6701                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6702                         if (copy < 0)
6703                                 return copy;
6704
6705                         fprintf(f, " netns-socket-1=%i", copy);
6706                 }
6707
6708                 if (rt->ipcns_storage_socket[0] >= 0) {
6709                         int copy;
6710
6711                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6712                         if (copy < 0)
6713                                 return copy;
6714
6715                         fprintf(f, " ipcns-socket-0=%i", copy);
6716                 }
6717
6718                 if (rt->ipcns_storage_socket[1] >= 0) {
6719                         int copy;
6720
6721                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6722                         if (copy < 0)
6723                                 return copy;
6724
6725                         fprintf(f, " ipcns-socket-1=%i", copy);
6726                 }
6727
6728                 fputc('\n', f);
6729         }
6730
6731         return 0;
6732 }
6733
6734 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6735         _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6736         ExecSharedRuntime *rt;
6737         int r;
6738
6739         /* This is for the migration from old (v237 or earlier) deserialization text.
6740          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6741          * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6742          * so or not from the serialized text, then we always creates a new object owned by this. */
6743
6744         assert(u);
6745         assert(key);
6746         assert(value);
6747
6748         /* Manager manages ExecSharedRuntime objects by the unit id.
6749          * So, we omit the serialized text when the unit does not have id (yet?)... */
6750         if (isempty(u->id)) {
6751                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6752                 return 0;
6753         }
6754
6755         if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6756                 return log_oom();
6757
6758         rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6759         if (!rt) {
6760                 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6761                         return log_oom();
6762
6763                 rt = rt_create;
6764         }
6765
6766         if (streq(key, "tmp-dir")) {
6767                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6768                         return -ENOMEM;
6769
6770         } else if (streq(key, "var-tmp-dir")) {
6771                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6772                         return -ENOMEM;
6773
6774         } else if (streq(key, "netns-socket-0")) {
6775                 int fd;
6776
6777                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6778                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6779                         return 0;
6780                 }
6781
6782                 safe_close(rt->netns_storage_socket[0]);
6783                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6784
6785         } else if (streq(key, "netns-socket-1")) {
6786                 int fd;
6787
6788                 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6789                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6790                         return 0;
6791                 }
6792
6793                 safe_close(rt->netns_storage_socket[1]);
6794                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6795
6796         } else
6797                 return 0;
6798
6799         /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6800         if (rt_create) {
6801                 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6802                 if (r < 0) {
6803                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6804                         return 0;
6805                 }
6806
6807                 rt_create->manager = u->manager;
6808
6809                 /* Avoid cleanup */
6810                 TAKE_PTR(rt_create);
6811         }
6812
6813         return 1;
6814 }
6815
6816 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6817         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6818         char *id = NULL;
6819         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6820         const char *p, *v = ASSERT_PTR(value);
6821         size_t n;
6822
6823         assert(m);
6824         assert(fds);
6825
6826         n = strcspn(v, " ");
6827         id = strndupa_safe(v, n);
6828         if (v[n] != ' ')
6829                 goto finalize;
6830         p = v + n + 1;
6831
6832         v = startswith(p, "tmp-dir=");
6833         if (v) {
6834                 n = strcspn(v, " ");
6835                 tmp_dir = strndup(v, n);
6836                 if (!tmp_dir)
6837                         return log_oom();
6838                 if (v[n] != ' ')
6839                         goto finalize;
6840                 p = v + n + 1;
6841         }
6842
6843         v = startswith(p, "var-tmp-dir=");
6844         if (v) {
6845                 n = strcspn(v, " ");
6846                 var_tmp_dir = strndup(v, n);
6847                 if (!var_tmp_dir)
6848                         return log_oom();
6849                 if (v[n] != ' ')
6850                         goto finalize;
6851                 p = v + n + 1;
6852         }
6853
6854         v = startswith(p, "netns-socket-0=");
6855         if (v) {
6856                 char *buf;
6857
6858                 n = strcspn(v, " ");
6859                 buf = strndupa_safe(v, n);
6860
6861                 netns_fdpair[0] = parse_fd(buf);
6862                 if (netns_fdpair[0] < 0)
6863                         return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6864                 if (!fdset_contains(fds, netns_fdpair[0]))
6865                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6866                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6867                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6868                 if (v[n] != ' ')
6869                         goto finalize;
6870                 p = v + n + 1;
6871         }
6872
6873         v = startswith(p, "netns-socket-1=");
6874         if (v) {
6875                 char *buf;
6876
6877                 n = strcspn(v, " ");
6878                 buf = strndupa_safe(v, n);
6879
6880                 netns_fdpair[1] = parse_fd(buf);
6881                 if (netns_fdpair[1] < 0)
6882                         return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6883                 if (!fdset_contains(fds, netns_fdpair[1]))
6884                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6885                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6886                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6887                 if (v[n] != ' ')
6888                         goto finalize;
6889                 p = v + n + 1;
6890         }
6891
6892         v = startswith(p, "ipcns-socket-0=");
6893         if (v) {
6894                 char *buf;
6895
6896                 n = strcspn(v, " ");
6897                 buf = strndupa_safe(v, n);
6898
6899                 ipcns_fdpair[0] = parse_fd(buf);
6900                 if (ipcns_fdpair[0] < 0)
6901                         return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6902                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6903                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6904                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6905                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6906                 if (v[n] != ' ')
6907                         goto finalize;
6908                 p = v + n + 1;
6909         }
6910
6911         v = startswith(p, "ipcns-socket-1=");
6912         if (v) {
6913                 char *buf;
6914
6915                 n = strcspn(v, " ");
6916                 buf = strndupa_safe(v, n);
6917
6918                 ipcns_fdpair[1] = parse_fd(buf);
6919                 if (ipcns_fdpair[1] < 0)
6920                         return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6921                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6922                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6923                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6924                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6925         }
6926
6927 finalize:
6928         r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6929         if (r < 0)
6930                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6931         return 0;
6932 }
6933
6934 void exec_shared_runtime_vacuum(Manager *m) {
6935         ExecSharedRuntime *rt;
6936
6937         assert(m);
6938
6939         /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6940
6941         HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6942                 if (rt->n_ref > 0)
6943                         continue;
6944
6945                 (void) exec_shared_runtime_free(rt);
6946         }
6947 }
6948
6949 int exec_runtime_make(
6950                 const Unit *unit,
6951                 const ExecContext *context,
6952                 ExecSharedRuntime *shared,
6953                 DynamicCreds *creds,
6954                 ExecRuntime **ret) {
6955         _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6956         _cleanup_free_ char *ephemeral = NULL;
6957         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6958         int r;
6959
6960         assert(unit);
6961         assert(context);
6962         assert(ret);
6963
6964         if (!shared && !creds && !exec_needs_ephemeral(context)) {
6965                 *ret = NULL;
6966                 return 0;
6967         }
6968
6969         if (exec_needs_ephemeral(context)) {
6970                 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
6971                 if (r < 0)
6972                         return r;
6973
6974                 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
6975                 if (r < 0)
6976                         return r;
6977
6978                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
6979                         return -errno;
6980         }
6981
6982         rt = new(ExecRuntime, 1);
6983         if (!rt)
6984                 return -ENOMEM;
6985
6986         *rt = (ExecRuntime) {
6987                 .shared = shared,
6988                 .dynamic_creds = creds,
6989                 .ephemeral_copy = TAKE_PTR(ephemeral),
6990                 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
6991                 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
6992         };
6993
6994         *ret = TAKE_PTR(rt);
6995         return 1;
6996 }
6997
6998 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
6999         if (!rt)
7000                 return NULL;
7001
7002         exec_shared_runtime_unref(rt->shared);
7003         dynamic_creds_unref(rt->dynamic_creds);
7004
7005         rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7006
7007         safe_close_pair(rt->ephemeral_storage_socket);
7008         return mfree(rt);
7009 }
7010
7011 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7012         if (!rt)
7013                 return NULL;
7014
7015         rt->shared = exec_shared_runtime_destroy(rt->shared);
7016         rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7017         return exec_runtime_free(rt);
7018 }
7019
7020 void exec_params_clear(ExecParameters *p) {
7021         if (!p)
7022                 return;
7023
7024         p->environment = strv_free(p->environment);
7025         p->fd_names = strv_free(p->fd_names);
7026         p->fds = mfree(p->fds);
7027         p->exec_fd = safe_close(p->exec_fd);
7028 }
7029
7030 void exec_directory_done(ExecDirectory *d) {
7031         if (!d)
7032                 return;
7033
7034         for (size_t i = 0; i < d->n_items; i++) {
7035                 free(d->items[i].path);
7036                 strv_free(d->items[i].symlinks);
7037         }
7038
7039         d->items = mfree(d->items);
7040         d->n_items = 0;
7041         d->mode = 0755;
7042 }
7043
7044 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7045         assert(d);
7046         assert(path);
7047
7048         for (size_t i = 0; i < d->n_items; i++)
7049                 if (path_equal(d->items[i].path, path))
7050                         return &d->items[i];
7051
7052         return NULL;
7053 }
7054
7055 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7056         _cleanup_strv_free_ char **s = NULL;
7057         _cleanup_free_ char *p = NULL;
7058         ExecDirectoryItem *existing;
7059         int r;
7060
7061         assert(d);
7062         assert(path);
7063
7064         existing = exec_directory_find(d, path);
7065         if (existing) {
7066                 r = strv_extend(&existing->symlinks, symlink);
7067                 if (r < 0)
7068                         return r;
7069
7070                 return 0; /* existing item is updated */
7071         }
7072
7073         p = strdup(path);
7074         if (!p)
7075                 return -ENOMEM;
7076
7077         if (symlink) {
7078                 s = strv_new(symlink);
7079                 if (!s)
7080                         return -ENOMEM;
7081         }
7082
7083         if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7084                 return -ENOMEM;
7085
7086         d->items[d->n_items++] = (ExecDirectoryItem) {
7087                 .path = TAKE_PTR(p),
7088                 .symlinks = TAKE_PTR(s),
7089         };
7090
7091         return 1; /* new item is added */
7092 }
7093
7094 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7095         assert(a);
7096         assert(b);
7097
7098         return path_compare(a->path, b->path);
7099 }
7100
7101 void exec_directory_sort(ExecDirectory *d) {
7102         assert(d);
7103
7104         /* Sort the exec directories to make always parent directories processed at first in
7105          * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7106          * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7107          * list. See also comments in setup_exec_directory() and issue #24783. */
7108
7109         if (d->n_items <= 1)
7110                 return;
7111
7112         typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7113
7114         for (size_t i = 1; i < d->n_items; i++)
7115                 for (size_t j = 0; j < i; j++)
7116                         if (path_startswith(d->items[i].path, d->items[j].path)) {
7117                                 d->items[i].only_create = true;
7118                                 break;
7119                         }
7120 }
7121
7122 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7123         ExecDirectoryType t;
7124
7125         assert(s);
7126
7127         if (streq(s, "all"))
7128                 return EXEC_CLEAN_ALL;
7129         if (streq(s, "fdstore"))
7130                 return EXEC_CLEAN_FDSTORE;
7131
7132         t = exec_resource_type_from_string(s);
7133         if (t < 0)
7134                 return (ExecCleanMask) t;
7135
7136         return 1U << t;
7137 }
7138
7139 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7140         [EXEC_INPUT_NULL] = "null",
7141         [EXEC_INPUT_TTY] = "tty",
7142         [EXEC_INPUT_TTY_FORCE] = "tty-force",
7143         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7144         [EXEC_INPUT_SOCKET] = "socket",
7145         [EXEC_INPUT_NAMED_FD] = "fd",
7146         [EXEC_INPUT_DATA] = "data",
7147         [EXEC_INPUT_FILE] = "file",
7148 };
7149
7150 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7151
7152 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7153         [EXEC_OUTPUT_INHERIT] = "inherit",
7154         [EXEC_OUTPUT_NULL] = "null",
7155         [EXEC_OUTPUT_TTY] = "tty",
7156         [EXEC_OUTPUT_KMSG] = "kmsg",
7157         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7158         [EXEC_OUTPUT_JOURNAL] = "journal",
7159         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7160         [EXEC_OUTPUT_SOCKET] = "socket",
7161         [EXEC_OUTPUT_NAMED_FD] = "fd",
7162         [EXEC_OUTPUT_FILE] = "file",
7163         [EXEC_OUTPUT_FILE_APPEND] = "append",
7164         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7165 };
7166
7167 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7168
7169 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7170         [EXEC_UTMP_INIT] = "init",
7171         [EXEC_UTMP_LOGIN] = "login",
7172         [EXEC_UTMP_USER] = "user",
7173 };
7174
7175 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7176
7177 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7178         [EXEC_PRESERVE_NO] = "no",
7179         [EXEC_PRESERVE_YES] = "yes",
7180         [EXEC_PRESERVE_RESTART] = "restart",
7181 };
7182
7183 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7184
7185 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7186 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7187         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7188         [EXEC_DIRECTORY_STATE] = "StateDirectory",
7189         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7190         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7191         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7192 };
7193
7194 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7195
7196 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7197 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7198         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
7199         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
7200         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
7201         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
7202         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7203 };
7204
7205 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7206
7207 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7208  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7209  * directories, specifically .timer units with their timestamp touch file. */
7210 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7211         [EXEC_DIRECTORY_RUNTIME] = "runtime",
7212         [EXEC_DIRECTORY_STATE] = "state",
7213         [EXEC_DIRECTORY_CACHE] = "cache",
7214         [EXEC_DIRECTORY_LOGS] = "logs",
7215         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7216 };
7217
7218 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7219
7220 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7221  * the service payload in. */
7222 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7223         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7224         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7225         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7226         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7227         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7228 };
7229
7230 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7231
7232 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7233         [EXEC_KEYRING_INHERIT] = "inherit",
7234         [EXEC_KEYRING_PRIVATE] = "private",
7235         [EXEC_KEYRING_SHARED] = "shared",
7236 };
7237
7238 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);