src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "cap-list.h"
  45 #include "capability-util.h"
  46 #include "cgroup-setup.h"
  47 #include "chown-recursive.h"
  48 #include "cpu-set-util.h"
  49 #include "data-fd-util.h"
  50 #include "def.h"
  51 #include "env-file.h"
  52 #include "env-util.h"
  53 #include "errno-list.h"
  54 #include "execute.h"
  55 #include "exit-status.h"
  56 #include "fd-util.h"
  57 #include "fileio.h"
  58 #include "format-util.h"
  59 #include "fs-util.h"
  60 #include "glob-util.h"
  61 #include "hexdecoct.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "manager-dump.h"
  69 #include "memory-util.h"
  70 #include "missing_fs.h"
  71 #include "mkdir.h"
  72 #include "mount-util.h"
  73 #include "mountpoint-util.h"
  74 #include "namespace.h"
  75 #include "parse-util.h"
  76 #include "path-util.h"
  77 #include "process-util.h"
  78 #include "random-util.h"
  79 #include "rlimit-util.h"
  80 #include "rm-rf.h"
  81 #if HAVE_SECCOMP
  82 #include "seccomp-util.h"
  83 #endif
  84 #include "securebits-util.h"
  85 #include "selinux-util.h"
  86 #include "signal-util.h"
  87 #include "smack-util.h"
  88 #include "socket-util.h"
  89 #include "special.h"
  90 #include "stat-util.h"
  91 #include "string-table.h"
  92 #include "string-util.h"
  93 #include "strv.h"
  94 #include "syslog-util.h"
  95 #include "terminal-util.h"
  96 #include "tmpfile-util.h"
  97 #include "umask-util.h"
  98 #include "unit-serialize.h"
  99 #include "user-util.h"
 100 #include "utmp-wtmp.h"
 101
/* 5 seconds, expressed as a multiple of USEC_PER_SEC. (The users of this timeout are outside the part of
 * the file visible here.) */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
/* 1 second, expressed as a multiple of USEC_PER_SEC. */
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* 8*1024*1024 = 8 MiB: the size passed to fd_inc_sndbuf() for the journal stream socket set up in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 106
 107 static int shift_fds(int fds[], size_t n_fds) {
 108         if (n_fds <= 0)
 109                 return 0;
 110
 111         /* Modifies the fds array! (sorts it) */
 112
 113         assert(fds);
 114
 115         for (int start = 0;;) {
 116                 int restart_from = -1;
 117
 118                 for (int i = start; i < (int) n_fds; i++) {
 119                         int nfd;
 120
 121                         /* Already at right index? */
 122                         if (fds[i] == i+3)
 123                                 continue;
 124
 125                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 126                         if (nfd < 0)
 127                                 return -errno;
 128
 129                         safe_close(fds[i]);
 130                         fds[i] = nfd;
 131
 132                         /* Hmm, the fd we wanted isn't free? Then
 133                          * let's remember that and try again from here */
 134                         if (nfd != i+3 && restart_from < 0)
 135                                 restart_from = i;
 136                 }
 137
 138                 if (restart_from < 0)
 139                         break;
 140
 141                 start = restart_from;
 142         }
 143
 144         return 0;
 145 }
 146
 147 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 148         size_t n_fds;
 149         int r;
 150
 151         n_fds = n_socket_fds + n_storage_fds;
 152         if (n_fds <= 0)
 153                 return 0;
 154
 155         assert(fds);
 156
 157         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 158          * O_NONBLOCK only applies to socket activation though. */
 159
 160         for (size_t i = 0; i < n_fds; i++) {
 161
 162                 if (i < n_socket_fds) {
 163                         r = fd_nonblock(fds[i], nonblock);
 164                         if (r < 0)
 165                                 return r;
 166                 }
 167
 168                 /* We unconditionally drop FD_CLOEXEC from the fds,
 169                  * since after all we want to pass these fds to our
 170                  * children */
 171
 172                 r = fd_cloexec(fds[i], false);
 173                 if (r < 0)
 174                         return r;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static const char *exec_context_tty_path(const ExecContext *context) {
 181         assert(context);
 182
 183         if (context->stdio_as_fds)
 184                 return NULL;
 185
 186         if (context->tty_path)
 187                 return context->tty_path;
 188
 189         return "/dev/console";
 190 }
 191
 192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 193         const char *path;
 194
 195         assert(context);
 196
 197         path = exec_context_tty_path(context);
 198
 199         if (context->tty_vhangup) {
 200                 if (p && p->stdin_fd >= 0)
 201                         (void) terminal_vhangup_fd(p->stdin_fd);
 202                 else if (path)
 203                         (void) terminal_vhangup(path);
 204         }
 205
 206         if (context->tty_reset) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) reset_terminal_fd(p->stdin_fd, true);
 209                 else if (path)
 210                         (void) reset_terminal(path);
 211         }
 212
 213         if (context->tty_vt_disallocate && path)
 214                 (void) vt_disallocate(path);
 215 }
 216
 217 static bool is_terminal_input(ExecInput i) {
 218         return IN_SET(i,
 219                       EXEC_INPUT_TTY,
 220                       EXEC_INPUT_TTY_FORCE,
 221                       EXEC_INPUT_TTY_FAIL);
 222 }
 223
 224 static bool is_terminal_output(ExecOutput o) {
 225         return IN_SET(o,
 226                       EXEC_OUTPUT_TTY,
 227                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 228                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 229 }
 230
 231 static bool is_kmsg_output(ExecOutput o) {
 232         return IN_SET(o,
 233                       EXEC_OUTPUT_KMSG,
 234                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 235 }
 236
 237 static bool exec_context_needs_term(const ExecContext *c) {
 238         assert(c);
 239
 240         /* Return true if the execution context suggests we should set $TERM to something useful. */
 241
 242         if (is_terminal_input(c->std_input))
 243                 return true;
 244
 245         if (is_terminal_output(c->std_output))
 246                 return true;
 247
 248         if (is_terminal_output(c->std_error))
 249                 return true;
 250
 251         return !!c->tty_path;
 252 }
 253
 254 static int open_null_as(int flags, int nfd) {
 255         int fd;
 256
 257         assert(nfd >= 0);
 258
 259         fd = open("/dev/null", flags|O_NOCTTY);
 260         if (fd < 0)
 261                 return -errno;
 262
 263         return move_fd(fd, nfd, false);
 264 }
 265
 266 static int connect_journal_socket(
 267                 int fd,
 268                 const char *log_namespace,
 269                 uid_t uid,
 270                 gid_t gid) {
 271
 272         union sockaddr_union sa;
 273         socklen_t sa_len;
 274         uid_t olduid = UID_INVALID;
 275         gid_t oldgid = GID_INVALID;
 276         const char *j;
 277         int r;
 278
 279         j = log_namespace ?
 280                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 281                 "/run/systemd/journal/stdout";
 282         r = sockaddr_un_set_path(&sa.un, j);
 283         if (r < 0)
 284                 return r;
 285         sa_len = r;
 286
 287         if (gid_is_valid(gid)) {
 288                 oldgid = getgid();
 289
 290                 if (setegid(gid) < 0)
 291                         return -errno;
 292         }
 293
 294         if (uid_is_valid(uid)) {
 295                 olduid = getuid();
 296
 297                 if (seteuid(uid) < 0) {
 298                         r = -errno;
 299                         goto restore_gid;
 300                 }
 301         }
 302
 303         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 304
 305         /* If we fail to restore the uid or gid, things will likely
 306            fail later on. This should only happen if an LSM interferes. */
 307
 308         if (uid_is_valid(uid))
 309                 (void) seteuid(olduid);
 310
 311  restore_gid:
 312         if (gid_is_valid(gid))
 313                 (void) setegid(oldgid);
 314
 315         return r;
 316 }
 317
 318 static int connect_logger_as(
 319                 const Unit *unit,
 320                 const ExecContext *context,
 321                 const ExecParameters *params,
 322                 ExecOutput output,
 323                 const char *ident,
 324                 int nfd,
 325                 uid_t uid,
 326                 gid_t gid) {
 327
 328         _cleanup_close_ int fd = -1;
 329         int r;
 330
 331         assert(context);
 332         assert(params);
 333         assert(output < _EXEC_OUTPUT_MAX);
 334         assert(ident);
 335         assert(nfd >= 0);
 336
 337         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 338         if (fd < 0)
 339                 return -errno;
 340
 341         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 342         if (r < 0)
 343                 return r;
 344
 345         if (shutdown(fd, SHUT_RD) < 0)
 346                 return -errno;
 347
 348         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 349
 350         if (dprintf(fd,
 351                 "%s\n"
 352                 "%s\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n",
 358                 context->syslog_identifier ?: ident,
 359                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 360                 context->syslog_priority,
 361                 !!context->syslog_level_prefix,
 362                 false,
 363                 is_kmsg_output(output),
 364                 is_terminal_output(output)) < 0)
 365                 return -errno;
 366
 367         return move_fd(TAKE_FD(fd), nfd, false);
 368 }
 369
 370 static int open_terminal_as(const char *path, int flags, int nfd) {
 371         int fd;
 372
 373         assert(path);
 374         assert(nfd >= 0);
 375
 376         fd = open_terminal(path, flags | O_NOCTTY);
 377         if (fd < 0)
 378                 return fd;
 379
 380         return move_fd(fd, nfd, false);
 381 }
 382
 383 static int acquire_path(const char *path, int flags, mode_t mode) {
 384         union sockaddr_union sa;
 385         socklen_t sa_len;
 386         _cleanup_close_ int fd = -1;
 387         int r;
 388
 389         assert(path);
 390
 391         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 392                 flags |= O_CREAT;
 393
 394         fd = open(path, flags|O_NOCTTY, mode);
 395         if (fd >= 0)
 396                 return TAKE_FD(fd);
 397
 398         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 399                 return -errno;
 400
 401         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 402
 403         r = sockaddr_un_set_path(&sa.un, path);
 404         if (r < 0)
 405                 return r == -EINVAL ? -ENXIO : r;
 406         sa_len = r;
 407
 408         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 409         if (fd < 0)
 410                 return -errno;
 411
 412         if (connect(fd, &sa.sa, sa_len) < 0)
 413                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 414                                                            * indication that this wasn't an AF_UNIX socket after all */
 415
 416         if ((flags & O_ACCMODE) == O_RDONLY)
 417                 r = shutdown(fd, SHUT_WR);
 418         else if ((flags & O_ACCMODE) == O_WRONLY)
 419                 r = shutdown(fd, SHUT_RD);
 420         else
 421                 r = 0;
 422         if (r < 0)
 423                 return -errno;
 424
 425         return TAKE_FD(fd);
 426 }
 427
 428 static int fixup_input(
 429                 const ExecContext *context,
 430                 int socket_fd,
 431                 bool apply_tty_stdin) {
 432
 433         ExecInput std_input;
 434
 435         assert(context);
 436
 437         std_input = context->std_input;
 438
 439         if (is_terminal_input(std_input) && !apply_tty_stdin)
 440                 return EXEC_INPUT_NULL;
 441
 442         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 443                 return EXEC_INPUT_NULL;
 444
 445         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 446                 return EXEC_INPUT_NULL;
 447
 448         return std_input;
 449 }
 450
 451 static int fixup_output(ExecOutput output, int socket_fd) {
 452
 453         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 454                 return EXEC_OUTPUT_INHERIT;
 455
 456         return output;
 457 }
 458
 459 static int setup_input(
 460                 const ExecContext *context,
 461                 const ExecParameters *params,
 462                 int socket_fd,
 463                 const int named_iofds[static 3]) {
 464
 465         ExecInput i;
 466
 467         assert(context);
 468         assert(params);
 469         assert(named_iofds);
 470
 471         if (params->stdin_fd >= 0) {
 472                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 473                         return -errno;
 474
 475                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 476                 if (isatty(STDIN_FILENO)) {
 477                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 478                         (void) reset_terminal_fd(STDIN_FILENO, true);
 479                 }
 480
 481                 return STDIN_FILENO;
 482         }
 483
 484         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 485
 486         switch (i) {
 487
 488         case EXEC_INPUT_NULL:
 489                 return open_null_as(O_RDONLY, STDIN_FILENO);
 490
 491         case EXEC_INPUT_TTY:
 492         case EXEC_INPUT_TTY_FORCE:
 493         case EXEC_INPUT_TTY_FAIL: {
 494                 int fd;
 495
 496                 fd = acquire_terminal(exec_context_tty_path(context),
 497                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 498                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 499                                                                   ACQUIRE_TERMINAL_WAIT,
 500                                       USEC_INFINITY);
 501                 if (fd < 0)
 502                         return fd;
 503
 504                 return move_fd(fd, STDIN_FILENO, false);
 505         }
 506
 507         case EXEC_INPUT_SOCKET:
 508                 assert(socket_fd >= 0);
 509
 510                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 511
 512         case EXEC_INPUT_NAMED_FD:
 513                 assert(named_iofds[STDIN_FILENO] >= 0);
 514
 515                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 516                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 517
 518         case EXEC_INPUT_DATA: {
 519                 int fd;
 520
 521                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 522                 if (fd < 0)
 523                         return fd;
 524
 525                 return move_fd(fd, STDIN_FILENO, false);
 526         }
 527
 528         case EXEC_INPUT_FILE: {
 529                 bool rw;
 530                 int fd;
 531
 532                 assert(context->stdio_file[STDIN_FILENO]);
 533
 534                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 535                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 536
 537                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 538                 if (fd < 0)
 539                         return fd;
 540
 541                 return move_fd(fd, STDIN_FILENO, false);
 542         }
 543
 544         default:
 545                 assert_not_reached("Unknown input type");
 546         }
 547 }
 548
 549 static bool can_inherit_stderr_from_stdout(
 550                 const ExecContext *context,
 551                 ExecOutput o,
 552                 ExecOutput e) {
 553
 554         assert(context);
 555
 556         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 557          * stderr fd */
 558
 559         if (e == EXEC_OUTPUT_INHERIT)
 560                 return true;
 561         if (e != o)
 562                 return false;
 563
 564         if (e == EXEC_OUTPUT_NAMED_FD)
 565                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 566
 567         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 568                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 569
 570         return true;
 571 }
 572
 573 static int setup_output(
 574                 const Unit *unit,
 575                 const ExecContext *context,
 576                 const ExecParameters *params,
 577                 int fileno,
 578                 int socket_fd,
 579                 const int named_iofds[static 3],
 580                 const char *ident,
 581                 uid_t uid,
 582                 gid_t gid,
 583                 dev_t *journal_stream_dev,
 584                 ino_t *journal_stream_ino) {
 585
 586         ExecOutput o;
 587         ExecInput i;
 588         int r;
 589
 590         assert(unit);
 591         assert(context);
 592         assert(params);
 593         assert(ident);
 594         assert(journal_stream_dev);
 595         assert(journal_stream_ino);
 596
 597         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 598
 599                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 600                         return -errno;
 601
 602                 return STDOUT_FILENO;
 603         }
 604
 605         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 606                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 607                         return -errno;
 608
 609                 return STDERR_FILENO;
 610         }
 611
 612         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 613         o = fixup_output(context->std_output, socket_fd);
 614
 615         if (fileno == STDERR_FILENO) {
 616                 ExecOutput e;
 617                 e = fixup_output(context->std_error, socket_fd);
 618
 619                 /* This expects the input and output are already set up */
 620
 621                 /* Don't change the stderr file descriptor if we inherit all
 622                  * the way and are not on a tty */
 623                 if (e == EXEC_OUTPUT_INHERIT &&
 624                     o == EXEC_OUTPUT_INHERIT &&
 625                     i == EXEC_INPUT_NULL &&
 626                     !is_terminal_input(context->std_input) &&
 627                     getppid() != 1)
 628                         return fileno;
 629
 630                 /* Duplicate from stdout if possible */
 631                 if (can_inherit_stderr_from_stdout(context, o, e))
 632                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 633
 634                 o = e;
 635
 636         } else if (o == EXEC_OUTPUT_INHERIT) {
 637                 /* If input got downgraded, inherit the original value */
 638                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 639                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 640
 641                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 642                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 643                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 644
 645                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 646                 if (getppid() != 1)
 647                         return fileno;
 648
 649                 /* We need to open /dev/null here anew, to get the right access mode. */
 650                 return open_null_as(O_WRONLY, fileno);
 651         }
 652
 653         switch (o) {
 654
 655         case EXEC_OUTPUT_NULL:
 656                 return open_null_as(O_WRONLY, fileno);
 657
 658         case EXEC_OUTPUT_TTY:
 659                 if (is_terminal_input(i))
 660                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 661
 662                 /* We don't reset the terminal if this is just about output */
 663                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 664
 665         case EXEC_OUTPUT_KMSG:
 666         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 667         case EXEC_OUTPUT_JOURNAL:
 668         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 669                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 670                 if (r < 0) {
 671                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 672                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 673                         r = open_null_as(O_WRONLY, fileno);
 674                 } else {
 675                         struct stat st;
 676
 677                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 678                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 679                          * services to detect whether they are connected to the journal or not.
 680                          *
 681                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 682                          * about STDERR as that's usually the best way to do logging. */
 683
 684                         if (fstat(fileno, &st) >= 0 &&
 685                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 686                                 *journal_stream_dev = st.st_dev;
 687                                 *journal_stream_ino = st.st_ino;
 688                         }
 689                 }
 690                 return r;
 691
 692         case EXEC_OUTPUT_SOCKET:
 693                 assert(socket_fd >= 0);
 694
 695                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 696
 697         case EXEC_OUTPUT_NAMED_FD:
 698                 assert(named_iofds[fileno] >= 0);
 699
 700                 (void) fd_nonblock(named_iofds[fileno], false);
 701                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 702
 703         case EXEC_OUTPUT_FILE:
 704         case EXEC_OUTPUT_FILE_APPEND:
 705         case EXEC_OUTPUT_FILE_TRUNCATE: {
 706                 bool rw;
 707                 int fd, flags;
 708
 709                 assert(context->stdio_file[fileno]);
 710
 711                 rw = context->std_input == EXEC_INPUT_FILE &&
 712                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 713
 714                 if (rw)
 715                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 716
 717                 flags = O_WRONLY;
 718                 if (o == EXEC_OUTPUT_FILE_APPEND)
 719                         flags |= O_APPEND;
 720                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 721                         flags |= O_TRUNC;
 722
 723                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 724                 if (fd < 0)
 725                         return fd;
 726
 727                 return move_fd(fd, fileno, 0);
 728         }
 729
 730         default:
 731                 assert_not_reached("Unknown error type");
 732         }
 733 }
 734
 735 static int chown_terminal(int fd, uid_t uid) {
 736         int r;
 737
 738         assert(fd >= 0);
 739
 740         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 741         if (isatty(fd) < 1) {
 742                 if (IN_SET(errno, EINVAL, ENOTTY))
 743                         return 0; /* not a tty */
 744
 745                 return -errno;
 746         }
 747
 748         /* This might fail. What matters are the results. */
 749         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 750         if (r < 0)
 751                 return r;
 752
 753         return 1;
 754 }
 755
 756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 757         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 758         int r;
 759
 760         assert(_saved_stdin);
 761         assert(_saved_stdout);
 762
 763         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 764         if (saved_stdin < 0)
 765                 return -errno;
 766
 767         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 768         if (saved_stdout < 0)
 769                 return -errno;
 770
 771         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 772         if (fd < 0)
 773                 return fd;
 774
 775         r = chown_terminal(fd, getuid());
 776         if (r < 0)
 777                 return r;
 778
 779         r = reset_terminal_fd(fd, true);
 780         if (r < 0)
 781                 return r;
 782
 783         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 784         fd = -1;
 785         if (r < 0)
 786                 return r;
 787
 788         *_saved_stdin = saved_stdin;
 789         *_saved_stdout = saved_stdout;
 790
 791         saved_stdin = saved_stdout = -1;
 792
 793         return 0;
 794 }
 795
 796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 797         assert(err < 0);
 798
 799         if (err == -ETIMEDOUT)
 800                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 801         else {
 802                 errno = -err;
 803                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 804         }
 805 }
 806
 807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 808         _cleanup_close_ int fd = -1;
 809
 810         assert(vc);
 811
 812         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 813         if (fd < 0)
 814                 return;
 815
 816         write_confirm_error_fd(err, fd, u);
 817 }
 818
 819 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 820         int r = 0;
 821
 822         assert(saved_stdin);
 823         assert(saved_stdout);
 824
 825         release_terminal();
 826
 827         if (*saved_stdin >= 0)
 828                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 829                         r = -errno;
 830
 831         if (*saved_stdout >= 0)
 832                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 833                         r = -errno;
 834
 835         *saved_stdin = safe_close(*saved_stdin);
 836         *saved_stdout = safe_close(*saved_stdout);
 837
 838         return r;
 839 }
 840
/* Results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also used on internal errors */
};
 846
 847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 848         int saved_stdout = -1, saved_stdin = -1, r;
 849         _cleanup_free_ char *e = NULL;
 850         char c;
 851
 852         /* For any internal errors, assume a positive response. */
 853         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 854         if (r < 0) {
 855                 write_confirm_error(r, vc, u);
 856                 return CONFIRM_EXECUTE;
 857         }
 858
 859         /* confirm_spawn might have been disabled while we were sleeping. */
 860         if (manager_is_confirm_spawn_disabled(u->manager)) {
 861                 r = 1;
 862                 goto restore_stdio;
 863         }
 864
 865         e = ellipsize(cmdline, 60, 100);
 866         if (!e) {
 867                 log_oom();
 868                 r = CONFIRM_EXECUTE;
 869                 goto restore_stdio;
 870         }
 871
 872         for (;;) {
 873                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 874                 if (r < 0) {
 875                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 876                         r = CONFIRM_EXECUTE;
 877                         goto restore_stdio;
 878                 }
 879
 880                 switch (c) {
 881                 case 'c':
 882                         printf("Resuming normal execution.\n");
 883                         manager_disable_confirm_spawn();
 884                         r = 1;
 885                         break;
 886                 case 'D':
 887                         unit_dump(u, stdout, "  ");
 888                         continue; /* ask again */
 889                 case 'f':
 890                         printf("Failing execution.\n");
 891                         r = CONFIRM_PRETEND_FAILURE;
 892                         break;
 893                 case 'h':
 894                         printf("  c - continue, proceed without asking anymore\n"
 895                                "  D - dump, show the state of the unit\n"
 896                                "  f - fail, don't execute the command and pretend it failed\n"
 897                                "  h - help\n"
 898                                "  i - info, show a short summary of the unit\n"
 899                                "  j - jobs, show jobs that are in progress\n"
 900                                "  s - skip, don't execute the command and pretend it succeeded\n"
 901                                "  y - yes, execute the command\n");
 902                         continue; /* ask again */
 903                 case 'i':
 904                         printf("  Description: %s\n"
 905                                "  Unit:        %s\n"
 906                                "  Command:     %s\n",
 907                                u->id, u->description, cmdline);
 908                         continue; /* ask again */
 909                 case 'j':
 910                         manager_dump_jobs(u->manager, stdout, "  ");
 911                         continue; /* ask again */
 912                 case 'n':
 913                         /* 'n' was removed in favor of 'f'. */
 914                         printf("Didn't understand 'n', did you mean 'f'?\n");
 915                         continue; /* ask again */
 916                 case 's':
 917                         printf("Skipping execution.\n");
 918                         r = CONFIRM_PRETEND_SUCCESS;
 919                         break;
 920                 case 'y':
 921                         r = CONFIRM_EXECUTE;
 922                         break;
 923                 default:
 924                         assert_not_reached("Unhandled choice");
 925                 }
 926                 break;
 927         }
 928
 929 restore_stdio:
 930         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 931         return r;
 932 }
 933
 934 static int get_fixed_user(const ExecContext *c, const char **user,
 935                           uid_t *uid, gid_t *gid,
 936                           const char **home, const char **shell) {
 937         int r;
 938         const char *name;
 939
 940         assert(c);
 941
 942         if (!c->user)
 943                 return 0;
 944
 945         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 946          * (i.e. are "/" or "/bin/nologin"). */
 947
 948         name = c->user;
 949         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 950         if (r < 0)
 951                 return r;
 952
 953         *user = name;
 954         return 0;
 955 }
 956
 957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 958         int r;
 959         const char *name;
 960
 961         assert(c);
 962
 963         if (!c->group)
 964                 return 0;
 965
 966         name = c->group;
 967         r = get_group_creds(&name, gid, 0);
 968         if (r < 0)
 969                 return r;
 970
 971         *group = name;
 972         return 0;
 973 }
 974
 975 static int get_supplementary_groups(const ExecContext *c, const char *user,
 976                                     const char *group, gid_t gid,
 977                                     gid_t **supplementary_gids, int *ngids) {
 978         char **i;
 979         int r, k = 0;
 980         int ngroups_max;
 981         bool keep_groups = false;
 982         gid_t *groups = NULL;
 983         _cleanup_free_ gid_t *l_gids = NULL;
 984
 985         assert(c);
 986
 987         /*
 988          * If user is given, then lookup GID and supplementary groups list.
 989          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 990          * here and as early as possible so we keep the list of supplementary
 991          * groups of the caller.
 992          */
 993         if (user && gid_is_valid(gid) && gid != 0) {
 994                 /* First step, initialize groups from /etc/groups */
 995                 if (initgroups(user, gid) < 0)
 996                         return -errno;
 997
 998                 keep_groups = true;
 999         }
1000
1001         if (strv_isempty(c->supplementary_groups))
1002                 return 0;
1003
1004         /*
1005          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006          * be positive, otherwise fail.
1007          */
1008         errno = 0;
1009         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010         if (ngroups_max <= 0)
1011                 return errno_or_else(EOPNOTSUPP);
1012
1013         l_gids = new(gid_t, ngroups_max);
1014         if (!l_gids)
1015                 return -ENOMEM;
1016
1017         if (keep_groups) {
1018                 /*
1019                  * Lookup the list of groups that the user belongs to, we
1020                  * avoid NSS lookups here too for gid=0.
1021                  */
1022                 k = ngroups_max;
1023                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024                         return -EINVAL;
1025         } else
1026                 k = 0;
1027
1028         STRV_FOREACH(i, c->supplementary_groups) {
1029                 const char *g;
1030
1031                 if (k >= ngroups_max)
1032                         return -E2BIG;
1033
1034                 g = *i;
1035                 r = get_group_creds(&g, l_gids+k, 0);
1036                 if (r < 0)
1037                         return r;
1038
1039                 k++;
1040         }
1041
1042         /*
1043          * Sets ngids to zero to drop all supplementary groups, happens
1044          * when we are under root and SupplementaryGroups= is empty.
1045          */
1046         if (k == 0) {
1047                 *ngids = 0;
1048                 return 0;
1049         }
1050
1051         /* Otherwise get the final list of supplementary groups */
1052         groups = memdup(l_gids, sizeof(gid_t) * k);
1053         if (!groups)
1054                 return -ENOMEM;
1055
1056         *supplementary_gids = groups;
1057         *ngids = k;
1058
1059         groups = NULL;
1060
1061         return 0;
1062 }
1063
/* Applies the group identity to the calling process.
 *
 * If ngids is positive the supplementary group list is installed via maybe_setgroups() first. Then,
 * if gid is valid, the real, effective and saved GID are all set to it. Returns 0 on success, the
 * negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1082
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written in that case),
 * 1 if they were changed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1096
1097 static int enforce_user(const ExecContext *context, uid_t uid) {
1098         assert(context);
1099         int r;
1100
1101         if (!uid_is_valid(uid))
1102                 return 0;
1103
1104         /* Sets (but doesn't look up) the uid and make sure we keep the
1105          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1106          * required, so we also need keep-caps in this case.
1107          */
1108
1109         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1110
1111                 /* First step: If we need to keep capabilities but
1112                  * drop privileges we need to make sure we keep our
1113                  * caps, while we drop privileges. */
1114                 if (uid != 0) {
1115                         /* Add KEEP_CAPS to the securebits */
1116                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1117                         if (r < 0)
1118                                 return r;
1119                 }
1120         }
1121
1122         /* Second step: actually set the uids */
1123         if (setresuid(uid, uid, uid) < 0)
1124                 return -errno;
1125
1126         /* At this point we should have all necessary capabilities but
1127            are otherwise a normal user. However, the caps might got
1128            corrupted due to the setresuid() so we need clean them up
1129            later. This is done outside of this call. */
1130
1131         return 0;
1132 }
1133
1134 #if HAVE_PAM
1135
1136 static int null_conv(
1137                 int num_msg,
1138                 const struct pam_message **msg,
1139                 struct pam_response **resp,
1140                 void *appdata_ptr) {
1141
1142         /* We don't support conversations */
1143
1144         return PAM_CONV_ERR;
1145 }
1146
1147 #endif
1148
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that tears the
 * session down again later.
 *
 *   name   - PAM service name, passed to pam_start()
 *   user   - user name to open the session for
 *   uid    - UID the helper process switches to
 *   gid    - GID the helper process switches to
 *   tty    - TTY to report to PAM, or NULL to try to derive it from stdin
 *   env    - environment block; exported into PAM beforehand and replaced by PAM's environment list
 *            on success
 *   fds    - file descriptors that are closed in the helper process
 *   n_fds  - number of entries in fds
 *
 * Returns the result of strv_free_and_replace() on success, a negative errno-style value on failure
 * (-EPERM for any PAM error, since PAM errors do not map to errno). If compiled without PAM support
 * this is a NOP that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure by returning a positive error number. It
                                 * never returns a negative value and does not set errno, hence check
                                 * the return value itself. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with a negative errno-style error in r */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1372
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of
 * 'path'. If that component is longer than 8 characters only its last 8 characters are used. If it is
 * empty, the process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        /* Layout: '(' + up to 8 chars + ')' + NUL, i.e. at most 11 bytes */
        buf[0] = '(';
        memcpy(&buf[1], base, n);
        buf[n + 1] = ')';
        buf[n + 2] = '\0';

        rename_process(buf);
}
1403
1404 static bool context_has_address_families(const ExecContext *c) {
1405         assert(c);
1406
1407         return c->address_families_allow_list ||
1408                 !set_isempty(c->address_families);
1409 }
1410
1411 static bool context_has_syscall_filters(const ExecContext *c) {
1412         assert(c);
1413
1414         return c->syscall_allow_list ||
1415                 !hashmap_isempty(c->syscall_filter);
1416 }
1417
1418 static bool context_has_syscall_logs(const ExecContext *c) {
1419         assert(c);
1420
1421         return c->syscall_log_allow_list ||
1422                 !hashmap_isempty(c->syscall_log);
1423 }
1424
1425 static bool context_has_no_new_privileges(const ExecContext *c) {
1426         assert(c);
1427
1428         if (c->no_new_privileges)
1429                 return true;
1430
1431         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1432                 return false;
1433
1434         /* We need NNP if we have any form of seccomp and are unprivileged */
1435         return c->lock_personality ||
1436                 c->memory_deny_write_execute ||
1437                 c->private_devices ||
1438                 c->protect_clock ||
1439                 c->protect_hostname ||
1440                 c->protect_kernel_tunables ||
1441                 c->protect_kernel_modules ||
1442                 c->protect_kernel_logs ||
1443                 context_has_address_families(c) ||
1444                 exec_context_restrict_namespaces_set(c) ||
1445                 c->restrict_realtime ||
1446                 c->restrict_suid_sgid ||
1447                 !set_isempty(c->syscall_archs) ||
1448                 context_has_syscall_filters(c) ||
1449                 context_has_syscall_logs(c);
1450 }
1451
1452 static bool exec_context_has_credentials(const ExecContext *context) {
1453
1454         assert(context);
1455
1456         return !hashmap_isempty(context->set_credentials) ||
1457                 context->load_credentials;
1458 }
1459
1460 #if HAVE_SECCOMP
1461
1462 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1463
1464         if (is_seccomp_available())
1465                 return false;
1466
1467         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1468         return true;
1469 }
1470
1471 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1472         uint32_t negative_action, default_action, action;
1473         int r;
1474
1475         assert(u);
1476         assert(c);
1477
1478         if (!context_has_syscall_filters(c))
1479                 return 0;
1480
1481         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1482                 return 0;
1483
1484         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1485
1486         if (c->syscall_allow_list) {
1487                 default_action = negative_action;
1488                 action = SCMP_ACT_ALLOW;
1489         } else {
1490                 default_action = SCMP_ACT_ALLOW;
1491                 action = negative_action;
1492         }
1493
1494         if (needs_ambient_hack) {
1495                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1496                 if (r < 0)
1497                         return r;
1498         }
1499
1500         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1501 }
1502
1503 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1504 #ifdef SCMP_ACT_LOG
1505         uint32_t default_action, action;
1506 #endif
1507
1508         assert(u);
1509         assert(c);
1510
1511         if (!context_has_syscall_logs(c))
1512                 return 0;
1513
1514 #ifdef SCMP_ACT_LOG
1515         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1516                 return 0;
1517
1518         if (c->syscall_log_allow_list) {
1519                 /* Log nothing but the ones listed */
1520                 default_action = SCMP_ACT_ALLOW;
1521                 action = SCMP_ACT_LOG;
1522         } else {
1523                 /* Log everything but the ones listed */
1524                 default_action = SCMP_ACT_LOG;
1525                 action = SCMP_ACT_ALLOW;
1526         }
1527
1528         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1529 #else
1530         /* old libseccomp */
1531         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1532         return 0;
1533 #endif
1534 }
1535
1536 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1537         assert(u);
1538         assert(c);
1539
1540         if (set_isempty(c->syscall_archs))
1541                 return 0;
1542
1543         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1544                 return 0;
1545
1546         return seccomp_restrict_archs(c->syscall_archs);
1547 }
1548
1549 static int apply_address_families(const Unit* u, const ExecContext *c) {
1550         assert(u);
1551         assert(c);
1552
1553         if (!context_has_address_families(c))
1554                 return 0;
1555
1556         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1557                 return 0;
1558
1559         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1560 }
1561
1562 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1563         assert(u);
1564         assert(c);
1565
1566         if (!c->memory_deny_write_execute)
1567                 return 0;
1568
1569         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1570                 return 0;
1571
1572         return seccomp_memory_deny_write_execute();
1573 }
1574
1575 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1576         assert(u);
1577         assert(c);
1578
1579         if (!c->restrict_realtime)
1580                 return 0;
1581
1582         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1583                 return 0;
1584
1585         return seccomp_restrict_realtime();
1586 }
1587
1588 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1589         assert(u);
1590         assert(c);
1591
1592         if (!c->restrict_suid_sgid)
1593                 return 0;
1594
1595         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1596                 return 0;
1597
1598         return seccomp_restrict_suid_sgid();
1599 }
1600
1601 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1602         assert(u);
1603         assert(c);
1604
1605         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1606          * let's protect even those systems where this is left on in the kernel. */
1607
1608         if (!c->protect_kernel_tunables)
1609                 return 0;
1610
1611         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1612                 return 0;
1613
1614         return seccomp_protect_sysctl();
1615 }
1616
1617 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1618         assert(u);
1619         assert(c);
1620
1621         /* Turn off module syscalls on ProtectKernelModules=yes */
1622
1623         if (!c->protect_kernel_modules)
1624                 return 0;
1625
1626         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1627                 return 0;
1628
1629         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1630 }
1631
1632 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1633         assert(u);
1634         assert(c);
1635
1636         if (!c->protect_kernel_logs)
1637                 return 0;
1638
1639         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1640                 return 0;
1641
1642         return seccomp_protect_syslog();
1643 }
1644
1645 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1646         assert(u);
1647         assert(c);
1648
1649         if (!c->protect_clock)
1650                 return 0;
1651
1652         if (skip_seccomp_unavailable(u, "ProtectClock="))
1653                 return 0;
1654
1655         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1656 }
1657
1658 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1659         assert(u);
1660         assert(c);
1661
1662         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1663
1664         if (!c->private_devices)
1665                 return 0;
1666
1667         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1668                 return 0;
1669
1670         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1671 }
1672
1673 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1674         assert(u);
1675         assert(c);
1676
1677         if (!exec_context_restrict_namespaces_set(c))
1678                 return 0;
1679
1680         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1681                 return 0;
1682
1683         return seccomp_restrict_namespaces(c->restrict_namespaces);
1684 }
1685
1686 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1687         unsigned long personality;
1688         int r;
1689
1690         assert(u);
1691         assert(c);
1692
1693         if (!c->lock_personality)
1694                 return 0;
1695
1696         if (skip_seccomp_unavailable(u, "LockPersonality="))
1697                 return 0;
1698
1699         personality = c->personality;
1700
1701         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1702         if (personality == PERSONALITY_INVALID) {
1703
1704                 r = opinionated_personality(&personality);
1705                 if (r < 0)
1706                         return r;
1707         }
1708
1709         return seccomp_lock_personality(personality);
1710 }
1711
1712 #endif
1713
1714 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1715         assert(u);
1716         assert(c);
1717
1718         if (!c->protect_hostname)
1719                 return 0;
1720
1721         if (ns_type_supported(NAMESPACE_UTS)) {
1722                 if (unshare(CLONE_NEWUTS) < 0) {
1723                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1724                                 *ret_exit_status = EXIT_NAMESPACE;
1725                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1726                         }
1727
1728                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1729                 }
1730         } else
1731                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1732
1733 #if HAVE_SECCOMP
1734         int r;
1735
1736         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1737                 return 0;
1738
1739         r = seccomp_protect_hostname();
1740         if (r < 0) {
1741                 *ret_exit_status = EXIT_SECCOMP;
1742                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1743         }
1744 #endif
1745
1746         return 0;
1747 }
1748
1749 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1750         assert(idle_pipe);
1751
1752         idle_pipe[1] = safe_close(idle_pipe[1]);
1753         idle_pipe[2] = safe_close(idle_pipe[2]);
1754
1755         if (idle_pipe[0] >= 0) {
1756                 int r;
1757
1758                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1759
1760                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1761                         ssize_t n;
1762
1763                         /* Signal systemd that we are bored and want to continue. */
1764                         n = write(idle_pipe[3], "x", 1);
1765                         if (n > 0)
1766                                 /* Wait for systemd to react to the signal above. */
1767                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1768                 }
1769
1770                 idle_pipe[0] = safe_close(idle_pipe[0]);
1771
1772         }
1773
1774         idle_pipe[3] = safe_close(idle_pipe[3]);
1775 }
1776
1777 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1778
1779 static int build_environment(
1780                 const Unit *u,
1781                 const ExecContext *c,
1782                 const ExecParameters *p,
1783                 size_t n_fds,
1784                 const char *home,
1785                 const char *username,
1786                 const char *shell,
1787                 dev_t journal_stream_dev,
1788                 ino_t journal_stream_ino,
1789                 char ***ret) {
1790
1791         _cleanup_strv_free_ char **our_env = NULL;
1792         size_t n_env = 0;
1793         char *x;
1794
1795         assert(u);
1796         assert(c);
1797         assert(p);
1798         assert(ret);
1799
1800 #define N_ENV_VARS 17
1801         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1802         if (!our_env)
1803                 return -ENOMEM;
1804
1805         if (n_fds > 0) {
1806                 _cleanup_free_ char *joined = NULL;
1807
1808                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1809                         return -ENOMEM;
1810                 our_env[n_env++] = x;
1811
1812                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1813                         return -ENOMEM;
1814                 our_env[n_env++] = x;
1815
1816                 joined = strv_join(p->fd_names, ":");
1817                 if (!joined)
1818                         return -ENOMEM;
1819
1820                 x = strjoin("LISTEN_FDNAMES=", joined);
1821                 if (!x)
1822                         return -ENOMEM;
1823                 our_env[n_env++] = x;
1824         }
1825
1826         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1827                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1828                         return -ENOMEM;
1829                 our_env[n_env++] = x;
1830
1831                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1832                         return -ENOMEM;
1833                 our_env[n_env++] = x;
1834         }
1835
1836         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1837          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1838          * check the database directly. */
1839         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1840                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1841                 if (!x)
1842                         return -ENOMEM;
1843                 our_env[n_env++] = x;
1844         }
1845
1846         if (home) {
1847                 x = strjoin("HOME=", home);
1848                 if (!x)
1849                         return -ENOMEM;
1850
1851                 path_simplify(x + 5);
1852                 our_env[n_env++] = x;
1853         }
1854
1855         if (username) {
1856                 x = strjoin("LOGNAME=", username);
1857                 if (!x)
1858                         return -ENOMEM;
1859                 our_env[n_env++] = x;
1860
1861                 x = strjoin("USER=", username);
1862                 if (!x)
1863                         return -ENOMEM;
1864                 our_env[n_env++] = x;
1865         }
1866
1867         if (shell) {
1868                 x = strjoin("SHELL=", shell);
1869                 if (!x)
1870                         return -ENOMEM;
1871
1872                 path_simplify(x + 6);
1873                 our_env[n_env++] = x;
1874         }
1875
1876         if (!sd_id128_is_null(u->invocation_id)) {
1877                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1878                         return -ENOMEM;
1879
1880                 our_env[n_env++] = x;
1881         }
1882
1883         if (exec_context_needs_term(c)) {
1884                 const char *tty_path, *term = NULL;
1885
1886                 tty_path = exec_context_tty_path(c);
1887
1888                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1889                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1890                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1891
1892                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1893                         term = getenv("TERM");
1894
1895                 if (!term)
1896                         term = default_term_for_tty(tty_path);
1897
1898                 x = strjoin("TERM=", term);
1899                 if (!x)
1900                         return -ENOMEM;
1901                 our_env[n_env++] = x;
1902         }
1903
1904         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1905                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1906                         return -ENOMEM;
1907
1908                 our_env[n_env++] = x;
1909         }
1910
1911         if (c->log_namespace) {
1912                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1913                 if (!x)
1914                         return -ENOMEM;
1915
1916                 our_env[n_env++] = x;
1917         }
1918
1919         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1920                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1921                 const char *n;
1922
1923                 if (!p->prefix[t])
1924                         continue;
1925
1926                 if (strv_isempty(c->directories[t].paths))
1927                         continue;
1928
1929                 n = exec_directory_env_name_to_string(t);
1930                 if (!n)
1931                         continue;
1932
1933                 pre = strjoin(p->prefix[t], "/");
1934                 if (!pre)
1935                         return -ENOMEM;
1936
1937                 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
1938                 if (!joined)
1939                         return -ENOMEM;
1940
1941                 x = strjoin(n, "=", joined);
1942                 if (!x)
1943                         return -ENOMEM;
1944
1945                 our_env[n_env++] = x;
1946         }
1947
1948         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1949                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1950                 if (!x)
1951                         return -ENOMEM;
1952
1953                 our_env[n_env++] = x;
1954         }
1955
1956         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1957                 return -ENOMEM;
1958
1959         our_env[n_env++] = x;
1960
1961         our_env[n_env++] = NULL;
1962         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1963 #undef N_ENV_VARS
1964
1965         *ret = TAKE_PTR(our_env);
1966
1967         return 0;
1968 }
1969
1970 static int build_pass_environment(const ExecContext *c, char ***ret) {
1971         _cleanup_strv_free_ char **pass_env = NULL;
1972         size_t n_env = 0;
1973         char **i;
1974
1975         STRV_FOREACH(i, c->pass_environment) {
1976                 _cleanup_free_ char *x = NULL;
1977                 char *v;
1978
1979                 v = getenv(*i);
1980                 if (!v)
1981                         continue;
1982                 x = strjoin(*i, "=", v);
1983                 if (!x)
1984                         return -ENOMEM;
1985
1986                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
1987                         return -ENOMEM;
1988
1989                 pass_env[n_env++] = TAKE_PTR(x);
1990                 pass_env[n_env] = NULL;
1991         }
1992
1993         *ret = TAKE_PTR(pass_env);
1994
1995         return 0;
1996 }
1997
1998 bool exec_needs_mount_namespace(
1999                 const ExecContext *context,
2000                 const ExecParameters *params,
2001                 const ExecRuntime *runtime) {
2002
2003         assert(context);
2004
2005         if (context->root_image)
2006                 return true;
2007
2008         if (!strv_isempty(context->read_write_paths) ||
2009             !strv_isempty(context->read_only_paths) ||
2010             !strv_isempty(context->inaccessible_paths) ||
2011             !strv_isempty(context->exec_paths) ||
2012             !strv_isempty(context->no_exec_paths))
2013                 return true;
2014
2015         if (context->n_bind_mounts > 0)
2016                 return true;
2017
2018         if (context->n_temporary_filesystems > 0)
2019                 return true;
2020
2021         if (context->n_mount_images > 0)
2022                 return true;
2023
2024         if (context->n_extension_images > 0)
2025                 return true;
2026
2027         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2028                 return true;
2029
2030         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2031                 return true;
2032
2033         if (context->private_devices ||
2034             context->private_mounts ||
2035             context->protect_system != PROTECT_SYSTEM_NO ||
2036             context->protect_home != PROTECT_HOME_NO ||
2037             context->protect_kernel_tunables ||
2038             context->protect_kernel_modules ||
2039             context->protect_kernel_logs ||
2040             context->protect_control_groups ||
2041             context->protect_proc != PROTECT_PROC_DEFAULT ||
2042             context->proc_subset != PROC_SUBSET_ALL ||
2043             context->private_ipc ||
2044             context->ipc_namespace_path)
2045                 return true;
2046
2047         if (context->root_directory) {
2048                 if (exec_context_get_effective_mount_apivfs(context))
2049                         return true;
2050
2051                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2052                         if (params && !params->prefix[t])
2053                                 continue;
2054
2055                         if (!strv_isempty(context->directories[t].paths))
2056                                 return true;
2057                 }
2058         }
2059
2060         if (context->dynamic_user &&
2061             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
2062              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2063              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2064                 return true;
2065
2066         if (context->log_namespace)
2067                 return true;
2068
2069         return false;
2070 }
2071
2072 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2073         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2074         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2075         _cleanup_close_ int unshare_ready_fd = -1;
2076         _cleanup_(sigkill_waitp) pid_t pid = 0;
2077         uint64_t c = 1;
2078         ssize_t n;
2079         int r;
2080
2081         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2082          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2083          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2084          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2085          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2086          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2087          * continues execution normally.
2088          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2089          * does not need CAP_SETUID to write the single line mapping to itself. */
2090
2091         /* Can only set up multiple mappings with CAP_SETUID. */
2092         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2093                 r = asprintf(&uid_map,
2094                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2095                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2096                              ouid, ouid, uid, uid);
2097         else
2098                 r = asprintf(&uid_map,
2099                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2100                              ouid, ouid);
2101
2102         if (r < 0)
2103                 return -ENOMEM;
2104
2105         /* Can only set up multiple mappings with CAP_SETGID. */
2106         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2107                 r = asprintf(&gid_map,
2108                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2109                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2110                              ogid, ogid, gid, gid);
2111         else
2112                 r = asprintf(&gid_map,
2113                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2114                              ogid, ogid);
2115
2116         if (r < 0)
2117                 return -ENOMEM;
2118
2119         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2120          * namespace. */
2121         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2122         if (unshare_ready_fd < 0)
2123                 return -errno;
2124
2125         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2126          * failed. */
2127         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2128                 return -errno;
2129
2130         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2131         if (r < 0)
2132                 return r;
2133         if (r == 0) {
2134                 _cleanup_close_ int fd = -1;
2135                 const char *a;
2136                 pid_t ppid;
2137
2138                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2139                  * here, after the parent opened its own user namespace. */
2140
2141                 ppid = getppid();
2142                 errno_pipe[0] = safe_close(errno_pipe[0]);
2143
2144                 /* Wait until the parent unshared the user namespace */
2145                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2146                         r = -errno;
2147                         goto child_fail;
2148                 }
2149
2150                 /* Disable the setgroups() system call in the child user namespace, for good. */
2151                 a = procfs_file_alloca(ppid, "setgroups");
2152                 fd = open(a, O_WRONLY|O_CLOEXEC);
2153                 if (fd < 0) {
2154                         if (errno != ENOENT) {
2155                                 r = -errno;
2156                                 goto child_fail;
2157                         }
2158
2159                         /* If the file is missing the kernel is too old, let's continue anyway. */
2160                 } else {
2161                         if (write(fd, "deny\n", 5) < 0) {
2162                                 r = -errno;
2163                                 goto child_fail;
2164                         }
2165
2166                         fd = safe_close(fd);
2167                 }
2168
2169                 /* First write the GID map */
2170                 a = procfs_file_alloca(ppid, "gid_map");
2171                 fd = open(a, O_WRONLY|O_CLOEXEC);
2172                 if (fd < 0) {
2173                         r = -errno;
2174                         goto child_fail;
2175                 }
2176                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2177                         r = -errno;
2178                         goto child_fail;
2179                 }
2180                 fd = safe_close(fd);
2181
2182                 /* The write the UID map */
2183                 a = procfs_file_alloca(ppid, "uid_map");
2184                 fd = open(a, O_WRONLY|O_CLOEXEC);
2185                 if (fd < 0) {
2186                         r = -errno;
2187                         goto child_fail;
2188                 }
2189                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2190                         r = -errno;
2191                         goto child_fail;
2192                 }
2193
2194                 _exit(EXIT_SUCCESS);
2195
2196         child_fail:
2197                 (void) write(errno_pipe[1], &r, sizeof(r));
2198                 _exit(EXIT_FAILURE);
2199         }
2200
2201         errno_pipe[1] = safe_close(errno_pipe[1]);
2202
2203         if (unshare(CLONE_NEWUSER) < 0)
2204                 return -errno;
2205
2206         /* Let the child know that the namespace is ready now */
2207         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2208                 return -errno;
2209
2210         /* Try to read an error code from the child */
2211         n = read(errno_pipe[0], &r, sizeof(r));
2212         if (n < 0)
2213                 return -errno;
2214         if (n == sizeof(r)) { /* an error code was sent to us */
2215                 if (r < 0)
2216                         return r;
2217                 return -EIO;
2218         }
2219         if (n != 0) /* on success we should have read 0 bytes */
2220                 return -EIO;
2221
2222         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2223         pid = 0;
2224         if (r < 0)
2225                 return r;
2226         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2227                 return -EIO;
2228
2229         return 0;
2230 }
2231
2232 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2233         if (!context->dynamic_user)
2234                 return false;
2235
2236         if (type == EXEC_DIRECTORY_CONFIGURATION)
2237                 return false;
2238
2239         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2240                 return false;
2241
2242         return true;
2243 }
2244
2245 static int setup_exec_directory(
2246                 const ExecContext *context,
2247                 const ExecParameters *params,
2248                 uid_t uid,
2249                 gid_t gid,
2250                 ExecDirectoryType type,
2251                 int *exit_status) {
2252
2253         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2254                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2255                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2256                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2257                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2258                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2259         };
2260         char **rt;
2261         int r;
2262
2263         assert(context);
2264         assert(params);
2265         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2266         assert(exit_status);
2267
2268         if (!params->prefix[type])
2269                 return 0;
2270
2271         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2272                 if (!uid_is_valid(uid))
2273                         uid = 0;
2274                 if (!gid_is_valid(gid))
2275                         gid = 0;
2276         }
2277
2278         STRV_FOREACH(rt, context->directories[type].paths) {
2279                 _cleanup_free_ char *p = NULL, *pp = NULL;
2280
2281                 p = path_join(params->prefix[type], *rt);
2282                 if (!p) {
2283                         r = -ENOMEM;
2284                         goto fail;
2285                 }
2286
2287                 r = mkdir_parents_label(p, 0755);
2288                 if (r < 0)
2289                         goto fail;
2290
2291                 if (exec_directory_is_private(context, type)) {
2292                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2293                          * case we want to avoid leaving a directory around fully accessible that is owned by
2294                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2295                          * trick used by container managers to prohibit host users to get access to files of
2296                          * the same UID in containers: we place everything inside a directory that has an
2297                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2298                          * for unprivileged host code. We then use fs namespacing to make this directory
2299                          * permeable for the service itself.
2300                          *
2301                          * Specifically: for a service which wants a special directory "foo/" we first create
2302                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2303                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2304                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2305                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2306                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2307                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2308                          * for the service and making sure it only gets access to the dirs it needs but no
2309                          * others. Tricky? Yes, absolutely, but it works!
2310                          *
2311                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2312                          * to be owned by the service itself.
2313                          *
2314                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2315                          * for sharing files or sockets with other services. */
2316
2317                         pp = path_join(params->prefix[type], "private");
2318                         if (!pp) {
2319                                 r = -ENOMEM;
2320                                 goto fail;
2321                         }
2322
2323                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2324                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2325                         if (r < 0)
2326                                 goto fail;
2327
2328                         if (!path_extend(&pp, *rt)) {
2329                                 r = -ENOMEM;
2330                                 goto fail;
2331                         }
2332
2333                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2334                         r = mkdir_parents_label(pp, 0755);
2335                         if (r < 0)
2336                                 goto fail;
2337
2338                         if (is_dir(p, false) > 0 &&
2339                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2340
2341                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2342                                  * it over. Most likely the service has been upgraded from one that didn't use
2343                                  * DynamicUser=1, to one that does. */
2344
2345                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2346                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2347                                          exec_directory_type_to_string(type), p, pp);
2348
2349                                 if (rename(p, pp) < 0) {
2350                                         r = -errno;
2351                                         goto fail;
2352                                 }
2353                         } else {
2354                                 /* Otherwise, create the actual directory for the service */
2355
2356                                 r = mkdir_label(pp, context->directories[type].mode);
2357                                 if (r < 0 && r != -EEXIST)
2358                                         goto fail;
2359                         }
2360
2361                         /* And link it up from the original place */
2362                         r = symlink_idempotent(pp, p, true);
2363                         if (r < 0)
2364                                 goto fail;
2365
2366                 } else {
2367                         _cleanup_free_ char *target = NULL;
2368
2369                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2370                             readlink_and_make_absolute(p, &target) >= 0) {
2371                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2372
2373                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2374                                  * by DynamicUser=1 (see above)?
2375                                  *
2376                                  * We do this for all directory types except for ConfigurationDirectory=,
2377                                  * since they all support the private/ symlink logic at least in some
2378                                  * configurations, see above. */
2379
2380                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2381                                 if (r < 0)
2382                                         goto fail;
2383
2384                                 q = path_join(params->prefix[type], "private", *rt);
2385                                 if (!q) {
2386                                         r = -ENOMEM;
2387                                         goto fail;
2388                                 }
2389
2390                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2391                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2392                                 if (r < 0)
2393                                         goto fail;
2394
2395                                 if (path_equal(q_resolved, target_resolved)) {
2396
2397                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2398                                          * but is no longer. Let's move the directory back up. */
2399
2400                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2401                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2402                                                  exec_directory_type_to_string(type), q, p);
2403
2404                                         if (unlink(p) < 0) {
2405                                                 r = -errno;
2406                                                 goto fail;
2407                                         }
2408
2409                                         if (rename(q, p) < 0) {
2410                                                 r = -errno;
2411                                                 goto fail;
2412                                         }
2413                                 }
2414                         }
2415
2416                         r = mkdir_label(p, context->directories[type].mode);
2417                         if (r < 0) {
2418                                 if (r != -EEXIST)
2419                                         goto fail;
2420
2421                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2422                                         struct stat st;
2423
2424                                         /* Don't change the owner/access mode of the configuration directory,
2425                                          * as in the common case it is not written to by a service, and shall
2426                                          * not be writable. */
2427
2428                                         if (stat(p, &st) < 0) {
2429                                                 r = -errno;
2430                                                 goto fail;
2431                                         }
2432
2433                                         /* Still complain if the access mode doesn't match */
2434                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2435                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2436                                                             "(File system: %o %sMode: %o)",
2437                                                             exec_directory_type_to_string(type), *rt,
2438                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2439
2440                                         continue;
2441                                 }
2442                         }
2443                 }
2444
2445                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2446                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2447                  * current UID/GID ownership.) */
2448                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2449                 if (r < 0)
2450                         goto fail;
2451
2452                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2453                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2454                  * assignments to exist. */
2455                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2456                 if (r < 0)
2457                         goto fail;
2458         }
2459
2460         return 0;
2461
2462 fail:
2463         *exit_status = exit_status_table[type];
2464         return r;
2465 }
2466
2467 static int write_credential(
2468                 int dfd,
2469                 const char *id,
2470                 const void *data,
2471                 size_t size,
2472                 uid_t uid,
2473                 bool ownership_ok) {
2474
2475         _cleanup_(unlink_and_freep) char *tmp = NULL;
2476         _cleanup_close_ int fd = -1;
2477         int r;
2478
2479         r = tempfn_random_child("", "cred", &tmp);
2480         if (r < 0)
2481                 return r;
2482
2483         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2484         if (fd < 0) {
2485                 tmp = mfree(tmp);
2486                 return -errno;
2487         }
2488
2489         r = loop_write(fd, data, size, /* do_pool = */ false);
2490         if (r < 0)
2491                 return r;
2492
2493         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2494                 return -errno;
2495
2496         if (uid_is_valid(uid) && uid != getuid()) {
2497                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2498                 if (r < 0) {
2499                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2500                                 return r;
2501
2502                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2503                                             * to express: that the user gets read access and nothing
2504                                             * else. But if the backing fs can't support that (e.g. ramfs)
2505                                             * then we can use file ownership instead. But that's only safe if
2506                                             * we can then re-mount the whole thing read-only, so that the
2507                                             * user can no longer chmod() the file to gain write access. */
2508                                 return r;
2509
2510                         if (fchown(fd, uid, GID_INVALID) < 0)
2511                                 return -errno;
2512                 }
2513         }
2514
2515         if (renameat(dfd, tmp, dfd, id) < 0)
2516                 return -errno;
2517
2518         tmp = mfree(tmp);
2519         return 0;
2520 }
2521
2522 #define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */
2523
2524 static int acquire_credentials(
2525                 const ExecContext *context,
2526                 const ExecParameters *params,
2527                 const char *unit,
2528                 const char *p,
2529                 uid_t uid,
2530                 bool ownership_ok) {
2531
2532         uint64_t left = CREDENTIALS_BYTES_MAX;
2533         _cleanup_close_ int dfd = -1;
2534         ExecSetCredential *sc;
2535         char **id, **fn;
2536         int r;
2537
2538         assert(context);
2539         assert(p);
2540
2541         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2542         if (dfd < 0)
2543                 return -errno;
2544
2545         /* First we use the literally specified credentials. Note that they might be overridden again below,
2546          * and thus act as a "default" if the same credential is specified multiple times */
2547         HASHMAP_FOREACH(sc, context->set_credentials) {
2548                 size_t add;
2549
2550                 add = strlen(sc->id) + sc->size;
2551                 if (add > left)
2552                         return -E2BIG;
2553
2554                 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2555                 if (r < 0)
2556                         return r;
2557
2558                 left -= add;
2559         }
2560
2561         /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2562         STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2563                 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2564                 _cleanup_(erase_and_freep) char *data = NULL;
2565                 _cleanup_free_ char *j = NULL, *bindname = NULL;
2566                 bool missing_ok = true;
2567                 const char *source;
2568                 size_t size, add;
2569
2570                 if (path_is_absolute(*fn)) {
2571                         /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2572                         source = *fn;
2573                         flags |= READ_FULL_FILE_CONNECT_SOCKET;
2574
2575                         /* Pass some minimal info about the unit and the credential name we are looking to acquire
2576                          * via the source socket address in case we read off an AF_UNIX socket. */
2577                         if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2578                                 return -ENOMEM;
2579
2580                         missing_ok = false;
2581
2582                 } else if (params->received_credentials) {
2583                         /* If this is a relative path, take it relative to the credentials we received
2584                          * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2585                          * on a credential store, i.e. this is guaranteed to be regular files. */
2586                         j = path_join(params->received_credentials, *fn);
2587                         if (!j)
2588                                 return -ENOMEM;
2589
2590                         source = j;
2591                 } else
2592                         source = NULL;
2593
2594                 if (source)
2595                         r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
2596                 else
2597                         r = -ENOENT;
2598                 if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) {
2599                         /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2600                          * will get clear errors if we don't pass such a missing credential on as they
2601                          * themselves will get ENOENT when trying to read them, which should not be much
2602                          * worse than when we handle the error here and make it fatal.
2603                          *
2604                          * Also, if the source file doesn't exist, but we already acquired the key otherwise,
2605                          * then don't fail either. */
2606                         log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn);
2607                         continue;
2608                 }
2609                 if (r < 0)
2610                         return log_debug_errno(r, "Failed to read credential '%s': %m", *fn);
2611
2612                 add = strlen(*id) + size;
2613                 if (add > left)
2614                         return -E2BIG;
2615
2616                 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2617                 if (r < 0)
2618                         return r;
2619
2620                 left -= add;
2621         }
2622
2623         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2624                 return -errno;
2625
2626         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2627          * accessible */
2628
2629         if (uid_is_valid(uid) && uid != getuid()) {
2630                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2631                 if (r < 0) {
2632                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2633                                 return r;
2634
2635                         if (!ownership_ok)
2636                                 return r;
2637
2638                         if (fchown(dfd, uid, GID_INVALID) < 0)
2639                                 return -errno;
2640                 }
2641         }
2642
2643         return 0;
2644 }
2645
2646 static int setup_credentials_internal(
2647                 const ExecContext *context,
2648                 const ExecParameters *params,
2649                 const char *unit,
2650                 const char *final,        /* This is where the credential store shall eventually end up at */
2651                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2652                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2653                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2654                 uid_t uid) {
2655
2656         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2657                                    * if we mounted something; false if we definitely can't mount anything */
2658         bool final_mounted;
2659         const char *where;
2660
2661         assert(context);
2662         assert(final);
2663         assert(workspace);
2664
2665         if (reuse_workspace) {
2666                 r = path_is_mount_point(workspace, NULL, 0);
2667                 if (r < 0)
2668                         return r;
2669                 if (r > 0)
2670                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2671                 else
2672                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2673         } else
2674                 workspace_mounted = -1; /* ditto */
2675
2676         r = path_is_mount_point(final, NULL, 0);
2677         if (r < 0)
2678                 return r;
2679         if (r > 0) {
2680                 /* If the final place already has something mounted, we use that. If the workspace also has
2681                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
2682                  * different). */
2683                 final_mounted = true;
2684
2685                 if (workspace_mounted < 0) {
2686                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2687                          * the final version to the workspace, and make it writable, so that we can make
2688                          * changes */
2689
2690                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2691                         if (r < 0)
2692                                 return r;
2693
2694                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2695                         if (r < 0)
2696                                 return r;
2697
2698                         workspace_mounted = true;
2699                 }
2700         } else
2701                 final_mounted = false;
2702
2703         if (workspace_mounted < 0) {
2704                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2705                 for (int try = 0;; try++) {
2706
2707                         if (try == 0) {
2708                                 /* Try "ramfs" first, since it's not swap backed */
2709                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2710                                 if (r >= 0) {
2711                                         workspace_mounted = true;
2712                                         break;
2713                                 }
2714
2715                         } else if (try == 1) {
2716                                 _cleanup_free_ char *opts = NULL;
2717
2718                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2719                                         return -ENOMEM;
2720
2721                                 /* Fall back to "tmpfs" otherwise */
2722                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2723                                 if (r >= 0) {
2724                                         workspace_mounted = true;
2725                                         break;
2726                                 }
2727
2728                         } else {
2729                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2730                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2731                                 if (r < 0) {
2732                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2733                                                 return r;
2734
2735                                         if (must_mount) /* If we it's not OK to use the plain directory
2736                                                          * fallback, propagate all errors too */
2737                                                 return r;
2738
2739                                         /* If we lack privileges to bind mount stuff, then let's gracefully
2740                                          * proceed for compat with container envs, and just use the final dir
2741                                          * as is. */
2742
2743                                         workspace_mounted = false;
2744                                         break;
2745                                 }
2746
2747                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2748                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2749                                 if (r < 0)
2750                                         return r;
2751
2752                                 workspace_mounted = true;
2753                                 break;
2754                         }
2755                 }
2756         }
2757
2758         assert(!must_mount || workspace_mounted > 0);
2759         where = workspace_mounted ? workspace : final;
2760
2761         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2762         if (r < 0)
2763                 return r;
2764
2765         if (workspace_mounted) {
2766                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2767                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2768                 if (r < 0)
2769                         return r;
2770
2771                 /* And mount it to the final place, read-only */
2772                 if (final_mounted)
2773                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2774                 else
2775                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2776                 if (r < 0)
2777                         return r;
2778         } else {
2779                 _cleanup_free_ char *parent = NULL;
2780
2781                 /* If we do not have our own mount put used the plain directory fallback, then we need to
2782                  * open access to the top-level credential directory and the per-service directory now */
2783
2784                 parent = dirname_malloc(final);
2785                 if (!parent)
2786                         return -ENOMEM;
2787                 if (chmod(parent, 0755) < 0)
2788                         return -errno;
2789         }
2790
2791         return 0;
2792 }
2793
2794 static int setup_credentials(
2795                 const ExecContext *context,
2796                 const ExecParameters *params,
2797                 const char *unit,
2798                 uid_t uid) {
2799
2800         _cleanup_free_ char *p = NULL, *q = NULL;
2801         const char *i;
2802         int r;
2803
2804         assert(context);
2805         assert(params);
2806
2807         if (!exec_context_has_credentials(context))
2808                 return 0;
2809
2810         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2811                 return -EINVAL;
2812
2813         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2814          * and the subdir we mount over with a read-only file system readable by the service's user */
2815         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2816         if (!q)
2817                 return -ENOMEM;
2818
2819         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2820         if (r < 0 && r != -EEXIST)
2821                 return r;
2822
2823         p = path_join(q, unit);
2824         if (!p)
2825                 return -ENOMEM;
2826
2827         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2828         if (r < 0 && r != -EEXIST)
2829                 return r;
2830
2831         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2832         if (r < 0) {
2833                 _cleanup_free_ char *t = NULL, *u = NULL;
2834
2835                 /* If this is not a privilege or support issue then propagate the error */
2836                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2837                         return r;
2838
2839                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2840                  * it into place, so that users can't access half-initialized credential stores. */
2841                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2842                 if (!t)
2843                         return -ENOMEM;
2844
2845                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2846                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2847                  * after it is fully set up */
2848                 u = path_join(t, unit);
2849                 if (!u)
2850                         return -ENOMEM;
2851
2852                 FOREACH_STRING(i, t, u) {
2853                         r = mkdir_label(i, 0700);
2854                         if (r < 0 && r != -EEXIST)
2855                                 return r;
2856                 }
2857
2858                 r = setup_credentials_internal(
2859                                 context,
2860                                 params,
2861                                 unit,
2862                                 p,       /* final mount point */
2863                                 u,       /* temporary workspace to overmount */
2864                                 true,    /* reuse the workspace if it is already a mount */
2865                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
2866                                 uid);
2867
2868                 (void) rmdir(u); /* remove the workspace again if we can. */
2869
2870                 if (r < 0)
2871                         return r;
2872
2873         } else if (r == 0) {
2874
2875                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2876                  * we can use the same directory for all cases, after turning off propagation. Question
2877                  * though is: where do we turn off propagation exactly, and where do we place the workspace
2878                  * directory? We need some place that is guaranteed to be a mount point in the host, and
2879                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2880                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
2881                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2882                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2883                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2884                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2885                  * propagation on the former, and then overmount the latter.
2886                  *
2887                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2888                  * for this purpose, but there are few other candidates that work equally well for us, and
2889                  * given that the we do this in a privately namespaced short-lived single-threaded process
2890                  * that no one else sees this should be OK to do. */
2891
2892                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2893                 if (r < 0)
2894                         goto child_fail;
2895
2896                 r = setup_credentials_internal(
2897                                 context,
2898                                 params,
2899                                 unit,
2900                                 p,           /* final mount point */
2901                                 "/dev/shm",  /* temporary workspace to overmount */
2902                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2903                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
2904                                 uid);
2905                 if (r < 0)
2906                         goto child_fail;
2907
2908                 _exit(EXIT_SUCCESS);
2909
2910         child_fail:
2911                 _exit(EXIT_FAILURE);
2912         }
2913
2914         return 0;
2915 }
2916
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed.
 *
 * An explicitly configured process label takes precedence. Otherwise, and only if the build defines
 * SMACK_DEFAULT_PROCESS_LABEL, the exec label read from 'executable_fd' is applied, or the build-time
 * default if no label could be read.
 *
 * Returns 0 on success, a negative errno-style error code on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label = NULL;

                /* A missing attribute or lack of support is fine here, the default label is used then */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2948
2949 static int compile_bind_mounts(
2950                 const ExecContext *context,
2951                 const ExecParameters *params,
2952                 BindMount **ret_bind_mounts,
2953                 size_t *ret_n_bind_mounts,
2954                 char ***ret_empty_directories) {
2955
2956         _cleanup_strv_free_ char **empty_directories = NULL;
2957         BindMount *bind_mounts;
2958         size_t n, h = 0;
2959         int r;
2960
2961         assert(context);
2962         assert(params);
2963         assert(ret_bind_mounts);
2964         assert(ret_n_bind_mounts);
2965         assert(ret_empty_directories);
2966
2967         n = context->n_bind_mounts;
2968         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2969                 if (!params->prefix[t])
2970                         continue;
2971
2972                 n += strv_length(context->directories[t].paths);
2973         }
2974
2975         if (n <= 0) {
2976                 *ret_bind_mounts = NULL;
2977                 *ret_n_bind_mounts = 0;
2978                 *ret_empty_directories = NULL;
2979                 return 0;
2980         }
2981
2982         bind_mounts = new(BindMount, n);
2983         if (!bind_mounts)
2984                 return -ENOMEM;
2985
2986         for (size_t i = 0; i < context->n_bind_mounts; i++) {
2987                 BindMount *item = context->bind_mounts + i;
2988                 char *s, *d;
2989
2990                 s = strdup(item->source);
2991                 if (!s) {
2992                         r = -ENOMEM;
2993                         goto finish;
2994                 }
2995
2996                 d = strdup(item->destination);
2997                 if (!d) {
2998                         free(s);
2999                         r = -ENOMEM;
3000                         goto finish;
3001                 }
3002
3003                 bind_mounts[h++] = (BindMount) {
3004                         .source = s,
3005                         .destination = d,
3006                         .read_only = item->read_only,
3007                         .recursive = item->recursive,
3008                         .ignore_enoent = item->ignore_enoent,
3009                 };
3010         }
3011
3012         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3013                 char **suffix;
3014
3015                 if (!params->prefix[t])
3016                         continue;
3017
3018                 if (strv_isempty(context->directories[t].paths))
3019                         continue;
3020
3021                 if (exec_directory_is_private(context, t) &&
3022                     !exec_context_with_rootfs(context)) {
3023                         char *private_root;
3024
3025                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3026                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3027                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3028
3029                         private_root = path_join(params->prefix[t], "private");
3030                         if (!private_root) {
3031                                 r = -ENOMEM;
3032                                 goto finish;
3033                         }
3034
3035                         r = strv_consume(&empty_directories, private_root);
3036                         if (r < 0)
3037                                 goto finish;
3038                 }
3039
3040                 STRV_FOREACH(suffix, context->directories[t].paths) {
3041                         char *s, *d;
3042
3043                         if (exec_directory_is_private(context, t))
3044                                 s = path_join(params->prefix[t], "private", *suffix);
3045                         else
3046                                 s = path_join(params->prefix[t], *suffix);
3047                         if (!s) {
3048                                 r = -ENOMEM;
3049                                 goto finish;
3050                         }
3051
3052                         if (exec_directory_is_private(context, t) &&
3053                             exec_context_with_rootfs(context))
3054                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3055                                  * directory is not created on the root directory. So, let's bind-mount the directory
3056                                  * on the 'non-private' place. */
3057                                 d = path_join(params->prefix[t], *suffix);
3058                         else
3059                                 d = strdup(s);
3060                         if (!d) {
3061                                 free(s);
3062                                 r = -ENOMEM;
3063                                 goto finish;
3064                         }
3065
3066                         bind_mounts[h++] = (BindMount) {
3067                                 .source = s,
3068                                 .destination = d,
3069                                 .read_only = false,
3070                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3071                                 .recursive = true,
3072                                 .ignore_enoent = false,
3073                         };
3074                 }
3075         }
3076
3077         assert(h == n);
3078
3079         *ret_bind_mounts = bind_mounts;
3080         *ret_n_bind_mounts = n;
3081         *ret_empty_directories = TAKE_PTR(empty_directories);
3082
3083         return (int) n;
3084
3085 finish:
3086         bind_mount_free_many(bind_mounts, h);
3087         return r;
3088 }
3089
3090 static bool insist_on_sandboxing(
3091                 const ExecContext *context,
3092                 const char *root_dir,
3093                 const char *root_image,
3094                 const BindMount *bind_mounts,
3095                 size_t n_bind_mounts) {
3096
3097         assert(context);
3098         assert(n_bind_mounts == 0 || bind_mounts);
3099
3100         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3101          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3102          * rearrange stuff in a way we cannot ignore gracefully. */
3103
3104         if (context->n_temporary_filesystems > 0)
3105                 return true;
3106
3107         if (root_dir || root_image)
3108                 return true;
3109
3110         if (context->n_mount_images > 0)
3111                 return true;
3112
3113         if (context->dynamic_user)
3114                 return true;
3115
3116         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3117          * essential. */
3118         for (size_t i = 0; i < n_bind_mounts; i++)
3119                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3120                         return true;
3121
3122         if (context->log_namespace)
3123                 return true;
3124
3125         return false;
3126 }
3127
3128 static int apply_mount_namespace(
3129                 const Unit *u,
3130                 ExecCommandFlags command_flags,
3131                 const ExecContext *context,
3132                 const ExecParameters *params,
3133                 const ExecRuntime *runtime,
3134                 char **error_path) {
3135
3136         _cleanup_strv_free_ char **empty_directories = NULL;
3137         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3138         const char *root_dir = NULL, *root_image = NULL;
3139         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3140         NamespaceInfo ns_info;
3141         bool needs_sandboxing;
3142         BindMount *bind_mounts = NULL;
3143         size_t n_bind_mounts = 0;
3144         int r;
3145
3146         assert(context);
3147
3148         if (params->flags & EXEC_APPLY_CHROOT) {
3149                 root_image = context->root_image;
3150
3151                 if (!root_image)
3152                         root_dir = context->root_directory;
3153         }
3154
3155         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3156         if (r < 0)
3157                 return r;
3158
3159         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3160         if (needs_sandboxing) {
3161                 /* The runtime struct only contains the parent of the private /tmp,
3162                  * which is non-accessible to world users. Inside of it there's a /tmp
3163                  * that is sticky, and that's the one we want to use here.
3164                  * This does not apply when we are using /run/systemd/empty as fallback. */
3165
3166                 if (context->private_tmp && runtime) {
3167                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3168                                 tmp_dir = runtime->tmp_dir;
3169                         else if (runtime->tmp_dir)
3170                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3171
3172                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3173                                 var_tmp_dir = runtime->var_tmp_dir;
3174                         else if (runtime->var_tmp_dir)
3175                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3176                 }
3177
3178                 ns_info = (NamespaceInfo) {
3179                         .ignore_protect_paths = false,
3180                         .private_dev = context->private_devices,
3181                         .protect_control_groups = context->protect_control_groups,
3182                         .protect_kernel_tunables = context->protect_kernel_tunables,
3183                         .protect_kernel_modules = context->protect_kernel_modules,
3184                         .protect_kernel_logs = context->protect_kernel_logs,
3185                         .protect_hostname = context->protect_hostname,
3186                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3187                         .private_mounts = context->private_mounts,
3188                         .protect_home = context->protect_home,
3189                         .protect_system = context->protect_system,
3190                         .protect_proc = context->protect_proc,
3191                         .proc_subset = context->proc_subset,
3192                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3193                 };
3194         } else if (!context->dynamic_user && root_dir)
3195                 /*
3196                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3197                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3198                  * fail if we are unable to apply the sandbox inside the mount namespace.
3199                  */
3200                 ns_info = (NamespaceInfo) {
3201                         .ignore_protect_paths = true,
3202                 };
3203         else
3204                 ns_info = (NamespaceInfo) {};
3205
3206         if (context->mount_flags == MS_SHARED)
3207                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3208
3209         if (exec_context_has_credentials(context) &&
3210             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3211             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3212                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3213                 if (!creds_path) {
3214                         r = -ENOMEM;
3215                         goto finalize;
3216                 }
3217         }
3218
3219         if (MANAGER_IS_SYSTEM(u->manager)) {
3220                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3221                 if (!propagate_dir) {
3222                         r = -ENOMEM;
3223                         goto finalize;
3224                 }
3225
3226                 incoming_dir = strdup("/run/systemd/incoming");
3227                 if (!incoming_dir) {
3228                         r = -ENOMEM;
3229                         goto finalize;
3230                 }
3231         }
3232
3233         r = setup_namespace(root_dir, root_image, context->root_image_options,
3234                             &ns_info, context->read_write_paths,
3235                             needs_sandboxing ? context->read_only_paths : NULL,
3236                             needs_sandboxing ? context->inaccessible_paths : NULL,
3237                             needs_sandboxing ? context->exec_paths : NULL,
3238                             needs_sandboxing ? context->no_exec_paths : NULL,
3239                             empty_directories,
3240                             bind_mounts,
3241                             n_bind_mounts,
3242                             context->temporary_filesystems,
3243                             context->n_temporary_filesystems,
3244                             context->mount_images,
3245                             context->n_mount_images,
3246                             tmp_dir,
3247                             var_tmp_dir,
3248                             creds_path,
3249                             context->log_namespace,
3250                             context->mount_flags,
3251                             context->root_hash, context->root_hash_size, context->root_hash_path,
3252                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3253                             context->root_verity,
3254                             context->extension_images,
3255                             context->n_extension_images,
3256                             propagate_dir,
3257                             incoming_dir,
3258                             root_dir || root_image ? params->notify_socket : NULL,
3259                             error_path);
3260
3261         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3262          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3263          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3264          * completely different execution environment. */
3265         if (r == -ENOANO) {
3266                 if (insist_on_sandboxing(
3267                                     context,
3268                                     root_dir, root_image,
3269                                     bind_mounts,
3270                                     n_bind_mounts)) {
3271                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3272                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3273                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3274
3275                         r = -EOPNOTSUPP;
3276                 } else {
3277                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3278                         r = 0;
3279                 }
3280         }
3281
3282 finalize:
3283         bind_mount_free_many(bind_mounts, n_bind_mounts);
3284         return r;
3285 }
3286
3287 static int apply_working_directory(
3288                 const ExecContext *context,
3289                 const ExecParameters *params,
3290                 const char *home,
3291                 int *exit_status) {
3292
3293         const char *d, *wd;
3294
3295         assert(context);
3296         assert(exit_status);
3297
3298         if (context->working_directory_home) {
3299
3300                 if (!home) {
3301                         *exit_status = EXIT_CHDIR;
3302                         return -ENXIO;
3303                 }
3304
3305                 wd = home;
3306
3307         } else
3308                 wd = empty_to_root(context->working_directory);
3309
3310         if (params->flags & EXEC_APPLY_CHROOT)
3311                 d = wd;
3312         else
3313                 d = prefix_roota(context->root_directory, wd);
3314
3315         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3316                 *exit_status = EXIT_CHDIR;
3317                 return -errno;
3318         }
3319
3320         return 0;
3321 }
3322
3323 static int apply_root_directory(
3324                 const ExecContext *context,
3325                 const ExecParameters *params,
3326                 const bool needs_mount_ns,
3327                 int *exit_status) {
3328
3329         assert(context);
3330         assert(exit_status);
3331
3332         if (params->flags & EXEC_APPLY_CHROOT)
3333                 if (!needs_mount_ns && context->root_directory)
3334                         if (chroot(context->root_directory) < 0) {
3335                                 *exit_status = EXIT_CHROOT;
3336                                 return -errno;
3337                         }
3338
3339         return 0;
3340 }
3341
3342 static int setup_keyring(
3343                 const Unit *u,
3344                 const ExecContext *context,
3345                 const ExecParameters *p,
3346                 uid_t uid, gid_t gid) {
3347
3348         key_serial_t keyring;
3349         int r = 0;
3350         uid_t saved_uid;
3351         gid_t saved_gid;
3352
3353         assert(u);
3354         assert(context);
3355         assert(p);
3356
3357         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3358          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3359          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3360          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3361          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3362          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3363
3364         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3365                 return 0;
3366
3367         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3368          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3369          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3370          * & group is just as nasty as acquiring a reference to the user keyring. */
3371
3372         saved_uid = getuid();
3373         saved_gid = getgid();
3374
3375         if (gid_is_valid(gid) && gid != saved_gid) {
3376                 if (setregid(gid, -1) < 0)
3377                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3378         }
3379
3380         if (uid_is_valid(uid) && uid != saved_uid) {
3381                 if (setreuid(uid, -1) < 0) {
3382                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3383                         goto out;
3384                 }
3385         }
3386
3387         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3388         if (keyring == -1) {
3389                 if (errno == ENOSYS)
3390                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3391                 else if (ERRNO_IS_PRIVILEGE(errno))
3392                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3393                 else if (errno == EDQUOT)
3394                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3395                 else
3396                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3397
3398                 goto out;
3399         }
3400
3401         /* When requested link the user keyring into the session keyring. */
3402         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3403
3404                 if (keyctl(KEYCTL_LINK,
3405                            KEY_SPEC_USER_KEYRING,
3406                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3407                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3408                         goto out;
3409                 }
3410         }
3411
3412         /* Restore uid/gid back */
3413         if (uid_is_valid(uid) && uid != saved_uid) {
3414                 if (setreuid(saved_uid, -1) < 0) {
3415                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3416                         goto out;
3417                 }
3418         }
3419
3420         if (gid_is_valid(gid) && gid != saved_gid) {
3421                 if (setregid(saved_gid, -1) < 0)
3422                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3423         }
3424
3425         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3426         if (!sd_id128_is_null(u->invocation_id)) {
3427                 key_serial_t key;
3428
3429                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3430                 if (key == -1)
3431                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3432                 else {
3433                         if (keyctl(KEYCTL_SETPERM, key,
3434                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3435                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3436                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3437                 }
3438         }
3439
3440 out:
3441         /* Revert back uid & gid for the last time, and exit */
3442         /* no extra logging, as only the first already reported error matters */
3443         if (getuid() != saved_uid)
3444                 (void) setreuid(saved_uid, -1);
3445
3446         if (getgid() != saved_gid)
3447                 (void) setregid(saved_gid, -1);
3448
3449         return r;
3450 }
3451
/* Appends each non-negative entry of 'pair' to 'array', starting at index *n, and advances *n by the number
 * of entries appended. The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3462
3463 static int close_remaining_fds(
3464                 const ExecParameters *params,
3465                 const ExecRuntime *runtime,
3466                 const DynamicCreds *dcreds,
3467                 int user_lookup_fd,
3468                 int socket_fd,
3469                 const int *fds, size_t n_fds) {
3470
3471         size_t n_dont_close = 0;
3472         int dont_close[n_fds + 12];
3473
3474         assert(params);
3475
3476         if (params->stdin_fd >= 0)
3477                 dont_close[n_dont_close++] = params->stdin_fd;
3478         if (params->stdout_fd >= 0)
3479                 dont_close[n_dont_close++] = params->stdout_fd;
3480         if (params->stderr_fd >= 0)
3481                 dont_close[n_dont_close++] = params->stderr_fd;
3482
3483         if (socket_fd >= 0)
3484                 dont_close[n_dont_close++] = socket_fd;
3485         if (n_fds > 0) {
3486                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3487                 n_dont_close += n_fds;
3488         }
3489
3490         if (runtime) {
3491                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3492                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3493         }
3494
3495         if (dcreds) {
3496                 if (dcreds->user)
3497                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3498                 if (dcreds->group)
3499                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3500         }
3501
3502         if (user_lookup_fd >= 0)
3503                 dont_close[n_dont_close++] = user_lookup_fd;
3504
3505         return close_all_fds(dont_close, n_dont_close);
3506 }
3507
3508 static int send_user_lookup(
3509                 Unit *unit,
3510                 int user_lookup_fd,
3511                 uid_t uid,
3512                 gid_t gid) {
3513
3514         assert(unit);
3515
3516         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3517          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3518          * specified. */
3519
3520         if (user_lookup_fd < 0)
3521                 return 0;
3522
3523         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3524                 return 0;
3525
3526         if (writev(user_lookup_fd,
3527                (struct iovec[]) {
3528                            IOVEC_INIT(&uid, sizeof(uid)),
3529                            IOVEC_INIT(&gid, sizeof(gid)),
3530                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3531                 return -errno;
3532
3533         return 0;
3534 }
3535
3536 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3537         int r;
3538
3539         assert(c);
3540         assert(home);
3541         assert(buf);
3542
3543         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3544
3545         if (*home)
3546                 return 0;
3547
3548         if (!c->working_directory_home)
3549                 return 0;
3550
3551         r = get_home_dir(buf);
3552         if (r < 0)
3553                 return r;
3554
3555         *home = *buf;
3556         return 1;
3557 }
3558
3559 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3560         _cleanup_strv_free_ char ** list = NULL;
3561         int r;
3562
3563         assert(c);
3564         assert(p);
3565         assert(ret);
3566
3567         assert(c->dynamic_user);
3568
3569         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3570          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3571          * directories. */
3572
3573         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3574                 char **i;
3575
3576                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3577                         continue;
3578
3579                 if (!p->prefix[t])
3580                         continue;
3581
3582                 STRV_FOREACH(i, c->directories[t].paths) {
3583                         char *e;
3584
3585                         if (exec_directory_is_private(c, t))
3586                                 e = path_join(p->prefix[t], "private", *i);
3587                         else
3588                                 e = path_join(p->prefix[t], *i);
3589                         if (!e)
3590                                 return -ENOMEM;
3591
3592                         r = strv_consume(&list, e);
3593                         if (r < 0)
3594                                 return r;
3595                 }
3596         }
3597
3598         *ret = TAKE_PTR(list);
3599
3600         return 0;
3601 }
3602
3603 static char *exec_command_line(char **argv);
3604
3605 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3606         bool using_subcgroup;
3607         char *p;
3608
3609         assert(params);
3610         assert(ret);
3611
3612         if (!params->cgroup_path)
3613                 return -EINVAL;
3614
3615         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3616          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3617          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3618          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3619          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3620          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3621          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3622          * flag, which is only passed for the former statements, not for the latter. */
3623
3624         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3625         if (using_subcgroup)
3626                 p = path_join(params->cgroup_path, ".control");
3627         else
3628                 p = strdup(params->cgroup_path);
3629         if (!p)
3630                 return -ENOMEM;
3631
3632         *ret = p;
3633         return using_subcgroup;
3634 }
3635
3636 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3637         _cleanup_(cpu_set_reset) CPUSet s = {};
3638         int r;
3639
3640         assert(c);
3641         assert(ret);
3642
3643         if (!c->numa_policy.nodes.set) {
3644                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3645                 return 0;
3646         }
3647
3648         r = numa_to_cpu_set(&c->numa_policy, &s);
3649         if (r < 0)
3650                 return r;
3651
3652         cpu_set_reset(ret);
3653
3654         return cpu_set_add_all(ret, &s);
3655 }
3656
3657 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3658         assert(c);
3659
3660         return c->cpu_affinity_from_numa;
3661 }
3662
3663 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3664         int r;
3665
3666         assert(fds);
3667         assert(n_fds);
3668         assert(*n_fds < fds_size);
3669         assert(ret_fd);
3670
3671         if (fd < 0) {
3672                 *ret_fd = -1;
3673                 return 0;
3674         }
3675
3676         if (fd < 3 + (int) *n_fds) {
3677                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3678                  * the fds we pass to the process (or which are closed only during execve). */
3679
3680                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3681                 if (r < 0)
3682                         return -errno;
3683
3684                 CLOSE_AND_REPLACE(fd, r);
3685         }
3686
3687         *ret_fd = fds[*n_fds] = fd;
3688         (*n_fds) ++;
3689         return 1;
3690 }
3691
3692 static int exec_child(
3693                 Unit *unit,
3694                 const ExecCommand *command,
3695                 const ExecContext *context,
3696                 const ExecParameters *params,
3697                 ExecRuntime *runtime,
3698                 DynamicCreds *dcreds,
3699                 int socket_fd,
3700                 const int named_iofds[static 3],
3701                 int *fds,
3702                 size_t n_socket_fds,
3703                 size_t n_storage_fds,
3704                 char **files_env,
3705                 int user_lookup_fd,
3706                 int *exit_status) {
3707
3708         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3709         int r, ngids = 0, exec_fd;
3710         _cleanup_free_ gid_t *supplementary_gids = NULL;
3711         const char *username = NULL, *groupname = NULL;
3712         _cleanup_free_ char *home_buffer = NULL;
3713         const char *home = NULL, *shell = NULL;
3714         char **final_argv = NULL;
3715         dev_t journal_stream_dev = 0;
3716         ino_t journal_stream_ino = 0;
3717         bool userns_set_up = false;
3718         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3719                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3720                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3721                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3722 #if HAVE_SELINUX
3723         _cleanup_free_ char *mac_selinux_context_net = NULL;
3724         bool use_selinux = false;
3725 #endif
3726 #if ENABLE_SMACK
3727         bool use_smack = false;
3728 #endif
3729 #if HAVE_APPARMOR
3730         bool use_apparmor = false;
3731 #endif
3732         uid_t saved_uid = getuid();
3733         gid_t saved_gid = getgid();
3734         uid_t uid = UID_INVALID;
3735         gid_t gid = GID_INVALID;
3736         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3737                n_keep_fds; /* total number of fds not to close */
3738         int secure_bits;
3739         _cleanup_free_ gid_t *gids_after_pam = NULL;
3740         int ngids_after_pam = 0;
3741
3742         assert(unit);
3743         assert(command);
3744         assert(context);
3745         assert(params);
3746         assert(exit_status);
3747
3748         rename_process_from_path(command->path);
3749
3750         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3751          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3752          * both of which will be demoted to SIG_DFL. */
3753         (void) default_signals(SIGNALS_CRASH_HANDLER,
3754                                SIGNALS_IGNORE);
3755
3756         if (context->ignore_sigpipe)
3757                 (void) ignore_signals(SIGPIPE);
3758
3759         r = reset_signal_mask();
3760         if (r < 0) {
3761                 *exit_status = EXIT_SIGNAL_MASK;
3762                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3763         }
3764
3765         if (params->idle_pipe)
3766                 do_idle_pipe_dance(params->idle_pipe);
3767
3768         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3769          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3770          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3771          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3772
3773         log_forget_fds();
3774         log_set_open_when_needed(true);
3775
3776         /* In case anything used libc syslog(), close this here, too */
3777         closelog();
3778
3779         int keep_fds[n_fds + 2];
3780         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3781         n_keep_fds = n_fds;
3782
3783         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3784         if (r < 0) {
3785                 *exit_status = EXIT_FDS;
3786                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3787         }
3788
3789         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3790         if (r < 0) {
3791                 *exit_status = EXIT_FDS;
3792                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3793         }
3794
3795         if (!context->same_pgrp &&
3796             setsid() < 0) {
3797                 *exit_status = EXIT_SETSID;
3798                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3799         }
3800
3801         exec_context_tty_reset(context, params);
3802
3803         if (unit_shall_confirm_spawn(unit)) {
3804                 const char *vc = params->confirm_spawn;
3805                 _cleanup_free_ char *cmdline = NULL;
3806
3807                 cmdline = exec_command_line(command->argv);
3808                 if (!cmdline) {
3809                         *exit_status = EXIT_MEMORY;
3810                         return log_oom();
3811                 }
3812
3813                 r = ask_for_confirmation(vc, unit, cmdline);
3814                 if (r != CONFIRM_EXECUTE) {
3815                         if (r == CONFIRM_PRETEND_SUCCESS) {
3816                                 *exit_status = EXIT_SUCCESS;
3817                                 return 0;
3818                         }
3819                         *exit_status = EXIT_CONFIRM;
3820                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3821                                                     "Execution cancelled by the user");
3822                 }
3823         }
3824
3825         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3826          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3827          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3828          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3829          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3830         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3831             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3832                 *exit_status = EXIT_MEMORY;
3833                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3834         }
3835
3836         if (context->dynamic_user && dcreds) {
3837                 _cleanup_strv_free_ char **suggested_paths = NULL;
3838
3839                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3840                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
3841                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3842                         *exit_status = EXIT_USER;
3843                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3844                 }
3845
3846                 r = compile_suggested_paths(context, params, &suggested_paths);
3847                 if (r < 0) {
3848                         *exit_status = EXIT_MEMORY;
3849                         return log_oom();
3850                 }
3851
3852                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3853                 if (r < 0) {
3854                         *exit_status = EXIT_USER;
3855                         if (r == -EILSEQ)
3856                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3857                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
3858                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3859                 }
3860
3861                 if (!uid_is_valid(uid)) {
3862                         *exit_status = EXIT_USER;
3863                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
3864                 }
3865
3866                 if (!gid_is_valid(gid)) {
3867                         *exit_status = EXIT_USER;
3868                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
3869                 }
3870
3871                 if (dcreds->user)
3872                         username = dcreds->user->name;
3873
3874         } else {
3875                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3876                 if (r < 0) {
3877                         *exit_status = EXIT_USER;
3878                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3879                 }
3880
3881                 r = get_fixed_group(context, &groupname, &gid);
3882                 if (r < 0) {
3883                         *exit_status = EXIT_GROUP;
3884                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3885                 }
3886         }
3887
3888         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3889         r = get_supplementary_groups(context, username, groupname, gid,
3890                                      &supplementary_gids, &ngids);
3891         if (r < 0) {
3892                 *exit_status = EXIT_GROUP;
3893                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3894         }
3895
3896         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3897         if (r < 0) {
3898                 *exit_status = EXIT_USER;
3899                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3900         }
3901
3902         user_lookup_fd = safe_close(user_lookup_fd);
3903
3904         r = acquire_home(context, uid, &home, &home_buffer);
3905         if (r < 0) {
3906                 *exit_status = EXIT_CHDIR;
3907                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3908         }
3909
3910         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3911          * must be sure to drop O_NONBLOCK */
3912         if (socket_fd >= 0)
3913                 (void) fd_nonblock(socket_fd, false);
3914
3915         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3916          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3917         if (params->cgroup_path) {
3918                 _cleanup_free_ char *p = NULL;
3919
3920                 r = exec_parameters_get_cgroup_path(params, &p);
3921                 if (r < 0) {
3922                         *exit_status = EXIT_CGROUP;
3923                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3924                 }
3925
3926                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3927                 if (r < 0) {
3928                         *exit_status = EXIT_CGROUP;
3929                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3930                 }
3931         }
3932
3933         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3934                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
3935                 if (r < 0) {
3936                         *exit_status = EXIT_NETWORK;
3937                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3938                 }
3939         }
3940
3941         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3942                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3943                 if (r < 0) {
3944                         *exit_status = EXIT_NAMESPACE;
3945                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3946                 }
3947         }
3948
3949         r = setup_input(context, params, socket_fd, named_iofds);
3950         if (r < 0) {
3951                 *exit_status = EXIT_STDIN;
3952                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3953         }
3954
3955         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3956         if (r < 0) {
3957                 *exit_status = EXIT_STDOUT;
3958                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3959         }
3960
3961         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3962         if (r < 0) {
3963                 *exit_status = EXIT_STDERR;
3964                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3965         }
3966
3967         if (context->oom_score_adjust_set) {
3968                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3969                  * prohibit write access to this file, and we shouldn't trip up over that. */
3970                 r = set_oom_score_adjust(context->oom_score_adjust);
3971                 if (ERRNO_IS_PRIVILEGE(r))
3972                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3973                 else if (r < 0) {
3974                         *exit_status = EXIT_OOM_ADJUST;
3975                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3976                 }
3977         }
3978
3979         if (context->coredump_filter_set) {
3980                 r = set_coredump_filter(context->coredump_filter);
3981                 if (ERRNO_IS_PRIVILEGE(r))
3982                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3983                 else if (r < 0)
3984                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3985         }
3986
3987         if (context->nice_set) {
3988                 r = setpriority_closest(context->nice);
3989                 if (r < 0)
3990                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3991         }
3992
3993         if (context->cpu_sched_set) {
3994                 struct sched_param param = {
3995                         .sched_priority = context->cpu_sched_priority,
3996                 };
3997
3998                 r = sched_setscheduler(0,
3999                                        context->cpu_sched_policy |
4000                                        (context->cpu_sched_reset_on_fork ?
4001                                         SCHED_RESET_ON_FORK : 0),
4002                                        &param);
4003                 if (r < 0) {
4004                         *exit_status = EXIT_SETSCHEDULER;
4005                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4006                 }
4007         }
4008
4009         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4010                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4011                 const CPUSet *cpu_set;
4012
4013                 if (context->cpu_affinity_from_numa) {
4014                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4015                         if (r < 0) {
4016                                 *exit_status = EXIT_CPUAFFINITY;
4017                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4018                         }
4019
4020                         cpu_set = &converted_cpu_set;
4021                 } else
4022                         cpu_set = &context->cpu_set;
4023
4024                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4025                         *exit_status = EXIT_CPUAFFINITY;
4026                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4027                 }
4028         }
4029
4030         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4031                 r = apply_numa_policy(&context->numa_policy);
4032                 if (r == -EOPNOTSUPP)
4033                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4034                 else if (r < 0) {
4035                         *exit_status = EXIT_NUMA_POLICY;
4036                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4037                 }
4038         }
4039
4040         if (context->ioprio_set)
4041                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4042                         *exit_status = EXIT_IOPRIO;
4043                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4044                 }
4045
4046         if (context->timer_slack_nsec != NSEC_INFINITY)
4047                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4048                         *exit_status = EXIT_TIMERSLACK;
4049                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4050                 }
4051
4052         if (context->personality != PERSONALITY_INVALID) {
4053                 r = safe_personality(context->personality);
4054                 if (r < 0) {
4055                         *exit_status = EXIT_PERSONALITY;
4056                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4057                 }
4058         }
4059
4060         if (context->utmp_id)
4061                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4062                                       context->tty_path,
4063                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4064                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4065                                       USER_PROCESS,
4066                                       username);
4067
4068         if (uid_is_valid(uid)) {
4069                 r = chown_terminal(STDIN_FILENO, uid);
4070                 if (r < 0) {
4071                         *exit_status = EXIT_STDIN;
4072                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4073                 }
4074         }
4075
4076         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4077          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4078          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4079          * touch a single hierarchy too. */
4080         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4081                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4082                 if (r < 0) {
4083                         *exit_status = EXIT_CGROUP;
4084                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4085                 }
4086         }
4087
4088         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4089                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
4090                 if (r < 0)
4091                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4092         }
4093
4094         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4095                 r = setup_credentials(context, params, unit->id, uid);
4096                 if (r < 0) {
4097                         *exit_status = EXIT_CREDENTIALS;
4098                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4099                 }
4100         }
4101
4102         r = build_environment(
4103                         unit,
4104                         context,
4105                         params,
4106                         n_fds,
4107                         home,
4108                         username,
4109                         shell,
4110                         journal_stream_dev,
4111                         journal_stream_ino,
4112                         &our_env);
4113         if (r < 0) {
4114                 *exit_status = EXIT_MEMORY;
4115                 return log_oom();
4116         }
4117
4118         r = build_pass_environment(context, &pass_env);
4119         if (r < 0) {
4120                 *exit_status = EXIT_MEMORY;
4121                 return log_oom();
4122         }
4123
4124         accum_env = strv_env_merge(5,
4125                                    params->environment,
4126                                    our_env,
4127                                    pass_env,
4128                                    context->environment,
4129                                    files_env);
4130         if (!accum_env) {
4131                 *exit_status = EXIT_MEMORY;
4132                 return log_oom();
4133         }
4134         accum_env = strv_env_clean(accum_env);
4135
4136         (void) umask(context->umask);
4137
4138         r = setup_keyring(unit, context, params, uid, gid);
4139         if (r < 0) {
4140                 *exit_status = EXIT_KEYRING;
4141                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4142         }
4143
4144         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4145         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4146
4147         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4148         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4149
4150         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4151         if (needs_ambient_hack)
4152                 needs_setuid = false;
4153         else
4154                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4155
4156         if (needs_sandboxing) {
4157                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4158                  * present. The actual MAC context application will happen later, as late as possible, to avoid
4159                  * impacting our own code paths. */
4160
4161 #if HAVE_SELINUX
4162                 use_selinux = mac_selinux_use();
4163 #endif
4164 #if ENABLE_SMACK
4165                 use_smack = mac_smack_use();
4166 #endif
4167 #if HAVE_APPARMOR
4168                 use_apparmor = mac_apparmor_use();
4169 #endif
4170         }
4171
4172         if (needs_sandboxing) {
4173                 int which_failed;
4174
4175                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4176                  * is set here. (See below.) */
4177
4178                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4179                 if (r < 0) {
4180                         *exit_status = EXIT_LIMITS;
4181                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4182                 }
4183         }
4184
4185         if (needs_setuid && context->pam_name && username) {
4186                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4187                  * wins here. (See above.) */
4188
4189                 /* All fds passed in the fds array will be closed in the pam child process. */
4190                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4191                 if (r < 0) {
4192                         *exit_status = EXIT_PAM;
4193                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4194                 }
4195
4196                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4197                 if (ngids_after_pam < 0) {
4198                         *exit_status = EXIT_MEMORY;
4199                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4200                 }
4201         }
4202
4203         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4204                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4205                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4206                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4207
4208                 userns_set_up = true;
4209                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4210                 if (r < 0) {
4211                         *exit_status = EXIT_USER;
4212                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4213                 }
4214         }
4215
4216         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4217
4218                 if (ns_type_supported(NAMESPACE_NET)) {
4219                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4220                         if (r == -EPERM)
4221                                 log_unit_warning_errno(unit, r,
4222                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4223                         else if (r < 0) {
4224                                 *exit_status = EXIT_NETWORK;
4225                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4226                         }
4227                 } else if (context->network_namespace_path) {
4228                         *exit_status = EXIT_NETWORK;
4229                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4230                                                     "NetworkNamespacePath= is not supported, refusing.");
4231                 } else
4232                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4233         }
4234
4235         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4236
4237                 if (ns_type_supported(NAMESPACE_IPC)) {
4238                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4239                         if (r == -EPERM)
4240                                 log_unit_warning_errno(unit, r,
4241                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4242                         else if (r < 0) {
4243                                 *exit_status = EXIT_NAMESPACE;
4244                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4245                         }
4246                 } else if (context->ipc_namespace_path) {
4247                         *exit_status = EXIT_NAMESPACE;
4248                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4249                                                     "IPCNamespacePath= is not supported, refusing.");
4250                 } else
4251                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4252         }
4253
4254         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4255         if (needs_mount_namespace) {
4256                 _cleanup_free_ char *error_path = NULL;
4257
4258                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4259                 if (r < 0) {
4260                         *exit_status = EXIT_NAMESPACE;
4261                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4262                                                     error_path ? ": " : "", strempty(error_path));
4263                 }
4264         }
4265
4266         if (needs_sandboxing) {
4267                 r = apply_protect_hostname(unit, context, exit_status);
4268                 if (r < 0)
4269                         return r;
4270         }
4271
4272         /* Drop groups as early as possible.
4273          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4274          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4275         if (needs_setuid) {
4276                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4277                 int ngids_to_enforce = 0;
4278
4279                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4280                                                    ngids,
4281                                                    gids_after_pam,
4282                                                    ngids_after_pam,
4283                                                    &gids_to_enforce);
4284                 if (ngids_to_enforce < 0) {
4285                         *exit_status = EXIT_MEMORY;
4286                         return log_unit_error_errno(unit,
4287                                                     ngids_to_enforce,
4288                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4289                 }
4290
4291                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4292                 if (r < 0) {
4293                         *exit_status = EXIT_GROUP;
4294                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4295                 }
4296         }
4297
4298         /* If the user namespace was not set up above, try to do it now.
4299          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4300          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4301          * case of mount namespaces being less privileged when the mount point list is copied from a
4302          * different user namespace). */
4303
4304         if (needs_sandboxing && context->private_users && !userns_set_up) {
4305                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4306                 if (r < 0) {
4307                         *exit_status = EXIT_USER;
4308                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4309                 }
4310         }
4311
4312         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4313          * shall execute. */
4314
4315         _cleanup_free_ char *executable = NULL;
4316         _cleanup_close_ int executable_fd = -1;
4317         r = find_executable_full(command->path, false, &executable, &executable_fd);
4318         if (r < 0) {
4319                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4320                         log_unit_struct_errno(unit, LOG_INFO, r,
4321                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4322                                               LOG_UNIT_INVOCATION_ID(unit),
4323                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4324                                                                command->path),
4325                                               "EXECUTABLE=%s", command->path);
4326                         return 0;
4327                 }
4328
4329                 *exit_status = EXIT_EXEC;
4330
4331                 return log_unit_struct_errno(unit, LOG_INFO, r,
4332                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4333                                              LOG_UNIT_INVOCATION_ID(unit),
4334                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4335                                                               command->path),
4336                                              "EXECUTABLE=%s", command->path);
4337         }
4338
4339         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4340         if (r < 0) {
4341                 *exit_status = EXIT_FDS;
4342                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4343         }
4344
4345 #if HAVE_SELINUX
4346         if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4347                 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4348                 if (r < 0) {
4349                         *exit_status = EXIT_SELINUX_CONTEXT;
4350                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4351                 }
4352         }
4353 #endif
4354
4355         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4356          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4357          * however if we have it as we want to keep it open until the final execve(). */
4358
4359         r = close_all_fds(keep_fds, n_keep_fds);
4360         if (r >= 0)
4361                 r = shift_fds(fds, n_fds);
4362         if (r >= 0)
4363                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4364         if (r < 0) {
4365                 *exit_status = EXIT_FDS;
4366                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4367         }
4368
4369         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4370          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4371          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4372          * came this far. */
4373
4374         secure_bits = context->secure_bits;
4375
4376         if (needs_sandboxing) {
4377                 uint64_t bset;
4378
4379                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4380                  * requested. (Note this is placed after the general resource limit initialization, see
4381                  * above, in order to take precedence.) */
4382                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4383                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4384                                 *exit_status = EXIT_LIMITS;
4385                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4386                         }
4387                 }
4388
4389 #if ENABLE_SMACK
4390                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4391                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4392                 if (use_smack) {
4393                         r = setup_smack(context, executable_fd);
4394                         if (r < 0) {
4395                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4396                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4397                         }
4398                 }
4399 #endif
4400
4401                 bset = context->capability_bounding_set;
4402                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4403                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4404                  * instead of us doing that */
4405                 if (needs_ambient_hack)
4406                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4407                                 (UINT64_C(1) << CAP_SETUID) |
4408                                 (UINT64_C(1) << CAP_SETGID);
4409
4410                 if (!cap_test_all(bset)) {
4411                         r = capability_bounding_set_drop(bset, false);
4412                         if (r < 0) {
4413                                 *exit_status = EXIT_CAPABILITIES;
4414                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4415                         }
4416                 }
4417
4418                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4419                  * keep-caps set.
4420                  * To be able to raise the ambient capabilities after setresuid() they have to be
4421                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4422                  * After setresuid() the ambient capabilities can be raised as they are present in
4423                  * the permitted and inheritable set. However it is possible that someone wants to
4424                  * set ambient capabilities without changing the user, so we also set the ambient
4425                  * capabilities here.
4426                  * The requested ambient capabilities are raised in the inheritable set if the
4427                  * second argument is true. */
4428                 if (!needs_ambient_hack) {
4429                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4430                         if (r < 0) {
4431                                 *exit_status = EXIT_CAPABILITIES;
4432                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4433                         }
4434                 }
4435         }
4436
4437         /* chroot to root directory first, before we lose the ability to chroot */
4438         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4439         if (r < 0)
4440                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4441
4442         if (needs_setuid) {
4443                 if (uid_is_valid(uid)) {
4444                         r = enforce_user(context, uid);
4445                         if (r < 0) {
4446                                 *exit_status = EXIT_USER;
4447                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4448                         }
4449
4450                         if (!needs_ambient_hack &&
4451                             context->capability_ambient_set != 0) {
4452
4453                                 /* Raise the ambient capabilities after user change. */
4454                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4455                                 if (r < 0) {
4456                                         *exit_status = EXIT_CAPABILITIES;
4457                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4458                                 }
4459                         }
4460                 }
4461         }
4462
4463         /* Apply working directory here, because the working directory might be on NFS and only the user running
4464          * this service might have the correct privilege to change to the working directory */
4465         r = apply_working_directory(context, params, home, exit_status);
4466         if (r < 0)
4467                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4468
4469         if (needs_sandboxing) {
4470                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4471                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4472                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4473                  * are restricted. */
4474
4475 #if HAVE_SELINUX
4476                 if (use_selinux) {
4477                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4478
4479                         if (exec_context) {
4480                                 r = setexeccon(exec_context);
4481                                 if (r < 0) {
4482                                         *exit_status = EXIT_SELINUX_CONTEXT;
4483                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4484                                 }
4485                         }
4486                 }
4487 #endif
4488
4489 #if HAVE_APPARMOR
4490                 if (use_apparmor && context->apparmor_profile) {
4491                         r = aa_change_onexec(context->apparmor_profile);
4492                         if (r < 0 && !context->apparmor_profile_ignore) {
4493                                 *exit_status = EXIT_APPARMOR_PROFILE;
4494                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4495                         }
4496                 }
4497 #endif
4498
4499                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4500                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4501                  * CAP_SETPCAP. */
4502                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4503                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4504                          * effective set here.
4505                          * The effective set is overwritten during execve  with the following  values:
4506                          * - ambient set (for non-root processes)
4507                          * - (inheritable | bounding) set for root processes)
4508                          *
4509                          * Hence there is no security impact to raise it in the effective set before execve
4510                          */
4511                         r = capability_gain_cap_setpcap(NULL);
4512                         if (r < 0) {
4513                                 *exit_status = EXIT_CAPABILITIES;
4514                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4515                         }
4516                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4517                                 *exit_status = EXIT_SECUREBITS;
4518                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4519                         }
4520                 }
4521
4522                 if (context_has_no_new_privileges(context))
4523                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4524                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4525                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4526                         }
4527
4528 #if HAVE_SECCOMP
4529                 r = apply_address_families(unit, context);
4530                 if (r < 0) {
4531                         *exit_status = EXIT_ADDRESS_FAMILIES;
4532                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4533                 }
4534
4535                 r = apply_memory_deny_write_execute(unit, context);
4536                 if (r < 0) {
4537                         *exit_status = EXIT_SECCOMP;
4538                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4539                 }
4540
4541                 r = apply_restrict_realtime(unit, context);
4542                 if (r < 0) {
4543                         *exit_status = EXIT_SECCOMP;
4544                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4545                 }
4546
4547                 r = apply_restrict_suid_sgid(unit, context);
4548                 if (r < 0) {
4549                         *exit_status = EXIT_SECCOMP;
4550                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4551                 }
4552
4553                 r = apply_restrict_namespaces(unit, context);
4554                 if (r < 0) {
4555                         *exit_status = EXIT_SECCOMP;
4556                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4557                 }
4558
4559                 r = apply_protect_sysctl(unit, context);
4560                 if (r < 0) {
4561                         *exit_status = EXIT_SECCOMP;
4562                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4563                 }
4564
4565                 r = apply_protect_kernel_modules(unit, context);
4566                 if (r < 0) {
4567                         *exit_status = EXIT_SECCOMP;
4568                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4569                 }
4570
4571                 r = apply_protect_kernel_logs(unit, context);
4572                 if (r < 0) {
4573                         *exit_status = EXIT_SECCOMP;
4574                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4575                 }
4576
4577                 r = apply_protect_clock(unit, context);
4578                 if (r < 0) {
4579                         *exit_status = EXIT_SECCOMP;
4580                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4581                 }
4582
4583                 r = apply_private_devices(unit, context);
4584                 if (r < 0) {
4585                         *exit_status = EXIT_SECCOMP;
4586                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4587                 }
4588
4589                 r = apply_syscall_archs(unit, context);
4590                 if (r < 0) {
4591                         *exit_status = EXIT_SECCOMP;
4592                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4593                 }
4594
4595                 r = apply_lock_personality(unit, context);
4596                 if (r < 0) {
4597                         *exit_status = EXIT_SECCOMP;
4598                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4599                 }
4600
4601                 r = apply_syscall_log(unit, context);
4602                 if (r < 0) {
4603                         *exit_status = EXIT_SECCOMP;
4604                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4605                 }
4606
4607                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4608                  * by the filter as little as possible. */
4609                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4610                 if (r < 0) {
4611                         *exit_status = EXIT_SECCOMP;
4612                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4613                 }
4614 #endif
4615         }
4616
4617         if (!strv_isempty(context->unset_environment)) {
4618                 char **ee = NULL;
4619
4620                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4621                 if (!ee) {
4622                         *exit_status = EXIT_MEMORY;
4623                         return log_oom();
4624                 }
4625
4626                 strv_free_and_replace(accum_env, ee);
4627         }
4628
4629         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4630                 replaced_argv = replace_env_argv(command->argv, accum_env);
4631                 if (!replaced_argv) {
4632                         *exit_status = EXIT_MEMORY;
4633                         return log_oom();
4634                 }
4635                 final_argv = replaced_argv;
4636         } else
4637                 final_argv = command->argv;
4638
4639         if (DEBUG_LOGGING) {
4640                 _cleanup_free_ char *line = NULL;
4641
4642                 line = exec_command_line(final_argv);
4643                 if (line)
4644                         log_unit_struct(unit, LOG_DEBUG,
4645                                         "EXECUTABLE=%s", executable,
4646                                         LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4647                                         LOG_UNIT_INVOCATION_ID(unit));
4648         }
4649
4650         if (exec_fd >= 0) {
4651                 uint8_t hot = 1;
4652
4653                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4654                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4655
4656                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4657                         *exit_status = EXIT_EXEC;
4658                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4659                 }
4660         }
4661
4662         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4663
4664         if (exec_fd >= 0) {
4665                 uint8_t hot = 0;
4666
4667                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4668                  * that POLLHUP on it no longer means execve() succeeded. */
4669
4670                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4671                         *exit_status = EXIT_EXEC;
4672                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4673                 }
4674         }
4675
4676         *exit_status = EXIT_EXEC;
4677         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4678 }
4679
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below
 * already needs to call them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4682
/* Forks off a child process that runs exec_child() for the given command.
 *
 * In the parent this validates the socket configuration, resolves the named stdio file descriptors,
 * loads the environment files, creates the sub-cgroup if one is in use, forks, attaches the new
 * process to the sub-cgroup path (if any), records the start in command->exec_status and stores
 * the PID in *ret.
 *
 * In the child this calls exec_child(); if that returns a negative value the failure is logged, and
 * in any case the child terminates via _exit() with the exit status exec_child() reported.
 *
 * Returns 0 in the parent on success, or a negative errno-style value if the preparation or the
 * fork() failed. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
        _cleanup_free_ char *subcgroup_path = NULL;
        _cleanup_strv_free_ char **files_env = NULL;
        size_t n_storage_fds = 0, n_socket_fds = 0;
        _cleanup_free_ char *line = NULL;
        pid_t pid;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        /* If any fds are announced, the array carrying them must be there too. */
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        if (context->std_input == EXEC_INPUT_SOCKET ||
            context->std_output == EXEC_OUTPUT_SOCKET ||
            context->std_error == EXEC_OUTPUT_SOCKET) {

                /* Some stdio stream is connected to a socket: exactly one socket must have been
                 * passed in. It is handed to the child as socket_fd, and the fd array itself is not
                 * passed on (fds stays NULL and both counters stay 0). */

                if (params->n_socket_fds > 1)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");

                if (params->n_socket_fds == 0)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");

                socket_fd = params->fds[0];
        } else {
                /* No socket-connected stdio: pass the complete fd array on to the child. */
                socket_fd = -1;
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        }

        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        /* Command line string, used only for the log message below. */
        line = exec_command_line(command->argv);
        if (!line)
                return log_oom();

        /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
           and, until the next SELinux policy changes, we save further reloads in future children. */
        mac_selinux_maybe_reload();

        log_unit_struct(unit, LOG_DEBUG,
                        LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
                        "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
                                                           the mount namespace in the child, but we want to log
                                                           from the parent, so we need to use the (possibly
                                                           inaccurate) path here. */
                        LOG_UNIT_INVOCATION_ID(unit));

        if (params->cgroup_path) {
                r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                if (r > 0) { /* We are using a child cgroup */
                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                        if (r < 0)
                                return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);

                        /* Normally we would not propagate the oomd xattrs to children but since we created this
                         * sub-cgroup internally we should do it. */
                        cgroup_oomd_xattr_apply(unit, subcgroup_path);
                }
        }

        pid = fork();
        if (pid < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (pid == 0) {
                /* Child process: everything in this branch ends in _exit(). */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        /* Translate the exit status exec_child() filled in into the name of the
                         * failed step for the structured log message. */
                        const char *status =
                                exit_status_to_string(exit_status,
                                                      EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);

                        log_unit_struct_errno(unit, LOG_ERR, r,
                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                              LOG_UNIT_INVOCATION_ID(unit),
                                              LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                               status, command->path),
                                              "EXECUTABLE=%s", command->path);
                }

                _exit(exit_status);
        }

        /* Parent process from here on. */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);

        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
         * process will be killed too). */
        if (subcgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);

        exec_status_start(&command->exec_status, pid);

        *ret = pid;
        return 0;
}
4813
4814 void exec_context_init(ExecContext *c) {
4815         assert(c);
4816
4817         c->umask = 0022;
4818         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4819         c->cpu_sched_policy = SCHED_OTHER;
4820         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4821         c->syslog_level_prefix = true;
4822         c->ignore_sigpipe = true;
4823         c->timer_slack_nsec = NSEC_INFINITY;
4824         c->personality = PERSONALITY_INVALID;
4825         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4826                 c->directories[t].mode = 0755;
4827         c->timeout_clean_usec = USEC_INFINITY;
4828         c->capability_bounding_set = CAP_ALL;
4829         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4830         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4831         c->log_level_max = -1;
4832 #if HAVE_SECCOMP
4833         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4834 #endif
4835         numa_policy_reset(&c->numa_policy);
4836 }
4837
4838 void exec_context_done(ExecContext *c) {
4839         assert(c);
4840
4841         c->environment = strv_free(c->environment);
4842         c->environment_files = strv_free(c->environment_files);
4843         c->pass_environment = strv_free(c->pass_environment);
4844         c->unset_environment = strv_free(c->unset_environment);
4845
4846         rlimit_free_all(c->rlimit);
4847
4848         for (size_t l = 0; l < 3; l++) {
4849                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4850                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4851         }
4852
4853         c->working_directory = mfree(c->working_directory);
4854         c->root_directory = mfree(c->root_directory);
4855         c->root_image = mfree(c->root_image);
4856         c->root_image_options = mount_options_free_all(c->root_image_options);
4857         c->root_hash = mfree(c->root_hash);
4858         c->root_hash_size = 0;
4859         c->root_hash_path = mfree(c->root_hash_path);
4860         c->root_hash_sig = mfree(c->root_hash_sig);
4861         c->root_hash_sig_size = 0;
4862         c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4863         c->root_verity = mfree(c->root_verity);
4864         c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
4865         c->tty_path = mfree(c->tty_path);
4866         c->syslog_identifier = mfree(c->syslog_identifier);
4867         c->user = mfree(c->user);
4868         c->group = mfree(c->group);
4869
4870         c->supplementary_groups = strv_free(c->supplementary_groups);
4871
4872         c->pam_name = mfree(c->pam_name);
4873
4874         c->read_only_paths = strv_free(c->read_only_paths);
4875         c->read_write_paths = strv_free(c->read_write_paths);
4876         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4877         c->exec_paths = strv_free(c->exec_paths);
4878         c->no_exec_paths = strv_free(c->no_exec_paths);
4879
4880         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4881         c->bind_mounts = NULL;
4882         c->n_bind_mounts = 0;
4883         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4884         c->temporary_filesystems = NULL;
4885         c->n_temporary_filesystems = 0;
4886         c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
4887
4888         cpu_set_reset(&c->cpu_set);
4889         numa_policy_reset(&c->numa_policy);
4890
4891         c->utmp_id = mfree(c->utmp_id);
4892         c->selinux_context = mfree(c->selinux_context);
4893         c->apparmor_profile = mfree(c->apparmor_profile);
4894         c->smack_process_label = mfree(c->smack_process_label);
4895
4896         c->syscall_filter = hashmap_free(c->syscall_filter);
4897         c->syscall_archs = set_free(c->syscall_archs);
4898         c->address_families = set_free(c->address_families);
4899
4900         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4901                 c->directories[t].paths = strv_free(c->directories[t].paths);
4902
4903         c->log_level_max = -1;
4904
4905         exec_context_free_log_extra_fields(c);
4906
4907         c->log_ratelimit_interval_usec = 0;
4908         c->log_ratelimit_burst = 0;
4909
4910         c->stdin_data = mfree(c->stdin_data);
4911         c->stdin_data_size = 0;
4912
4913         c->network_namespace_path = mfree(c->network_namespace_path);
4914         c->ipc_namespace_path = mfree(c->ipc_namespace_path);
4915
4916         c->log_namespace = mfree(c->log_namespace);
4917
4918         c->load_credentials = strv_free(c->load_credentials);
4919         c->set_credentials = hashmap_free(c->set_credentials);
4920 }
4921
4922 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4923         char **i;
4924
4925         assert(c);
4926
4927         if (!runtime_prefix)
4928                 return 0;
4929
4930         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4931                 _cleanup_free_ char *p = NULL;
4932
4933                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4934                         p = path_join(runtime_prefix, "private", *i);
4935                 else
4936                         p = path_join(runtime_prefix, *i);
4937                 if (!p)
4938                         return -ENOMEM;
4939
4940                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4941                  * service next. */
4942                 (void) rm_rf(p, REMOVE_ROOT);
4943         }
4944
4945         return 0;
4946 }
4947
4948 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4949         _cleanup_free_ char *p = NULL;
4950
4951         assert(c);
4952
4953         if (!runtime_prefix || !unit)
4954                 return 0;
4955
4956         p = path_join(runtime_prefix, "credentials", unit);
4957         if (!p)
4958                 return -ENOMEM;
4959
4960         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4961          * unmount it, and afterwards remove the mount point */
4962         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4963         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4964
4965         return 0;
4966 }
4967
4968 static void exec_command_done(ExecCommand *c) {
4969         assert(c);
4970
4971         c->path = mfree(c->path);
4972         c->argv = strv_free(c->argv);
4973 }
4974
4975 void exec_command_done_array(ExecCommand *c, size_t n) {
4976         for (size_t i = 0; i < n; i++)
4977                 exec_command_done(c+i);
4978 }
4979
4980 ExecCommand* exec_command_free_list(ExecCommand *c) {
4981         ExecCommand *i;
4982
4983         while ((i = c)) {
4984                 LIST_REMOVE(command, c, i);
4985                 exec_command_done(i);
4986                 free(i);
4987         }
4988
4989         return NULL;
4990 }
4991
4992 void exec_command_free_array(ExecCommand **c, size_t n) {
4993         for (size_t i = 0; i < n; i++)
4994                 c[i] = exec_command_free_list(c[i]);
4995 }
4996
4997 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4998         for (size_t i = 0; i < n; i++)
4999                 exec_status_reset(&c[i].exec_status);
5000 }
5001
5002 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5003         for (size_t i = 0; i < n; i++) {
5004                 ExecCommand *z;
5005
5006                 LIST_FOREACH(command, z, c[i])
5007                         exec_status_reset(&z->exec_status);
5008         }
5009 }
5010
/* Context handed to invalid_env() as its userdata when environment files are cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file the assignment was read from */
} InvalidEnvInfo;
5015
5016 static void invalid_env(const char *p, void *userdata) {
5017         InvalidEnvInfo *info = userdata;
5018
5019         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5020 }
5021
5022 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5023         assert(c);
5024
5025         switch (fd_index) {
5026
5027         case STDIN_FILENO:
5028                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5029                         return NULL;
5030
5031                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5032
5033         case STDOUT_FILENO:
5034                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5035                         return NULL;
5036
5037                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5038
5039         case STDERR_FILENO:
5040                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5041                         return NULL;
5042
5043                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5044
5045         default:
5046                 return NULL;
5047         }
5048 }
5049
5050 static int exec_context_named_iofds(
5051                 const ExecContext *c,
5052                 const ExecParameters *p,
5053                 int named_iofds[static 3]) {
5054
5055         size_t targets;
5056         const char* stdio_fdname[3];
5057         size_t n_fds;
5058
5059         assert(c);
5060         assert(p);
5061         assert(named_iofds);
5062
5063         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5064                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5065                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5066
5067         for (size_t i = 0; i < 3; i++)
5068                 stdio_fdname[i] = exec_context_fdname(c, i);
5069
5070         n_fds = p->n_storage_fds + p->n_socket_fds;
5071
5072         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5073                 if (named_iofds[STDIN_FILENO] < 0 &&
5074                     c->std_input == EXEC_INPUT_NAMED_FD &&
5075                     stdio_fdname[STDIN_FILENO] &&
5076                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5077
5078                         named_iofds[STDIN_FILENO] = p->fds[i];
5079                         targets--;
5080
5081                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5082                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5083                            stdio_fdname[STDOUT_FILENO] &&
5084                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5085
5086                         named_iofds[STDOUT_FILENO] = p->fds[i];
5087                         targets--;
5088
5089                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5090                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5091                            stdio_fdname[STDERR_FILENO] &&
5092                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5093
5094                         named_iofds[STDERR_FILENO] = p->fds[i];
5095                         targets--;
5096                 }
5097
5098         return targets == 0 ? 0 : -ENOENT;
5099 }
5100
5101 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5102         char **i, **r = NULL;
5103
5104         assert(c);
5105         assert(l);
5106
5107         STRV_FOREACH(i, c->environment_files) {
5108                 char *fn;
5109                 int k;
5110                 bool ignore = false;
5111                 char **p;
5112                 _cleanup_globfree_ glob_t pglob = {};
5113
5114                 fn = *i;
5115
5116                 if (fn[0] == '-') {
5117                         ignore = true;
5118                         fn++;
5119                 }
5120
5121                 if (!path_is_absolute(fn)) {
5122                         if (ignore)
5123                                 continue;
5124
5125                         strv_free(r);
5126                         return -EINVAL;
5127                 }
5128
5129                 /* Filename supports globbing, take all matching files */
5130                 k = safe_glob(fn, 0, &pglob);
5131                 if (k < 0) {
5132                         if (ignore)
5133                                 continue;
5134
5135                         strv_free(r);
5136                         return k;
5137                 }
5138
5139                 /* When we don't match anything, -ENOENT should be returned */
5140                 assert(pglob.gl_pathc > 0);
5141
5142                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5143                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5144                         if (k < 0) {
5145                                 if (ignore)
5146                                         continue;
5147
5148                                 strv_free(r);
5149                                 return k;
5150                         }
5151                         /* Log invalid environment variables with filename */
5152                         if (p) {
5153                                 InvalidEnvInfo info = {
5154                                         .unit = unit,
5155                                         .path = pglob.gl_pathv[n]
5156                                 };
5157
5158                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5159                         }
5160
5161                         if (!r)
5162                                 r = p;
5163                         else {
5164                                 char **m;
5165
5166                                 m = strv_env_merge(2, r, p);
5167                                 strv_free(r);
5168                                 strv_free(p);
5169                                 if (!m)
5170                                         return -ENOMEM;
5171
5172                                 r = m;
5173                         }
5174                 }
5175         }
5176
5177         *l = r;
5178
5179         return 0;
5180 }
5181
5182 static bool tty_may_match_dev_console(const char *tty) {
5183         _cleanup_free_ char *resolved = NULL;
5184
5185         if (!tty)
5186                 return true;
5187
5188         tty = skip_dev_prefix(tty);
5189
5190         /* trivial identity? */
5191         if (streq(tty, "console"))
5192                 return true;
5193
5194         if (resolve_dev_console(&resolved) < 0)
5195                 return true; /* if we could not resolve, assume it may */
5196
5197         /* "tty0" means the active VC, so it may be the same sometimes */
5198         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5199 }
5200
5201 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5202         assert(ec);
5203
5204         return ec->tty_reset ||
5205                 ec->tty_vhangup ||
5206                 ec->tty_vt_disallocate ||
5207                 is_terminal_input(ec->std_input) ||
5208                 is_terminal_output(ec->std_output) ||
5209                 is_terminal_output(ec->std_error);
5210 }
5211
5212 bool exec_context_may_touch_console(const ExecContext *ec) {
5213
5214         return exec_context_may_touch_tty(ec) &&
5215                tty_may_match_dev_console(exec_context_tty_path(ec));
5216 }
5217
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        /* Writes every string of the NULL-terminated list to f, each preceded by a single space. A NULL
         * list writes nothing. */
        for (char **item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
5226
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        /* Emits one "<prefix><name>: item item ..." line; nothing at all is written for an empty list. */
        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5238
5239 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5240         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
5241         int r;
5242
5243         assert(c);
5244         assert(f);
5245
5246         prefix = strempty(prefix);
5247
5248         fprintf(f,
5249                 "%sUMask: %04o\n"
5250                 "%sWorkingDirectory: %s\n"
5251                 "%sRootDirectory: %s\n"
5252                 "%sNonBlocking: %s\n"
5253                 "%sPrivateTmp: %s\n"
5254                 "%sPrivateDevices: %s\n"
5255                 "%sProtectKernelTunables: %s\n"
5256                 "%sProtectKernelModules: %s\n"
5257                 "%sProtectKernelLogs: %s\n"
5258                 "%sProtectClock: %s\n"
5259                 "%sProtectControlGroups: %s\n"
5260                 "%sPrivateNetwork: %s\n"
5261                 "%sPrivateUsers: %s\n"
5262                 "%sProtectHome: %s\n"
5263                 "%sProtectSystem: %s\n"
5264                 "%sMountAPIVFS: %s\n"
5265                 "%sIgnoreSIGPIPE: %s\n"
5266                 "%sMemoryDenyWriteExecute: %s\n"
5267                 "%sRestrictRealtime: %s\n"
5268                 "%sRestrictSUIDSGID: %s\n"
5269                 "%sKeyringMode: %s\n"
5270                 "%sProtectHostname: %s\n"
5271                 "%sProtectProc: %s\n"
5272                 "%sProcSubset: %s\n",
5273                 prefix, c->umask,
5274                 prefix, empty_to_root(c->working_directory),
5275                 prefix, empty_to_root(c->root_directory),
5276                 prefix, yes_no(c->non_blocking),
5277                 prefix, yes_no(c->private_tmp),
5278                 prefix, yes_no(c->private_devices),
5279                 prefix, yes_no(c->protect_kernel_tunables),
5280                 prefix, yes_no(c->protect_kernel_modules),
5281                 prefix, yes_no(c->protect_kernel_logs),
5282                 prefix, yes_no(c->protect_clock),
5283                 prefix, yes_no(c->protect_control_groups),
5284                 prefix, yes_no(c->private_network),
5285                 prefix, yes_no(c->private_users),
5286                 prefix, protect_home_to_string(c->protect_home),
5287                 prefix, protect_system_to_string(c->protect_system),
5288                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5289                 prefix, yes_no(c->ignore_sigpipe),
5290                 prefix, yes_no(c->memory_deny_write_execute),
5291                 prefix, yes_no(c->restrict_realtime),
5292                 prefix, yes_no(c->restrict_suid_sgid),
5293                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5294                 prefix, yes_no(c->protect_hostname),
5295                 prefix, protect_proc_to_string(c->protect_proc),
5296                 prefix, proc_subset_to_string(c->proc_subset));
5297
5298         if (c->root_image)
5299                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5300
5301         if (c->root_image_options) {
5302                 MountOptions *o;
5303
5304                 fprintf(f, "%sRootImageOptions:", prefix);
5305                 LIST_FOREACH(mount_options, o, c->root_image_options)
5306                         if (!isempty(o->options))
5307                                 fprintf(f, " %s:%s",
5308                                         partition_designator_to_string(o->partition_designator),
5309                                         o->options);
5310                 fprintf(f, "\n");
5311         }
5312
5313         if (c->root_hash) {
5314                 _cleanup_free_ char *encoded = NULL;
5315                 encoded = hexmem(c->root_hash, c->root_hash_size);
5316                 if (encoded)
5317                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5318         }
5319
5320         if (c->root_hash_path)
5321                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5322
5323         if (c->root_hash_sig) {
5324                 _cleanup_free_ char *encoded = NULL;
5325                 ssize_t len;
5326                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5327                 if (len)
5328                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5329         }
5330
5331         if (c->root_hash_sig_path)
5332                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5333
5334         if (c->root_verity)
5335                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5336
5337         STRV_FOREACH(e, c->environment)
5338                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5339
5340         STRV_FOREACH(e, c->environment_files)
5341                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5342
5343         STRV_FOREACH(e, c->pass_environment)
5344                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5345
5346         STRV_FOREACH(e, c->unset_environment)
5347                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5348
5349         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5350
5351         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5352                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5353
5354                 STRV_FOREACH(d, c->directories[dt].paths)
5355                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5356         }
5357
5358         fprintf(f,
5359                 "%sTimeoutCleanSec: %s\n",
5360                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5361
5362         if (c->nice_set)
5363                 fprintf(f,
5364                         "%sNice: %i\n",
5365                         prefix, c->nice);
5366
5367         if (c->oom_score_adjust_set)
5368                 fprintf(f,
5369                         "%sOOMScoreAdjust: %i\n",
5370                         prefix, c->oom_score_adjust);
5371
5372         if (c->coredump_filter_set)
5373                 fprintf(f,
5374                         "%sCoredumpFilter: 0x%"PRIx64"\n",
5375                         prefix, c->coredump_filter);
5376
5377         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5378                 if (c->rlimit[i]) {
5379                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5380                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5381                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5382                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5383                 }
5384
5385         if (c->ioprio_set) {
5386                 _cleanup_free_ char *class_str = NULL;
5387
5388                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5389                 if (r >= 0)
5390                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5391
5392                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
5393         }
5394
5395         if (c->cpu_sched_set) {
5396                 _cleanup_free_ char *policy_str = NULL;
5397
5398                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5399                 if (r >= 0)
5400                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5401
5402                 fprintf(f,
5403                         "%sCPUSchedulingPriority: %i\n"
5404                         "%sCPUSchedulingResetOnFork: %s\n",
5405                         prefix, c->cpu_sched_priority,
5406                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5407         }
5408
5409         if (c->cpu_set.set) {
5410                 _cleanup_free_ char *affinity = NULL;
5411
5412                 affinity = cpu_set_to_range_string(&c->cpu_set);
5413                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5414         }
5415
5416         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5417                 _cleanup_free_ char *nodes = NULL;
5418
5419                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5420                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5421                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5422         }
5423
5424         if (c->timer_slack_nsec != NSEC_INFINITY)
5425                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5426
5427         fprintf(f,
5428                 "%sStandardInput: %s\n"
5429                 "%sStandardOutput: %s\n"
5430                 "%sStandardError: %s\n",
5431                 prefix, exec_input_to_string(c->std_input),
5432                 prefix, exec_output_to_string(c->std_output),
5433                 prefix, exec_output_to_string(c->std_error));
5434
5435         if (c->std_input == EXEC_INPUT_NAMED_FD)
5436                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5437         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5438                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5439         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5440                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5441
5442         if (c->std_input == EXEC_INPUT_FILE)
5443                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5444         if (c->std_output == EXEC_OUTPUT_FILE)
5445                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5446         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5447                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5448         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5449                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5450         if (c->std_error == EXEC_OUTPUT_FILE)
5451                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5452         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5453                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5454         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5455                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5456
5457         if (c->tty_path)
5458                 fprintf(f,
5459                         "%sTTYPath: %s\n"
5460                         "%sTTYReset: %s\n"
5461                         "%sTTYVHangup: %s\n"
5462                         "%sTTYVTDisallocate: %s\n",
5463                         prefix, c->tty_path,
5464                         prefix, yes_no(c->tty_reset),
5465                         prefix, yes_no(c->tty_vhangup),
5466                         prefix, yes_no(c->tty_vt_disallocate));
5467
5468         if (IN_SET(c->std_output,
5469                    EXEC_OUTPUT_KMSG,
5470                    EXEC_OUTPUT_JOURNAL,
5471                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5472                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5473             IN_SET(c->std_error,
5474                    EXEC_OUTPUT_KMSG,
5475                    EXEC_OUTPUT_JOURNAL,
5476                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5477                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5478
5479                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5480
5481                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5482                 if (r >= 0)
5483                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5484
5485                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5486                 if (r >= 0)
5487                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5488         }
5489
5490         if (c->log_level_max >= 0) {
5491                 _cleanup_free_ char *t = NULL;
5492
5493                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5494
5495                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5496         }
5497
5498         if (c->log_ratelimit_interval_usec > 0) {
5499                 char buf_timespan[FORMAT_TIMESPAN_MAX];
5500
5501                 fprintf(f,
5502                         "%sLogRateLimitIntervalSec: %s\n",
5503                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
5504         }
5505
5506         if (c->log_ratelimit_burst > 0)
5507                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5508
5509         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5510                 fprintf(f, "%sLogExtraFields: ", prefix);
5511                 fwrite(c->log_extra_fields[j].iov_base,
5512                        1, c->log_extra_fields[j].iov_len,
5513                        f);
5514                 fputc('\n', f);
5515         }
5516
5517         if (c->log_namespace)
5518                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5519
5520         if (c->secure_bits) {
5521                 _cleanup_free_ char *str = NULL;
5522
5523                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5524                 if (r >= 0)
5525                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5526         }
5527
5528         if (c->capability_bounding_set != CAP_ALL) {
5529                 _cleanup_free_ char *str = NULL;
5530
5531                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5532                 if (r >= 0)
5533                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5534         }
5535
5536         if (c->capability_ambient_set != 0) {
5537                 _cleanup_free_ char *str = NULL;
5538
5539                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5540                 if (r >= 0)
5541                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5542         }
5543
5544         if (c->user)
5545                 fprintf(f, "%sUser: %s\n", prefix, c->user);
5546         if (c->group)
5547                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5548
5549         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5550
5551         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5552
5553         if (c->pam_name)
5554                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5555
5556         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5557         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5558         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5559         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5560         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5561
5562         for (size_t i = 0; i < c->n_bind_mounts; i++)
5563                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5564                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5565                         c->bind_mounts[i].ignore_enoent ? "-": "",
5566                         c->bind_mounts[i].source,
5567                         c->bind_mounts[i].destination,
5568                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
5569
5570         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5571                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5572
5573                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5574                         t->path,
5575                         isempty(t->options) ? "" : ":",
5576                         strempty(t->options));
5577         }
5578
5579         if (c->utmp_id)
5580                 fprintf(f,
5581                         "%sUtmpIdentifier: %s\n",
5582                         prefix, c->utmp_id);
5583
5584         if (c->selinux_context)
5585                 fprintf(f,
5586                         "%sSELinuxContext: %s%s\n",
5587                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5588
5589         if (c->apparmor_profile)
5590                 fprintf(f,
5591                         "%sAppArmorProfile: %s%s\n",
5592                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5593
5594         if (c->smack_process_label)
5595                 fprintf(f,
5596                         "%sSmackProcessLabel: %s%s\n",
5597                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5598
5599         if (c->personality != PERSONALITY_INVALID)
5600                 fprintf(f,
5601                         "%sPersonality: %s\n",
5602                         prefix, strna(personality_to_string(c->personality)));
5603
5604         fprintf(f,
5605                 "%sLockPersonality: %s\n",
5606                 prefix, yes_no(c->lock_personality));
5607
5608         if (c->syscall_filter) {
5609 #if HAVE_SECCOMP
5610                 void *id, *val;
5611                 bool first = true;
5612 #endif
5613
5614                 fprintf(f,
5615                         "%sSystemCallFilter: ",
5616                         prefix);
5617
5618                 if (!c->syscall_allow_list)
5619                         fputc('~', f);
5620
5621 #if HAVE_SECCOMP
5622                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5623                         _cleanup_free_ char *name = NULL;
5624                         const char *errno_name = NULL;
5625                         int num = PTR_TO_INT(val);
5626
5627                         if (first)
5628                                 first = false;
5629                         else
5630                                 fputc(' ', f);
5631
5632                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5633                         fputs(strna(name), f);
5634
5635                         if (num >= 0) {
5636                                 errno_name = seccomp_errno_or_action_to_string(num);
5637                                 if (errno_name)
5638                                         fprintf(f, ":%s", errno_name);
5639                                 else
5640                                         fprintf(f, ":%d", num);
5641                         }
5642                 }
5643 #endif
5644
5645                 fputc('\n', f);
5646         }
5647
5648         if (c->syscall_archs) {
5649 #if HAVE_SECCOMP
5650                 void *id;
5651 #endif
5652
5653                 fprintf(f,
5654                         "%sSystemCallArchitectures:",
5655                         prefix);
5656
5657 #if HAVE_SECCOMP
5658                 SET_FOREACH(id, c->syscall_archs)
5659                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5660 #endif
5661                 fputc('\n', f);
5662         }
5663
5664         if (exec_context_restrict_namespaces_set(c)) {
5665                 _cleanup_free_ char *s = NULL;
5666
5667                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5668                 if (r >= 0)
5669                         fprintf(f, "%sRestrictNamespaces: %s\n",
5670                                 prefix, strna(s));
5671         }
5672
5673         if (c->network_namespace_path)
5674                 fprintf(f,
5675                         "%sNetworkNamespacePath: %s\n",
5676                         prefix, c->network_namespace_path);
5677
5678         if (c->syscall_errno > 0) {
5679 #if HAVE_SECCOMP
5680                 const char *errno_name;
5681 #endif
5682
5683                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5684
5685 #if HAVE_SECCOMP
5686                 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5687                 if (errno_name)
5688                         fputs(errno_name, f);
5689                 else
5690                         fprintf(f, "%d", c->syscall_errno);
5691 #endif
5692                 fputc('\n', f);
5693         }
5694
5695         for (size_t i = 0; i < c->n_mount_images; i++) {
5696                 MountOptions *o;
5697
5698                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
5699                         c->mount_images[i].ignore_enoent ? "-": "",
5700                         c->mount_images[i].source,
5701                         c->mount_images[i].destination);
5702                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5703                         fprintf(f, ":%s:%s",
5704                                 partition_designator_to_string(o->partition_designator),
5705                                 strempty(o->options));
5706                 fprintf(f, "\n");
5707         }
5708
5709         for (size_t i = 0; i < c->n_extension_images; i++) {
5710                 MountOptions *o;
5711
5712                 fprintf(f, "%sExtensionImages: %s%s", prefix,
5713                         c->extension_images[i].ignore_enoent ? "-": "",
5714                         c->extension_images[i].source);
5715                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5716                         fprintf(f, ":%s:%s",
5717                                 partition_designator_to_string(o->partition_designator),
5718                                 strempty(o->options));
5719                 fprintf(f, "\n");
5720         }
5721 }
5722
5723 bool exec_context_maintains_privileges(const ExecContext *c) {
5724         assert(c);
5725
5726         /* Returns true if the process forked off would run under
5727          * an unchanged UID or as root. */
5728
5729         if (!c->user)
5730                 return true;
5731
5732         if (streq(c->user, "root") || streq(c->user, "0"))
5733                 return true;
5734
5735         return false;
5736 }
5737
5738 int exec_context_get_effective_ioprio(const ExecContext *c) {
5739         int p;
5740
5741         assert(c);
5742
5743         if (c->ioprio_set)
5744                 return c->ioprio;
5745
5746         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5747         if (p < 0)
5748                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5749
5750         return p;
5751 }
5752
5753 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5754         assert(c);
5755
5756         /* Explicit setting wins */
5757         if (c->mount_apivfs_set)
5758                 return c->mount_apivfs;
5759
5760         /* Default to "yes" if root directory or image are specified */
5761         if (exec_context_with_rootfs(c))
5762                 return true;
5763
5764         return false;
5765 }
5766
5767 void exec_context_free_log_extra_fields(ExecContext *c) {
5768         assert(c);
5769
5770         for (size_t l = 0; l < c->n_log_extra_fields; l++)
5771                 free(c->log_extra_fields[l].iov_base);
5772         c->log_extra_fields = mfree(c->log_extra_fields);
5773         c->n_log_extra_fields = 0;
5774 }
5775
5776 void exec_context_revert_tty(ExecContext *c) {
5777         _cleanup_close_ int fd = -1;
5778         const char *path;
5779         struct stat st;
5780         int r;
5781
5782         assert(c);
5783
5784         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5785         exec_context_tty_reset(c, NULL);
5786
5787         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5788          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5789          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5790         if (!exec_context_may_touch_tty(c))
5791                 return;
5792
5793         path = exec_context_tty_path(c);
5794         if (!path)
5795                 return;
5796
5797         fd = open(path, O_PATH|O_CLOEXEC);
5798         if (fd < 0)
5799                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5800                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5801                                              path);
5802
5803         if (fstat(fd, &st) < 0)
5804                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5805
5806         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5807          * if things are a character device, since a proper check either means we'd have to open the TTY and
5808          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5809          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5810          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5811         if (!S_ISCHR(st.st_mode))
5812                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5813
5814         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5815         if (r < 0)
5816                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5817 }
5818
5819 int exec_context_get_clean_directories(
5820                 ExecContext *c,
5821                 char **prefix,
5822                 ExecCleanMask mask,
5823                 char ***ret) {
5824
5825         _cleanup_strv_free_ char **l = NULL;
5826         int r;
5827
5828         assert(c);
5829         assert(prefix);
5830         assert(ret);
5831
5832         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5833                 char **i;
5834
5835                 if (!FLAGS_SET(mask, 1U << t))
5836                         continue;
5837
5838                 if (!prefix[t])
5839                         continue;
5840
5841                 STRV_FOREACH(i, c->directories[t].paths) {
5842                         char *j;
5843
5844                         j = path_join(prefix[t], *i);
5845                         if (!j)
5846                                 return -ENOMEM;
5847
5848                         r = strv_consume(&l, j);
5849                         if (r < 0)
5850                                 return r;
5851
5852                         /* Also remove private directories unconditionally. */
5853                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5854                                 j = path_join(prefix[t], "private", *i);
5855                                 if (!j)
5856                                         return -ENOMEM;
5857
5858                                 r = strv_consume(&l, j);
5859                                 if (r < 0)
5860                                         return r;
5861                         }
5862                 }
5863         }
5864
5865         *ret = TAKE_PTR(l);
5866         return 0;
5867 }
5868
5869 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5870         ExecCleanMask mask = 0;
5871
5872         assert(c);
5873         assert(ret);
5874
5875         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5876                 if (!strv_isempty(c->directories[t].paths))
5877                         mask |= 1U << t;
5878
5879         *ret = mask;
5880         return 0;
5881 }
5882
5883 void exec_status_start(ExecStatus *s, pid_t pid) {
5884         assert(s);
5885
5886         *s = (ExecStatus) {
5887                 .pid = pid,
5888         };
5889
5890         dual_timestamp_get(&s->start_timestamp);
5891 }
5892
5893 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5894         assert(s);
5895
5896         if (s->pid != pid)
5897                 *s = (ExecStatus) {
5898                         .pid = pid,
5899                 };
5900
5901         dual_timestamp_get(&s->exit_timestamp);
5902
5903         s->code = code;
5904         s->status = status;
5905
5906         if (context && context->utmp_id)
5907                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5908 }
5909
5910 void exec_status_reset(ExecStatus *s) {
5911         assert(s);
5912
5913         *s = (ExecStatus) {};
5914 }
5915
5916 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5917         char buf[FORMAT_TIMESTAMP_MAX];
5918
5919         assert(s);
5920         assert(f);
5921
5922         if (s->pid <= 0)
5923                 return;
5924
5925         prefix = strempty(prefix);
5926
5927         fprintf(f,
5928                 "%sPID: "PID_FMT"\n",
5929                 prefix, s->pid);
5930
5931         if (dual_timestamp_is_set(&s->start_timestamp))
5932                 fprintf(f,
5933                         "%sStart Timestamp: %s\n",
5934                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5935
5936         if (dual_timestamp_is_set(&s->exit_timestamp))
5937                 fprintf(f,
5938                         "%sExit Timestamp: %s\n"
5939                         "%sExit Code: %s\n"
5940                         "%sExit Status: %i\n",
5941                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5942                         prefix, sigchld_code_to_string(s->code),
5943                         prefix, s->status);
5944 }
5945
5946 static char *exec_command_line(char **argv) {
5947         size_t k;
5948         char *n, *p, **a;
5949         bool first = true;
5950
5951         assert(argv);
5952
5953         k = 1;
5954         STRV_FOREACH(a, argv)
5955                 k += strlen(*a)+3;
5956
5957         n = new(char, k);
5958         if (!n)
5959                 return NULL;
5960
5961         p = n;
5962         STRV_FOREACH(a, argv) {
5963
5964                 if (!first)
5965                         *(p++) = ' ';
5966                 else
5967                         first = false;
5968
5969                 if (strpbrk(*a, WHITESPACE)) {
5970                         *(p++) = '\'';
5971                         p = stpcpy(p, *a);
5972                         *(p++) = '\'';
5973                 } else
5974                         p = stpcpy(p, *a);
5975
5976         }
5977
5978         *p = 0;
5979
5980         /* FIXME: this doesn't really handle arguments that have
5981          * spaces and ticks in them */
5982
5983         return n;
5984 }
5985
5986 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5987         _cleanup_free_ char *cmd = NULL;
5988         const char *prefix2;
5989
5990         assert(c);
5991         assert(f);
5992
5993         prefix = strempty(prefix);
5994         prefix2 = strjoina(prefix, "\t");
5995
5996         cmd = exec_command_line(c->argv);
5997         fprintf(f,
5998                 "%sCommand Line: %s\n",
5999                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
6000
6001         exec_status_dump(&c->exec_status, f, prefix2);
6002 }
6003
6004 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6005         assert(f);
6006
6007         prefix = strempty(prefix);
6008
6009         LIST_FOREACH(command, c, c)
6010                 exec_command_dump(c, f, prefix);
6011 }
6012
6013 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6014         ExecCommand *end;
6015
6016         assert(l);
6017         assert(e);
6018
6019         if (*l) {
6020                 /* It's kind of important, that we keep the order here */
6021                 LIST_FIND_TAIL(command, *l, end);
6022                 LIST_INSERT_AFTER(command, *l, end, e);
6023         } else
6024               *l = e;
6025 }
6026
6027 int exec_command_set(ExecCommand *c, const char *path, ...) {
6028         va_list ap;
6029         char **l, *p;
6030
6031         assert(c);
6032         assert(path);
6033
6034         va_start(ap, path);
6035         l = strv_new_ap(path, ap);
6036         va_end(ap);
6037
6038         if (!l)
6039                 return -ENOMEM;
6040
6041         p = strdup(path);
6042         if (!p) {
6043                 strv_free(l);
6044                 return -ENOMEM;
6045         }
6046
6047         free_and_replace(c->path, p);
6048
6049         return strv_free_and_replace(c->argv, l);
6050 }
6051
6052 int exec_command_append(ExecCommand *c, const char *path, ...) {
6053         _cleanup_strv_free_ char **l = NULL;
6054         va_list ap;
6055         int r;
6056
6057         assert(c);
6058         assert(path);
6059
6060         va_start(ap, path);
6061         l = strv_new_ap(path, ap);
6062         va_end(ap);
6063
6064         if (!l)
6065                 return -ENOMEM;
6066
6067         r = strv_extend_strv(&c->argv, l, false);
6068         if (r < 0)
6069                 return r;
6070
6071         return 0;
6072 }
6073
6074 static void *remove_tmpdir_thread(void *p) {
6075         _cleanup_free_ char *path = p;
6076
6077         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6078         return NULL;
6079 }
6080
6081 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6082         int r;
6083
6084         if (!rt)
6085                 return NULL;
6086
6087         if (rt->manager)
6088                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6089
6090         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6091
6092         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6093                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6094
6095                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6096                 if (r < 0)
6097                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6098                 else
6099                         rt->tmp_dir = NULL;
6100         }
6101
6102         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6103                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6104
6105                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6106                 if (r < 0)
6107                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6108                 else
6109                         rt->var_tmp_dir = NULL;
6110         }
6111
6112         rt->id = mfree(rt->id);
6113         rt->tmp_dir = mfree(rt->tmp_dir);
6114         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6115         safe_close_pair(rt->netns_storage_socket);
6116         safe_close_pair(rt->ipcns_storage_socket);
6117         return mfree(rt);
6118 }
6119
6120 static void exec_runtime_freep(ExecRuntime **rt) {
6121         (void) exec_runtime_free(*rt, false);
6122 }
6123
6124 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6125         _cleanup_free_ char *id_copy = NULL;
6126         ExecRuntime *n;
6127
6128         assert(ret);
6129
6130         id_copy = strdup(id);
6131         if (!id_copy)
6132                 return -ENOMEM;
6133
6134         n = new(ExecRuntime, 1);
6135         if (!n)
6136                 return -ENOMEM;
6137
6138         *n = (ExecRuntime) {
6139                 .id = TAKE_PTR(id_copy),
6140                 .netns_storage_socket = { -1, -1 },
6141                 .ipcns_storage_socket = { -1, -1 },
6142         };
6143
6144         *ret = n;
6145         return 0;
6146 }
6147
6148 static int exec_runtime_add(
6149                 Manager *m,
6150                 const char *id,
6151                 char **tmp_dir,
6152                 char **var_tmp_dir,
6153                 int netns_storage_socket[2],
6154                 int ipcns_storage_socket[2],
6155                 ExecRuntime **ret) {
6156
6157         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6158         int r;
6159
6160         assert(m);
6161         assert(id);
6162
6163         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6164
6165         r = exec_runtime_allocate(&rt, id);
6166         if (r < 0)
6167                 return r;
6168
6169         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6170         if (r < 0)
6171                 return r;
6172
6173         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6174         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6175         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6176
6177         if (netns_storage_socket) {
6178                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6179                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6180         }
6181
6182         if (ipcns_storage_socket) {
6183                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6184                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6185         }
6186
6187         rt->manager = m;
6188
6189         if (ret)
6190                 *ret = rt;
6191         /* do not remove created ExecRuntime object when the operation succeeds. */
6192         TAKE_PTR(rt);
6193         return 0;
6194 }
6195
6196 static int exec_runtime_make(
6197                 Manager *m,
6198                 const ExecContext *c,
6199                 const char *id,
6200                 ExecRuntime **ret) {
6201
6202         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6203         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6204         int r;
6205
6206         assert(m);
6207         assert(c);
6208         assert(id);
6209
6210         /* It is not necessary to create ExecRuntime object. */
6211         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6212                 *ret = NULL;
6213                 return 0;
6214         }
6215
6216         if (c->private_tmp &&
6217             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6218               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6219                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6220                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6221                 if (r < 0)
6222                         return r;
6223         }
6224
6225         if (c->private_network || c->network_namespace_path) {
6226                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6227                         return -errno;
6228         }
6229
6230         if (c->private_ipc || c->ipc_namespace_path) {
6231                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6232                         return -errno;
6233         }
6234
6235         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6236         if (r < 0)
6237                 return r;
6238
6239         return 1;
6240 }
6241
6242 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6243         ExecRuntime *rt;
6244         int r;
6245
6246         assert(m);
6247         assert(id);
6248         assert(ret);
6249
6250         rt = hashmap_get(m->exec_runtime_by_id, id);
6251         if (rt)
6252                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6253                 goto ref;
6254
6255         if (!create) {
6256                 *ret = NULL;
6257                 return 0;
6258         }
6259
6260         /* If not found, then create a new object. */
6261         r = exec_runtime_make(m, c, id, &rt);
6262         if (r < 0)
6263                 return r;
6264         if (r == 0) {
6265                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6266                 *ret = NULL;
6267                 return 0;
6268         }
6269
6270 ref:
6271         /* increment reference counter. */
6272         rt->n_ref++;
6273         *ret = rt;
6274         return 1;
6275 }
6276
6277 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6278         if (!rt)
6279                 return NULL;
6280
6281         assert(rt->n_ref > 0);
6282
6283         rt->n_ref--;
6284         if (rt->n_ref > 0)
6285                 return NULL;
6286
6287         return exec_runtime_free(rt, destroy);
6288 }
6289
6290 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6291         ExecRuntime *rt;
6292
6293         assert(m);
6294         assert(f);
6295         assert(fds);
6296
6297         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6298                 fprintf(f, "exec-runtime=%s", rt->id);
6299
6300                 if (rt->tmp_dir)
6301                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6302
6303                 if (rt->var_tmp_dir)
6304                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6305
6306                 if (rt->netns_storage_socket[0] >= 0) {
6307                         int copy;
6308
6309                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6310                         if (copy < 0)
6311                                 return copy;
6312
6313                         fprintf(f, " netns-socket-0=%i", copy);
6314                 }
6315
6316                 if (rt->netns_storage_socket[1] >= 0) {
6317                         int copy;
6318
6319                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6320                         if (copy < 0)
6321                                 return copy;
6322
6323                         fprintf(f, " netns-socket-1=%i", copy);
6324                 }
6325
6326                 if (rt->ipcns_storage_socket[0] >= 0) {
6327                         int copy;
6328
6329                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6330                         if (copy < 0)
6331                                 return copy;
6332
6333                         fprintf(f, " ipcns-socket-0=%i", copy);
6334                 }
6335
6336                 if (rt->ipcns_storage_socket[1] >= 0) {
6337                         int copy;
6338
6339                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6340                         if (copy < 0)
6341                                 return copy;
6342
6343                         fprintf(f, " ipcns-socket-1=%i", copy);
6344                 }
6345
6346                 fputc('\n', f);
6347         }
6348
6349         return 0;
6350 }
6351
6352 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6353         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6354         ExecRuntime *rt;
6355         int r;
6356
6357         /* This is for the migration from old (v237 or earlier) deserialization text.
6358          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6359          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6360          * so or not from the serialized text, then we always creates a new object owned by this. */
6361
6362         assert(u);
6363         assert(key);
6364         assert(value);
6365
6366         /* Manager manages ExecRuntime objects by the unit id.
6367          * So, we omit the serialized text when the unit does not have id (yet?)... */
6368         if (isempty(u->id)) {
6369                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6370                 return 0;
6371         }
6372
6373         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6374                 return log_oom();
6375
6376         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6377         if (!rt) {
6378                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6379                         return log_oom();
6380
6381                 rt = rt_create;
6382         }
6383
6384         if (streq(key, "tmp-dir")) {
6385                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6386                         return -ENOMEM;
6387
6388         } else if (streq(key, "var-tmp-dir")) {
6389                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6390                         return -ENOMEM;
6391
6392         } else if (streq(key, "netns-socket-0")) {
6393                 int fd;
6394
6395                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6396                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6397                         return 0;
6398                 }
6399
6400                 safe_close(rt->netns_storage_socket[0]);
6401                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6402
6403         } else if (streq(key, "netns-socket-1")) {
6404                 int fd;
6405
6406                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6407                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6408                         return 0;
6409                 }
6410
6411                 safe_close(rt->netns_storage_socket[1]);
6412                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6413
6414         } else
6415                 return 0;
6416
6417         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6418         if (rt_create) {
6419                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6420                 if (r < 0) {
6421                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6422                         return 0;
6423                 }
6424
6425                 rt_create->manager = u->manager;
6426
6427                 /* Avoid cleanup */
6428                 TAKE_PTR(rt_create);
6429         }
6430
6431         return 1;
6432 }
6433
6434 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6435         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6436         char *id = NULL;
6437         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6438         const char *p, *v = value;
6439         size_t n;
6440
6441         assert(m);
6442         assert(value);
6443         assert(fds);
6444
6445         n = strcspn(v, " ");
6446         id = strndupa(v, n);
6447         if (v[n] != ' ')
6448                 goto finalize;
6449         p = v + n + 1;
6450
6451         v = startswith(p, "tmp-dir=");
6452         if (v) {
6453                 n = strcspn(v, " ");
6454                 tmp_dir = strndup(v, n);
6455                 if (!tmp_dir)
6456                         return log_oom();
6457                 if (v[n] != ' ')
6458                         goto finalize;
6459                 p = v + n + 1;
6460         }
6461
6462         v = startswith(p, "var-tmp-dir=");
6463         if (v) {
6464                 n = strcspn(v, " ");
6465                 var_tmp_dir = strndup(v, n);
6466                 if (!var_tmp_dir)
6467                         return log_oom();
6468                 if (v[n] != ' ')
6469                         goto finalize;
6470                 p = v + n + 1;
6471         }
6472
6473         v = startswith(p, "netns-socket-0=");
6474         if (v) {
6475                 char *buf;
6476
6477                 n = strcspn(v, " ");
6478                 buf = strndupa(v, n);
6479
6480                 r = safe_atoi(buf, &netns_fdpair[0]);
6481                 if (r < 0)
6482                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6483                 if (!fdset_contains(fds, netns_fdpair[0]))
6484                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6485                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6486                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6487                 if (v[n] != ' ')
6488                         goto finalize;
6489                 p = v + n + 1;
6490         }
6491
6492         v = startswith(p, "netns-socket-1=");
6493         if (v) {
6494                 char *buf;
6495
6496                 n = strcspn(v, " ");
6497                 buf = strndupa(v, n);
6498
6499                 r = safe_atoi(buf, &netns_fdpair[1]);
6500                 if (r < 0)
6501                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6502                 if (!fdset_contains(fds, netns_fdpair[1]))
6503                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6504                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6505                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6506                 if (v[n] != ' ')
6507                         goto finalize;
6508                 p = v + n + 1;
6509         }
6510
6511         v = startswith(p, "ipcns-socket-0=");
6512         if (v) {
6513                 char *buf;
6514
6515                 n = strcspn(v, " ");
6516                 buf = strndupa(v, n);
6517
6518                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6519                 if (r < 0)
6520                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6521                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6522                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6523                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6524                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6525                 if (v[n] != ' ')
6526                         goto finalize;
6527                 p = v + n + 1;
6528         }
6529
6530         v = startswith(p, "ipcns-socket-1=");
6531         if (v) {
6532                 char *buf;
6533
6534                 n = strcspn(v, " ");
6535                 buf = strndupa(v, n);
6536
6537                 r = safe_atoi(buf, &ipcns_fdpair[1]);
6538                 if (r < 0)
6539                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6540                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6541                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6542                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6543                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6544         }
6545
6546 finalize:
6547         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6548         if (r < 0)
6549                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6550         return 0;
6551 }
6552
6553 void exec_runtime_vacuum(Manager *m) {
6554         ExecRuntime *rt;
6555
6556         assert(m);
6557
6558         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6559
6560         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6561                 if (rt->n_ref > 0)
6562                         continue;
6563
6564                 (void) exec_runtime_free(rt, false);
6565         }
6566 }
6567
6568 void exec_params_clear(ExecParameters *p) {
6569         if (!p)
6570                 return;
6571
6572         p->environment = strv_free(p->environment);
6573         p->fd_names = strv_free(p->fd_names);
6574         p->fds = mfree(p->fds);
6575         p->exec_fd = safe_close(p->exec_fd);
6576 }
6577
6578 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6579         if (!sc)
6580                 return NULL;
6581
6582         free(sc->id);
6583         free(sc->data);
6584         return mfree(sc);
6585 }
6586
6587 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6588
6589 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6590         [EXEC_INPUT_NULL] = "null",
6591         [EXEC_INPUT_TTY] = "tty",
6592         [EXEC_INPUT_TTY_FORCE] = "tty-force",
6593         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6594         [EXEC_INPUT_SOCKET] = "socket",
6595         [EXEC_INPUT_NAMED_FD] = "fd",
6596         [EXEC_INPUT_DATA] = "data",
6597         [EXEC_INPUT_FILE] = "file",
6598 };
6599
6600 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6601
6602 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6603         [EXEC_OUTPUT_INHERIT] = "inherit",
6604         [EXEC_OUTPUT_NULL] = "null",
6605         [EXEC_OUTPUT_TTY] = "tty",
6606         [EXEC_OUTPUT_KMSG] = "kmsg",
6607         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6608         [EXEC_OUTPUT_JOURNAL] = "journal",
6609         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6610         [EXEC_OUTPUT_SOCKET] = "socket",
6611         [EXEC_OUTPUT_NAMED_FD] = "fd",
6612         [EXEC_OUTPUT_FILE] = "file",
6613         [EXEC_OUTPUT_FILE_APPEND] = "append",
6614         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6615 };
6616
6617 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6618
6619 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6620         [EXEC_UTMP_INIT] = "init",
6621         [EXEC_UTMP_LOGIN] = "login",
6622         [EXEC_UTMP_USER] = "user",
6623 };
6624
6625 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6626
6627 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6628         [EXEC_PRESERVE_NO] = "no",
6629         [EXEC_PRESERVE_YES] = "yes",
6630         [EXEC_PRESERVE_RESTART] = "restart",
6631 };
6632
6633 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6634
6635 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6636 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6637         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6638         [EXEC_DIRECTORY_STATE] = "StateDirectory",
6639         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6640         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6641         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6642 };
6643
6644 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6645
6646 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6647  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6648  * directories, specifically .timer units with their timestamp touch file. */
6649 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6650         [EXEC_DIRECTORY_RUNTIME] = "runtime",
6651         [EXEC_DIRECTORY_STATE] = "state",
6652         [EXEC_DIRECTORY_CACHE] = "cache",
6653         [EXEC_DIRECTORY_LOGS] = "logs",
6654         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6655 };
6656
6657 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6658
6659 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6660  * the service payload in. */
6661 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6662         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6663         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6664         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6665         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6666         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6667 };
6668
6669 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6670
6671 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6672         [EXEC_KEYRING_INHERIT] = "inherit",
6673         [EXEC_KEYRING_PRIVATE] = "private",
6674         [EXEC_KEYRING_SHARED] = "shared",
6675 };
6676
6677 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);