src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/mount.h>
  10 #include <sys/personality.h>
  11 #include <sys/prctl.h>
  12 #include <sys/shm.h>
  13 #include <sys/types.h>
  14 #include <sys/un.h>
  15 #include <unistd.h>
  16 #include <utmpx.h>
  17
  18 #if HAVE_PAM
  19 #include <security/pam_appl.h>
  20 #endif
  21
  22 #if HAVE_SELINUX
  23 #include <selinux/selinux.h>
  24 #endif
  25
  26 #if HAVE_SECCOMP
  27 #include <seccomp.h>
  28 #endif
  29
  30 #if HAVE_APPARMOR
  31 #include <sys/apparmor.h>
  32 #endif
  33
  34 #include "sd-messages.h"
  35
  36 #include "acl-util.h"
  37 #include "af-list.h"
  38 #include "alloc-util.h"
  39 #if HAVE_APPARMOR
  40 #include "apparmor-util.h"
  41 #endif
  42 #include "async.h"
  43 #include "barrier.h"
  44 #include "bpf-lsm.h"
  45 #include "cap-list.h"
  46 #include "capability-util.h"
  47 #include "cgroup-setup.h"
  48 #include "chase-symlinks.h"
  49 #include "chown-recursive.h"
  50 #include "cpu-set-util.h"
  51 #include "creds-util.h"
  52 #include "data-fd-util.h"
  53 #include "def.h"
  54 #include "env-file.h"
  55 #include "env-util.h"
  56 #include "errno-list.h"
  57 #include "escape.h"
  58 #include "execute.h"
  59 #include "exit-status.h"
  60 #include "fd-util.h"
  61 #include "fileio.h"
  62 #include "format-util.h"
  63 #include "glob-util.h"
  64 #include "hexdecoct.h"
  65 #include "io-util.h"
  66 #include "ioprio-util.h"
  67 #include "label.h"
  68 #include "log.h"
  69 #include "macro.h"
  70 #include "manager.h"
  71 #include "manager-dump.h"
  72 #include "memory-util.h"
  73 #include "missing_fs.h"
  74 #include "missing_ioprio.h"
  75 #include "mkdir-label.h"
  76 #include "mount-util.h"
  77 #include "mountpoint-util.h"
  78 #include "namespace.h"
  79 #include "parse-util.h"
  80 #include "path-util.h"
  81 #include "process-util.h"
  82 #include "random-util.h"
  83 #include "recurse-dir.h"
  84 #include "rlimit-util.h"
  85 #include "rm-rf.h"
  86 #if HAVE_SECCOMP
  87 #include "seccomp-util.h"
  88 #endif
  89 #include "securebits-util.h"
  90 #include "selinux-util.h"
  91 #include "signal-util.h"
  92 #include "smack-util.h"
  93 #include "socket-util.h"
  94 #include "special.h"
  95 #include "stat-util.h"
  96 #include "string-table.h"
  97 #include "string-util.h"
  98 #include "strv.h"
  99 #include "syslog-util.h"
 100 #include "terminal-util.h"
 101 #include "tmpfile-util.h"
 102 #include "umask-util.h"
 103 #include "unit-serialize.h"
 104 #include "user-util.h"
 105 #include "utmp-wtmp.h"
 106
  107 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC) /* NOTE(review): not referenced in this part of the file; presumably a wait timeout in microseconds — confirm at the use site */
  108 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC) /* NOTE(review): presumably a second, shorter timeout paired with the one above — confirm at the use site */
  109
  110 #define SNDBUF_SIZE (8*1024*1024) /* Size passed to fd_inc_sndbuf() for the journal stream socket opened in connect_logger_as() */
 111
 112 static int shift_fds(int fds[], size_t n_fds) {
 113         if (n_fds <= 0)
 114                 return 0;
 115
 116         /* Modifies the fds array! (sorts it) */
 117
 118         assert(fds);
 119
 120         for (int start = 0;;) {
 121                 int restart_from = -1;
 122
 123                 for (int i = start; i < (int) n_fds; i++) {
 124                         int nfd;
 125
 126                         /* Already at right index? */
 127                         if (fds[i] == i+3)
 128                                 continue;
 129
 130                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 131                         if (nfd < 0)
 132                                 return -errno;
 133
 134                         safe_close(fds[i]);
 135                         fds[i] = nfd;
 136
 137                         /* Hmm, the fd we wanted isn't free? Then
 138                          * let's remember that and try again from here */
 139                         if (nfd != i+3 && restart_from < 0)
 140                                 restart_from = i;
 141                 }
 142
 143                 if (restart_from < 0)
 144                         break;
 145
 146                 start = restart_from;
 147         }
 148
 149         return 0;
 150 }
 151
 152 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 153         size_t n_fds;
 154         int r;
 155
 156         n_fds = n_socket_fds + n_storage_fds;
 157         if (n_fds <= 0)
 158                 return 0;
 159
 160         assert(fds);
 161
 162         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 163          * O_NONBLOCK only applies to socket activation though. */
 164
 165         for (size_t i = 0; i < n_fds; i++) {
 166
 167                 if (i < n_socket_fds) {
 168                         r = fd_nonblock(fds[i], nonblock);
 169                         if (r < 0)
 170                                 return r;
 171                 }
 172
 173                 /* We unconditionally drop FD_CLOEXEC from the fds,
 174                  * since after all we want to pass these fds to our
 175                  * children */
 176
 177                 r = fd_cloexec(fds[i], false);
 178                 if (r < 0)
 179                         return r;
 180         }
 181
 182         return 0;
 183 }
 184
 185 static const char *exec_context_tty_path(const ExecContext *context) {
 186         assert(context);
 187
 188         if (context->stdio_as_fds)
 189                 return NULL;
 190
 191         if (context->tty_path)
 192                 return context->tty_path;
 193
 194         return "/dev/console";
 195 }
 196
 197 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 198         const char *path;
 199
 200         assert(context);
 201
 202         path = exec_context_tty_path(context);
 203
 204         if (context->tty_vhangup) {
 205                 if (p && p->stdin_fd >= 0)
 206                         (void) terminal_vhangup_fd(p->stdin_fd);
 207                 else if (path)
 208                         (void) terminal_vhangup(path);
 209         }
 210
 211         if (context->tty_reset) {
 212                 if (p && p->stdin_fd >= 0)
 213                         (void) reset_terminal_fd(p->stdin_fd, true);
 214                 else if (path)
 215                         (void) reset_terminal(path);
 216         }
 217
 218         if (p && p->stdin_fd >= 0)
 219                 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
 220
 221         if (context->tty_vt_disallocate && path)
 222                 (void) vt_disallocate(path);
 223 }
 224
 225 static bool is_terminal_input(ExecInput i) {
 226         return IN_SET(i,
 227                       EXEC_INPUT_TTY,
 228                       EXEC_INPUT_TTY_FORCE,
 229                       EXEC_INPUT_TTY_FAIL);
 230 }
 231
 232 static bool is_terminal_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_TTY,
 235                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 236                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 237 }
 238
 239 static bool is_kmsg_output(ExecOutput o) {
 240         return IN_SET(o,
 241                       EXEC_OUTPUT_KMSG,
 242                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 243 }
 244
 245 static bool exec_context_needs_term(const ExecContext *c) {
 246         assert(c);
 247
 248         /* Return true if the execution context suggests we should set $TERM to something useful. */
 249
 250         if (is_terminal_input(c->std_input))
 251                 return true;
 252
 253         if (is_terminal_output(c->std_output))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_error))
 257                 return true;
 258
 259         return !!c->tty_path;
 260 }
 261
 262 static int open_null_as(int flags, int nfd) {
 263         int fd;
 264
 265         assert(nfd >= 0);
 266
 267         fd = open("/dev/null", flags|O_NOCTTY);
 268         if (fd < 0)
 269                 return -errno;
 270
 271         return move_fd(fd, nfd, false);
 272 }
 273
 274 static int connect_journal_socket(
 275                 int fd,
 276                 const char *log_namespace,
 277                 uid_t uid,
 278                 gid_t gid) {
 279
 280         uid_t olduid = UID_INVALID;
 281         gid_t oldgid = GID_INVALID;
 282         const char *j;
 283         int r;
 284
 285         j = log_namespace ?
 286                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 287                 "/run/systemd/journal/stdout";
 288
 289         if (gid_is_valid(gid)) {
 290                 oldgid = getgid();
 291
 292                 if (setegid(gid) < 0)
 293                         return -errno;
 294         }
 295
 296         if (uid_is_valid(uid)) {
 297                 olduid = getuid();
 298
 299                 if (seteuid(uid) < 0) {
 300                         r = -errno;
 301                         goto restore_gid;
 302                 }
 303         }
 304
 305         r = connect_unix_path(fd, AT_FDCWD, j);
 306
 307         /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
 308            an LSM interferes. */
 309
 310         if (uid_is_valid(uid))
 311                 (void) seteuid(olduid);
 312
 313  restore_gid:
 314         if (gid_is_valid(gid))
 315                 (void) setegid(oldgid);
 316
 317         return r;
 318 }
 319
 320 static int connect_logger_as(
 321                 const Unit *unit,
 322                 const ExecContext *context,
 323                 const ExecParameters *params,
 324                 ExecOutput output,
 325                 const char *ident,
 326                 int nfd,
 327                 uid_t uid,
 328                 gid_t gid) {
 329
 330         _cleanup_close_ int fd = -1;
 331         int r;
 332
 333         assert(context);
 334         assert(params);
 335         assert(output < _EXEC_OUTPUT_MAX);
 336         assert(ident);
 337         assert(nfd >= 0);
 338
 339         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 344         if (r < 0)
 345                 return r;
 346
 347         if (shutdown(fd, SHUT_RD) < 0)
 348                 return -errno;
 349
 350         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 351
 352         if (dprintf(fd,
 353                 "%s\n"
 354                 "%s\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n",
 360                 context->syslog_identifier ?: ident,
 361                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 362                 context->syslog_priority,
 363                 !!context->syslog_level_prefix,
 364                 false,
 365                 is_kmsg_output(output),
 366                 is_terminal_output(output)) < 0)
 367                 return -errno;
 368
 369         return move_fd(TAKE_FD(fd), nfd, false);
 370 }
 371
 372 static int open_terminal_as(const char *path, int flags, int nfd) {
 373         int fd;
 374
 375         assert(path);
 376         assert(nfd >= 0);
 377
 378         fd = open_terminal(path, flags | O_NOCTTY);
 379         if (fd < 0)
 380                 return fd;
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384
 385 static int acquire_path(const char *path, int flags, mode_t mode) {
 386         _cleanup_close_ int fd = -1;
 387         int r;
 388
 389         assert(path);
 390
 391         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 392                 flags |= O_CREAT;
 393
 394         fd = open(path, flags|O_NOCTTY, mode);
 395         if (fd >= 0)
 396                 return TAKE_FD(fd);
 397
 398         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 399                 return -errno;
 400
 401         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 402
 403         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 404         if (fd < 0)
 405                 return -errno;
 406
 407         r = connect_unix_path(fd, AT_FDCWD, path);
 408         if (IN_SET(r, -ENOTSOCK, -EINVAL))
 409                 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
 410                  * wasn't an AF_UNIX socket after all */
 411                 return -ENXIO;
 412         if (r < 0)
 413                 return r;
 414
 415         if ((flags & O_ACCMODE) == O_RDONLY)
 416                 r = shutdown(fd, SHUT_WR);
 417         else if ((flags & O_ACCMODE) == O_WRONLY)
 418                 r = shutdown(fd, SHUT_RD);
 419         else
 420                 r = 0;
 421         if (r < 0)
 422                 return -errno;
 423
 424         return TAKE_FD(fd);
 425 }
 426
 427 static int fixup_input(
 428                 const ExecContext *context,
 429                 int socket_fd,
 430                 bool apply_tty_stdin) {
 431
 432         ExecInput std_input;
 433
 434         assert(context);
 435
 436         std_input = context->std_input;
 437
 438         if (is_terminal_input(std_input) && !apply_tty_stdin)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 445                 return EXEC_INPUT_NULL;
 446
 447         return std_input;
 448 }
 449
 450 static int fixup_output(ExecOutput output, int socket_fd) {
 451
 452         if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 453                 return EXEC_OUTPUT_INHERIT;
 454
 455         return output;
 456 }
 457
 458 static int setup_input(
 459                 const ExecContext *context,
 460                 const ExecParameters *params,
 461                 int socket_fd,
 462                 const int named_iofds[static 3]) {
 463
 464         ExecInput i;
 465         int r;
 466
 467         assert(context);
 468         assert(params);
 469         assert(named_iofds);
 470
 471         if (params->stdin_fd >= 0) {
 472                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 473                         return -errno;
 474
 475                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 476                 if (isatty(STDIN_FILENO)) {
 477                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 478                         (void) reset_terminal_fd(STDIN_FILENO, true);
 479                         (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
 480                 }
 481
 482                 return STDIN_FILENO;
 483         }
 484
 485         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 486
 487         switch (i) {
 488
 489         case EXEC_INPUT_NULL:
 490                 return open_null_as(O_RDONLY, STDIN_FILENO);
 491
 492         case EXEC_INPUT_TTY:
 493         case EXEC_INPUT_TTY_FORCE:
 494         case EXEC_INPUT_TTY_FAIL: {
 495                 int fd;
 496
 497                 fd = acquire_terminal(exec_context_tty_path(context),
 498                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 499                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 500                                                                   ACQUIRE_TERMINAL_WAIT,
 501                                       USEC_INFINITY);
 502                 if (fd < 0)
 503                         return fd;
 504
 505                 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
 506                 if (r < 0)
 507                         return r;
 508
 509                 return move_fd(fd, STDIN_FILENO, false);
 510         }
 511
 512         case EXEC_INPUT_SOCKET:
 513                 assert(socket_fd >= 0);
 514
 515                 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
 516
 517         case EXEC_INPUT_NAMED_FD:
 518                 assert(named_iofds[STDIN_FILENO] >= 0);
 519
 520                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 521                 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
 522
 523         case EXEC_INPUT_DATA: {
 524                 int fd;
 525
 526                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 527                 if (fd < 0)
 528                         return fd;
 529
 530                 return move_fd(fd, STDIN_FILENO, false);
 531         }
 532
 533         case EXEC_INPUT_FILE: {
 534                 bool rw;
 535                 int fd;
 536
 537                 assert(context->stdio_file[STDIN_FILENO]);
 538
 539                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 540                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 541
 542                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 543                 if (fd < 0)
 544                         return fd;
 545
 546                 return move_fd(fd, STDIN_FILENO, false);
 547         }
 548
 549         default:
 550                 assert_not_reached();
 551         }
 552 }
 553
 554 static bool can_inherit_stderr_from_stdout(
 555                 const ExecContext *context,
 556                 ExecOutput o,
 557                 ExecOutput e) {
 558
 559         assert(context);
 560
 561         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 562          * stderr fd */
 563
 564         if (e == EXEC_OUTPUT_INHERIT)
 565                 return true;
 566         if (e != o)
 567                 return false;
 568
 569         if (e == EXEC_OUTPUT_NAMED_FD)
 570                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 571
 572         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
 573                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 574
 575         return true;
 576 }
 577
 578 static int setup_output(
 579                 const Unit *unit,
 580                 const ExecContext *context,
 581                 const ExecParameters *params,
 582                 int fileno,
 583                 int socket_fd,
 584                 const int named_iofds[static 3],
 585                 const char *ident,
 586                 uid_t uid,
 587                 gid_t gid,
 588                 dev_t *journal_stream_dev,
 589                 ino_t *journal_stream_ino) {
 590
 591         ExecOutput o;
 592         ExecInput i;
 593         int r;
 594
 595         assert(unit);
 596         assert(context);
 597         assert(params);
 598         assert(ident);
 599         assert(journal_stream_dev);
 600         assert(journal_stream_ino);
 601
 602         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 603
 604                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 605                         return -errno;
 606
 607                 return STDOUT_FILENO;
 608         }
 609
 610         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 611                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 612                         return -errno;
 613
 614                 return STDERR_FILENO;
 615         }
 616
 617         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 618         o = fixup_output(context->std_output, socket_fd);
 619
 620         if (fileno == STDERR_FILENO) {
 621                 ExecOutput e;
 622                 e = fixup_output(context->std_error, socket_fd);
 623
 624                 /* This expects the input and output are already set up */
 625
 626                 /* Don't change the stderr file descriptor if we inherit all
 627                  * the way and are not on a tty */
 628                 if (e == EXEC_OUTPUT_INHERIT &&
 629                     o == EXEC_OUTPUT_INHERIT &&
 630                     i == EXEC_INPUT_NULL &&
 631                     !is_terminal_input(context->std_input) &&
 632                     getppid() != 1)
 633                         return fileno;
 634
 635                 /* Duplicate from stdout if possible */
 636                 if (can_inherit_stderr_from_stdout(context, o, e))
 637                         return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
 638
 639                 o = e;
 640
 641         } else if (o == EXEC_OUTPUT_INHERIT) {
 642                 /* If input got downgraded, inherit the original value */
 643                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 644                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 645
 646                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 647                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 648                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 649
 650                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 651                 if (getppid() != 1)
 652                         return fileno;
 653
 654                 /* We need to open /dev/null here anew, to get the right access mode. */
 655                 return open_null_as(O_WRONLY, fileno);
 656         }
 657
 658         switch (o) {
 659
 660         case EXEC_OUTPUT_NULL:
 661                 return open_null_as(O_WRONLY, fileno);
 662
 663         case EXEC_OUTPUT_TTY:
 664                 if (is_terminal_input(i))
 665                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 666
 667                 /* We don't reset the terminal if this is just about output */
 668                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 669
 670         case EXEC_OUTPUT_KMSG:
 671         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 672         case EXEC_OUTPUT_JOURNAL:
 673         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 674                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 675                 if (r < 0) {
 676                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
 677                                                fileno == STDOUT_FILENO ? "stdout" : "stderr");
 678                         r = open_null_as(O_WRONLY, fileno);
 679                 } else {
 680                         struct stat st;
 681
 682                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 683                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 684                          * services to detect whether they are connected to the journal or not.
 685                          *
 686                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 687                          * about STDERR as that's usually the best way to do logging. */
 688
 689                         if (fstat(fileno, &st) >= 0 &&
 690                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 691                                 *journal_stream_dev = st.st_dev;
 692                                 *journal_stream_ino = st.st_ino;
 693                         }
 694                 }
 695                 return r;
 696
 697         case EXEC_OUTPUT_SOCKET:
 698                 assert(socket_fd >= 0);
 699
 700                 return RET_NERRNO(dup2(socket_fd, fileno));
 701
 702         case EXEC_OUTPUT_NAMED_FD:
 703                 assert(named_iofds[fileno] >= 0);
 704
 705                 (void) fd_nonblock(named_iofds[fileno], false);
 706                 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
 707
 708         case EXEC_OUTPUT_FILE:
 709         case EXEC_OUTPUT_FILE_APPEND:
 710         case EXEC_OUTPUT_FILE_TRUNCATE: {
 711                 bool rw;
 712                 int fd, flags;
 713
 714                 assert(context->stdio_file[fileno]);
 715
 716                 rw = context->std_input == EXEC_INPUT_FILE &&
 717                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 718
 719                 if (rw)
 720                         return RET_NERRNO(dup2(STDIN_FILENO, fileno));
 721
 722                 flags = O_WRONLY;
 723                 if (o == EXEC_OUTPUT_FILE_APPEND)
 724                         flags |= O_APPEND;
 725                 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
 726                         flags |= O_TRUNC;
 727
 728                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 729                 if (fd < 0)
 730                         return fd;
 731
 732                 return move_fd(fd, fileno, 0);
 733         }
 734
 735         default:
 736                 assert_not_reached();
 737         }
 738 }
 739
 740 static int chown_terminal(int fd, uid_t uid) {
 741         int r;
 742
 743         assert(fd >= 0);
 744
 745         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 746         if (isatty(fd) < 1) {
 747                 if (IN_SET(errno, EINVAL, ENOTTY))
 748                         return 0; /* not a tty */
 749
 750                 return -errno;
 751         }
 752
 753         /* This might fail. What matters are the results. */
 754         r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
 755         if (r < 0)
 756                 return r;
 757
 758         return 1;
 759 }
 760
 761 static int setup_confirm_stdio(
 762                 const ExecContext *context,
 763                 const char *vc,
 764                 int *ret_saved_stdin,
 765                 int *ret_saved_stdout) {
 766
 767         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 768         int r;
 769
 770         assert(ret_saved_stdin);
 771         assert(ret_saved_stdout);
 772
 773         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 774         if (saved_stdin < 0)
 775                 return -errno;
 776
 777         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 778         if (saved_stdout < 0)
 779                 return -errno;
 780
 781         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 782         if (fd < 0)
 783                 return fd;
 784
 785         r = chown_terminal(fd, getuid());
 786         if (r < 0)
 787                 return r;
 788
 789         r = reset_terminal_fd(fd, true);
 790         if (r < 0)
 791                 return r;
 792
 793         r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
 794         if (r < 0)
 795                 return r;
 796
 797         r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
 798         TAKE_FD(fd);
 799         if (r < 0)
 800                 return r;
 801
 802         *ret_saved_stdin = TAKE_FD(saved_stdin);
 803         *ret_saved_stdout = TAKE_FD(saved_stdout);
 804         return 0;
 805 }
 806
 807 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 808         assert(err < 0);
 809
 810         if (err == -ETIMEDOUT)
 811                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 812         else {
 813                 errno = -err;
 814                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 815         }
 816 }
 817
 818 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 819         _cleanup_close_ int fd = -1;
 820
 821         assert(vc);
 822
 823         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 824         if (fd < 0)
 825                 return;
 826
 827         write_confirm_error_fd(err, fd, u);
 828 }
 829
 830 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 831         int r = 0;
 832
 833         assert(saved_stdin);
 834         assert(saved_stdout);
 835
 836         release_terminal();
 837
 838         if (*saved_stdin >= 0)
 839                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 840                         r = -errno;
 841
 842         if (*saved_stdout >= 0)
 843                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 844                         r = -errno;
 845
 846         *saved_stdin = safe_close(*saved_stdin);
 847         *saved_stdout = safe_close(*saved_stdout);
 848
 849         return r;
 850 }
 851
  852 enum { /* Possible results of ask_for_confirmation() */
  853         CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
  854         CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
  855         CONFIRM_EXECUTE = 1, /* 'y': execute the command; also used when asking fails internally */
  856 };
 857
 858 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
 859         int saved_stdout = -1, saved_stdin = -1, r;
 860         _cleanup_free_ char *e = NULL;
 861         char c;
 862
 863         /* For any internal errors, assume a positive response. */
 864         r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
 865         if (r < 0) {
 866                 write_confirm_error(r, vc, u);
 867                 return CONFIRM_EXECUTE;
 868         }
 869
 870         /* confirm_spawn might have been disabled while we were sleeping. */
 871         if (manager_is_confirm_spawn_disabled(u->manager)) {
 872                 r = 1;
 873                 goto restore_stdio;
 874         }
 875
 876         e = ellipsize(cmdline, 60, 100);
 877         if (!e) {
 878                 log_oom();
 879                 r = CONFIRM_EXECUTE;
 880                 goto restore_stdio;
 881         }
 882
 883         for (;;) {
 884                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 885                 if (r < 0) {
 886                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 887                         r = CONFIRM_EXECUTE;
 888                         goto restore_stdio;
 889                 }
 890
 891                 switch (c) {
 892                 case 'c':
 893                         printf("Resuming normal execution.\n");
 894                         manager_disable_confirm_spawn();
 895                         r = 1;
 896                         break;
 897                 case 'D':
 898                         unit_dump(u, stdout, "  ");
 899                         continue; /* ask again */
 900                 case 'f':
 901                         printf("Failing execution.\n");
 902                         r = CONFIRM_PRETEND_FAILURE;
 903                         break;
 904                 case 'h':
 905                         printf("  c - continue, proceed without asking anymore\n"
 906                                "  D - dump, show the state of the unit\n"
 907                                "  f - fail, don't execute the command and pretend it failed\n"
 908                                "  h - help\n"
 909                                "  i - info, show a short summary of the unit\n"
 910                                "  j - jobs, show jobs that are in progress\n"
 911                                "  s - skip, don't execute the command and pretend it succeeded\n"
 912                                "  y - yes, execute the command\n");
 913                         continue; /* ask again */
 914                 case 'i':
 915                         printf("  Description: %s\n"
 916                                "  Unit:        %s\n"
 917                                "  Command:     %s\n",
 918                                u->id, u->description, cmdline);
 919                         continue; /* ask again */
 920                 case 'j':
 921                         manager_dump_jobs(u->manager, stdout, "  ");
 922                         continue; /* ask again */
 923                 case 'n':
 924                         /* 'n' was removed in favor of 'f'. */
 925                         printf("Didn't understand 'n', did you mean 'f'?\n");
 926                         continue; /* ask again */
 927                 case 's':
 928                         printf("Skipping execution.\n");
 929                         r = CONFIRM_PRETEND_SUCCESS;
 930                         break;
 931                 case 'y':
 932                         r = CONFIRM_EXECUTE;
 933                         break;
 934                 default:
 935                         assert_not_reached();
 936                 }
 937                 break;
 938         }
 939
 940 restore_stdio:
 941         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 942         return r;
 943 }
 944
 945 static int get_fixed_user(const ExecContext *c, const char **user,
 946                           uid_t *uid, gid_t *gid,
 947                           const char **home, const char **shell) {
 948         int r;
 949         const char *name;
 950
 951         assert(c);
 952
 953         if (!c->user)
 954                 return 0;
 955
 956         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 957          * (i.e. are "/" or "/bin/nologin"). */
 958
 959         name = c->user;
 960         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 961         if (r < 0)
 962                 return r;
 963
 964         *user = name;
 965         return 0;
 966 }
 967
 968 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 969         int r;
 970         const char *name;
 971
 972         assert(c);
 973
 974         if (!c->group)
 975                 return 0;
 976
 977         name = c->group;
 978         r = get_group_creds(&name, gid, 0);
 979         if (r < 0)
 980                 return r;
 981
 982         *group = name;
 983         return 0;
 984 }
 985
 986 static int get_supplementary_groups(const ExecContext *c, const char *user,
 987                                     const char *group, gid_t gid,
 988                                     gid_t **supplementary_gids, int *ngids) {
 989         int r, k = 0;
 990         int ngroups_max;
 991         bool keep_groups = false;
 992         gid_t *groups = NULL;
 993         _cleanup_free_ gid_t *l_gids = NULL;
 994
 995         assert(c);
 996
 997         /*
 998          * If user is given, then lookup GID and supplementary groups list.
 999          * We avoid NSS lookups for gid=0. Also we have to initialize groups
1000          * here and as early as possible so we keep the list of supplementary
1001          * groups of the caller.
1002          */
1003         if (user && gid_is_valid(gid) && gid != 0) {
1004                 /* First step, initialize groups from /etc/groups */
1005                 if (initgroups(user, gid) < 0)
1006                         return -errno;
1007
1008                 keep_groups = true;
1009         }
1010
1011         if (strv_isempty(c->supplementary_groups))
1012                 return 0;
1013
1014         /*
1015          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016          * be positive, otherwise fail.
1017          */
1018         errno = 0;
1019         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1020         if (ngroups_max <= 0)
1021                 return errno_or_else(EOPNOTSUPP);
1022
1023         l_gids = new(gid_t, ngroups_max);
1024         if (!l_gids)
1025                 return -ENOMEM;
1026
1027         if (keep_groups) {
1028                 /*
1029                  * Lookup the list of groups that the user belongs to, we
1030                  * avoid NSS lookups here too for gid=0.
1031                  */
1032                 k = ngroups_max;
1033                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1034                         return -EINVAL;
1035         } else
1036                 k = 0;
1037
1038         STRV_FOREACH(i, c->supplementary_groups) {
1039                 const char *g;
1040
1041                 if (k >= ngroups_max)
1042                         return -E2BIG;
1043
1044                 g = *i;
1045                 r = get_group_creds(&g, l_gids+k, 0);
1046                 if (r < 0)
1047                         return r;
1048
1049                 k++;
1050         }
1051
1052         /*
1053          * Sets ngids to zero to drop all supplementary groups, happens
1054          * when we are under root and SupplementaryGroups= is empty.
1055          */
1056         if (k == 0) {
1057                 *ngids = 0;
1058                 return 0;
1059         }
1060
1061         /* Otherwise get the final list of supplementary groups */
1062         groups = memdup(l_gids, sizeof(gid_t) * k);
1063         if (!groups)
1064                 return -ENOMEM;
1065
1066         *supplementary_gids = groups;
1067         *ngids = k;
1068
1069         groups = NULL;
1070
1071         return 0;
1072 }
1073
/* Applies the supplementary groups (if any were passed) and then the primary GID (if valid).
 *
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set real, effective and saved GID in one go */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1092
/* Updates the securebits of the calling process: every bit in 'mask' is cleared, then 'bits' is OR-ed in.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if a new value was
 * installed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        wanted = (before & ~mask) | bits;
        if (wanted == before)
                return 0; /* already in the requested state */

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1106
1107 static int enforce_user(const ExecContext *context, uid_t uid) {
1108         assert(context);
1109         int r;
1110
1111         if (!uid_is_valid(uid))
1112                 return 0;
1113
1114         /* Sets (but doesn't look up) the uid and make sure we keep the
1115          * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116          * required, so we also need keep-caps in this case.
1117          */
1118
1119         if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1120
1121                 /* First step: If we need to keep capabilities but
1122                  * drop privileges we need to make sure we keep our
1123                  * caps, while we drop privileges. */
1124                 if (uid != 0) {
1125                         /* Add KEEP_CAPS to the securebits */
1126                         r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1127                         if (r < 0)
1128                                 return r;
1129                 }
1130         }
1131
1132         /* Second step: actually set the uids */
1133         if (setresuid(uid, uid, uid) < 0)
1134                 return -errno;
1135
1136         /* At this point we should have all necessary capabilities but
1137            are otherwise a normal user. However, the caps might got
1138            corrupted due to the setresuid() so we need clean them up
1139            later. This is done outside of this call. */
1140
1141         return 0;
1142 }
1143
1144 #if HAVE_PAM
1145
1146 static int null_conv(
1147                 int num_msg,
1148                 const struct pam_message **msg,
1149                 struct pam_response **resp,
1150                 void *appdata_ptr) {
1151
1152         /* We don't support conversations */
1153
1154         return PAM_CONV_ERR;
1155 }
1156
1157 #endif
1158
/* Opens a PAM session for 'user' under the PAM service 'name' and forks off a helper process that
 * tears the session down again later.
 *
 * name, user   PAM service name and user name handed to pam_start().
 * uid, gid     credentials the helper child switches to after the fork.
 * tty          TTY to report to PAM; if NULL the TTY of stdin is used when one can be determined.
 * env          environment block; its entries are exported into PAM, and on success it is replaced by
 *              the environment list PAM returns.
 * fds, n_fds   file descriptors that are closed in the helper child.
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed, or another
 * negative errno-style value if barrier creation or the fork failed. Without HAVE_PAM this is a no-op
 * returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* A failure to establish credentials is deliberately not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases.  If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number, it
                                 * neither returns a negative value nor sets errno. Hence check the
                                 * return value itself, so that 'sig' is only looked at after a
                                 * successful wait. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1375
/* Renames the current process to "(NAME)", where NAME is derived from the last component of 'path'.
 *
 * The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in /bin/ps,
 * hence at most the final 8 characters of the base name are used. If the base name is empty the process
 * is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1406
1407 static bool context_has_address_families(const ExecContext *c) {
1408         assert(c);
1409
1410         return c->address_families_allow_list ||
1411                 !set_isempty(c->address_families);
1412 }
1413
1414 static bool context_has_syscall_filters(const ExecContext *c) {
1415         assert(c);
1416
1417         return c->syscall_allow_list ||
1418                 !hashmap_isempty(c->syscall_filter);
1419 }
1420
1421 static bool context_has_syscall_logs(const ExecContext *c) {
1422         assert(c);
1423
1424         return c->syscall_log_allow_list ||
1425                 !hashmap_isempty(c->syscall_log);
1426 }
1427
1428 static bool context_has_no_new_privileges(const ExecContext *c) {
1429         assert(c);
1430
1431         if (c->no_new_privileges)
1432                 return true;
1433
1434         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435                 return false;
1436
1437         /* We need NNP if we have any form of seccomp and are unprivileged */
1438         return c->lock_personality ||
1439                 c->memory_deny_write_execute ||
1440                 c->private_devices ||
1441                 c->protect_clock ||
1442                 c->protect_hostname ||
1443                 c->protect_kernel_tunables ||
1444                 c->protect_kernel_modules ||
1445                 c->protect_kernel_logs ||
1446                 context_has_address_families(c) ||
1447                 exec_context_restrict_namespaces_set(c) ||
1448                 c->restrict_realtime ||
1449                 c->restrict_suid_sgid ||
1450                 !set_isempty(c->syscall_archs) ||
1451                 context_has_syscall_filters(c) ||
1452                 context_has_syscall_logs(c);
1453 }
1454
1455 static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457         assert(context);
1458
1459         return !hashmap_isempty(context->set_credentials) ||
1460                 !hashmap_isempty(context->load_credentials);
1461 }
1462
1463 #if HAVE_SECCOMP
1464
1465 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1466
1467         if (is_seccomp_available())
1468                 return false;
1469
1470         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1471         return true;
1472 }
1473
1474 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1475         uint32_t negative_action, default_action, action;
1476         int r;
1477
1478         assert(u);
1479         assert(c);
1480
1481         if (!context_has_syscall_filters(c))
1482                 return 0;
1483
1484         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485                 return 0;
1486
1487         negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1488
1489         if (c->syscall_allow_list) {
1490                 default_action = negative_action;
1491                 action = SCMP_ACT_ALLOW;
1492         } else {
1493                 default_action = SCMP_ACT_ALLOW;
1494                 action = negative_action;
1495         }
1496
1497         if (needs_ambient_hack) {
1498                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1499                 if (r < 0)
1500                         return r;
1501         }
1502
1503         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1504 }
1505
1506 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507 #ifdef SCMP_ACT_LOG
1508         uint32_t default_action, action;
1509 #endif
1510
1511         assert(u);
1512         assert(c);
1513
1514         if (!context_has_syscall_logs(c))
1515                 return 0;
1516
1517 #ifdef SCMP_ACT_LOG
1518         if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519                 return 0;
1520
1521         if (c->syscall_log_allow_list) {
1522                 /* Log nothing but the ones listed */
1523                 default_action = SCMP_ACT_ALLOW;
1524                 action = SCMP_ACT_LOG;
1525         } else {
1526                 /* Log everything but the ones listed */
1527                 default_action = SCMP_ACT_LOG;
1528                 action = SCMP_ACT_ALLOW;
1529         }
1530
1531         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532 #else
1533         /* old libseccomp */
1534         log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535         return 0;
1536 #endif
1537 }
1538
1539 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540         assert(u);
1541         assert(c);
1542
1543         if (set_isempty(c->syscall_archs))
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547                 return 0;
1548
1549         return seccomp_restrict_archs(c->syscall_archs);
1550 }
1551
1552 static int apply_address_families(const Unit* u, const ExecContext *c) {
1553         assert(u);
1554         assert(c);
1555
1556         if (!context_has_address_families(c))
1557                 return 0;
1558
1559         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560                 return 0;
1561
1562         return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1563 }
1564
1565 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1566         assert(u);
1567         assert(c);
1568
1569         if (!c->memory_deny_write_execute)
1570                 return 0;
1571
1572         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573                 return 0;
1574
1575         return seccomp_memory_deny_write_execute();
1576 }
1577
1578 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1579         assert(u);
1580         assert(c);
1581
1582         if (!c->restrict_realtime)
1583                 return 0;
1584
1585         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586                 return 0;
1587
1588         return seccomp_restrict_realtime();
1589 }
1590
1591 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592         assert(u);
1593         assert(c);
1594
1595         if (!c->restrict_suid_sgid)
1596                 return 0;
1597
1598         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599                 return 0;
1600
1601         return seccomp_restrict_suid_sgid();
1602 }
1603
1604 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1605         assert(u);
1606         assert(c);
1607
1608         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609          * let's protect even those systems where this is left on in the kernel. */
1610
1611         if (!c->protect_kernel_tunables)
1612                 return 0;
1613
1614         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615                 return 0;
1616
1617         return seccomp_protect_sysctl();
1618 }
1619
1620 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1621         assert(u);
1622         assert(c);
1623
1624         /* Turn off module syscalls on ProtectKernelModules=yes */
1625
1626         if (!c->protect_kernel_modules)
1627                 return 0;
1628
1629         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630                 return 0;
1631
1632         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1633 }
1634
1635 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636         assert(u);
1637         assert(c);
1638
1639         if (!c->protect_kernel_logs)
1640                 return 0;
1641
1642         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643                 return 0;
1644
1645         return seccomp_protect_syslog();
1646 }
1647
1648 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1649         assert(u);
1650         assert(c);
1651
1652         if (!c->protect_clock)
1653                 return 0;
1654
1655         if (skip_seccomp_unavailable(u, "ProtectClock="))
1656                 return 0;
1657
1658         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659 }
1660
1661 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1662         assert(u);
1663         assert(c);
1664
1665         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1666
1667         if (!c->private_devices)
1668                 return 0;
1669
1670         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671                 return 0;
1672
1673         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1674 }
1675
1676 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1677         assert(u);
1678         assert(c);
1679
1680         if (!exec_context_restrict_namespaces_set(c))
1681                 return 0;
1682
1683         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684                 return 0;
1685
1686         return seccomp_restrict_namespaces(c->restrict_namespaces);
1687 }
1688
1689 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1690         unsigned long personality;
1691         int r;
1692
1693         assert(u);
1694         assert(c);
1695
1696         if (!c->lock_personality)
1697                 return 0;
1698
1699         if (skip_seccomp_unavailable(u, "LockPersonality="))
1700                 return 0;
1701
1702         personality = c->personality;
1703
1704         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1705         if (personality == PERSONALITY_INVALID) {
1706
1707                 r = opinionated_personality(&personality);
1708                 if (r < 0)
1709                         return r;
1710         }
1711
1712         return seccomp_lock_personality(personality);
1713 }
1714
1715 #endif
1716
1717 #if HAVE_LIBBPF
1718 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1719         assert(u);
1720         assert(c);
1721
1722         if (!exec_context_restrict_filesystems_set(c))
1723                 return 0;
1724
1725         if (!u->manager->restrict_fs) {
1726                 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1727                 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1728                 return 0;
1729         }
1730
1731         return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1732 }
1733 #endif
1734
1735 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1736         assert(u);
1737         assert(c);
1738
1739         if (!c->protect_hostname)
1740                 return 0;
1741
1742         if (ns_type_supported(NAMESPACE_UTS)) {
1743                 if (unshare(CLONE_NEWUTS) < 0) {
1744                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1745                                 *ret_exit_status = EXIT_NAMESPACE;
1746                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1747                         }
1748
1749                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1750                 }
1751         } else
1752                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1753
1754 #if HAVE_SECCOMP
1755         int r;
1756
1757         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1758                 return 0;
1759
1760         r = seccomp_protect_hostname();
1761         if (r < 0) {
1762                 *ret_exit_status = EXIT_SECCOMP;
1763                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1764         }
1765 #endif
1766
1767         return 0;
1768 }
1769
1770 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1771         assert(idle_pipe);
1772
1773         idle_pipe[1] = safe_close(idle_pipe[1]);
1774         idle_pipe[2] = safe_close(idle_pipe[2]);
1775
1776         if (idle_pipe[0] >= 0) {
1777                 int r;
1778
1779                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1780
1781                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1782                         ssize_t n;
1783
1784                         /* Signal systemd that we are bored and want to continue. */
1785                         n = write(idle_pipe[3], "x", 1);
1786                         if (n > 0)
1787                                 /* Wait for systemd to react to the signal above. */
1788                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1789                 }
1790
1791                 idle_pipe[0] = safe_close(idle_pipe[0]);
1792
1793         }
1794
1795         idle_pipe[3] = safe_close(idle_pipe[3]);
1796 }
1797
1798 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1799
1800 static int build_environment(
1801                 const Unit *u,
1802                 const ExecContext *c,
1803                 const ExecParameters *p,
1804                 size_t n_fds,
1805                 const char *home,
1806                 const char *username,
1807                 const char *shell,
1808                 dev_t journal_stream_dev,
1809                 ino_t journal_stream_ino,
1810                 char ***ret) {
1811
1812         _cleanup_strv_free_ char **our_env = NULL;
1813         size_t n_env = 0;
1814         char *x;
1815
1816         assert(u);
1817         assert(c);
1818         assert(p);
1819         assert(ret);
1820
1821 #define N_ENV_VARS 17
1822         our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1823         if (!our_env)
1824                 return -ENOMEM;
1825
1826         if (n_fds > 0) {
1827                 _cleanup_free_ char *joined = NULL;
1828
1829                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1830                         return -ENOMEM;
1831                 our_env[n_env++] = x;
1832
1833                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1834                         return -ENOMEM;
1835                 our_env[n_env++] = x;
1836
1837                 joined = strv_join(p->fd_names, ":");
1838                 if (!joined)
1839                         return -ENOMEM;
1840
1841                 x = strjoin("LISTEN_FDNAMES=", joined);
1842                 if (!x)
1843                         return -ENOMEM;
1844                 our_env[n_env++] = x;
1845         }
1846
1847         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1848                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1849                         return -ENOMEM;
1850                 our_env[n_env++] = x;
1851
1852                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1853                         return -ENOMEM;
1854                 our_env[n_env++] = x;
1855         }
1856
1857         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1858          * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1859          * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1860         if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1861                 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1862                 if (!x)
1863                         return -ENOMEM;
1864                 our_env[n_env++] = x;
1865         }
1866
1867         if (home) {
1868                 x = strjoin("HOME=", home);
1869                 if (!x)
1870                         return -ENOMEM;
1871
1872                 path_simplify(x + 5);
1873                 our_env[n_env++] = x;
1874         }
1875
1876         if (username) {
1877                 x = strjoin("LOGNAME=", username);
1878                 if (!x)
1879                         return -ENOMEM;
1880                 our_env[n_env++] = x;
1881
1882                 x = strjoin("USER=", username);
1883                 if (!x)
1884                         return -ENOMEM;
1885                 our_env[n_env++] = x;
1886         }
1887
1888         if (shell) {
1889                 x = strjoin("SHELL=", shell);
1890                 if (!x)
1891                         return -ENOMEM;
1892
1893                 path_simplify(x + 6);
1894                 our_env[n_env++] = x;
1895         }
1896
1897         if (!sd_id128_is_null(u->invocation_id)) {
1898                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1899                         return -ENOMEM;
1900
1901                 our_env[n_env++] = x;
1902         }
1903
1904         if (exec_context_needs_term(c)) {
1905                 const char *tty_path, *term = NULL;
1906
1907                 tty_path = exec_context_tty_path(c);
1908
1909                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1910                  * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1911                  * container manager passes to PID 1 ends up all the way in the console login shown. */
1912
1913                 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1914                         term = getenv("TERM");
1915
1916                 if (!term)
1917                         term = default_term_for_tty(tty_path);
1918
1919                 x = strjoin("TERM=", term);
1920                 if (!x)
1921                         return -ENOMEM;
1922                 our_env[n_env++] = x;
1923         }
1924
1925         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1926                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1927                         return -ENOMEM;
1928
1929                 our_env[n_env++] = x;
1930         }
1931
1932         if (c->log_namespace) {
1933                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1934                 if (!x)
1935                         return -ENOMEM;
1936
1937                 our_env[n_env++] = x;
1938         }
1939
1940         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1941                 _cleanup_free_ char *joined = NULL;
1942                 const char *n;
1943
1944                 if (!p->prefix[t])
1945                         continue;
1946
1947                 if (c->directories[t].n_items == 0)
1948                         continue;
1949
1950                 n = exec_directory_env_name_to_string(t);
1951                 if (!n)
1952                         continue;
1953
1954                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1955                         _cleanup_free_ char *prefixed = NULL;
1956
1957                         prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1958                         if (!prefixed)
1959                                 return -ENOMEM;
1960
1961                         if (!strextend_with_separator(&joined, ":", prefixed))
1962                                 return -ENOMEM;
1963                 }
1964
1965                 x = strjoin(n, "=", joined);
1966                 if (!x)
1967                         return -ENOMEM;
1968
1969                 our_env[n_env++] = x;
1970         }
1971
1972         if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1973                 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1974                 if (!x)
1975                         return -ENOMEM;
1976
1977                 our_env[n_env++] = x;
1978         }
1979
1980         if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1981                 return -ENOMEM;
1982
1983         our_env[n_env++] = x;
1984
1985         our_env[n_env++] = NULL;
1986         assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1987 #undef N_ENV_VARS
1988
1989         *ret = TAKE_PTR(our_env);
1990
1991         return 0;
1992 }
1993
1994 static int build_pass_environment(const ExecContext *c, char ***ret) {
1995         _cleanup_strv_free_ char **pass_env = NULL;
1996         size_t n_env = 0;
1997
1998         STRV_FOREACH(i, c->pass_environment) {
1999                 _cleanup_free_ char *x = NULL;
2000                 char *v;
2001
2002                 v = getenv(*i);
2003                 if (!v)
2004                         continue;
2005                 x = strjoin(*i, "=", v);
2006                 if (!x)
2007                         return -ENOMEM;
2008
2009                 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2010                         return -ENOMEM;
2011
2012                 pass_env[n_env++] = TAKE_PTR(x);
2013                 pass_env[n_env] = NULL;
2014         }
2015
2016         *ret = TAKE_PTR(pass_env);
2017
2018         return 0;
2019 }
2020
2021 bool exec_needs_mount_namespace(
2022                 const ExecContext *context,
2023                 const ExecParameters *params,
2024                 const ExecRuntime *runtime) {
2025
2026         assert(context);
2027
2028         if (context->root_image)
2029                 return true;
2030
2031         if (!strv_isempty(context->read_write_paths) ||
2032             !strv_isempty(context->read_only_paths) ||
2033             !strv_isempty(context->inaccessible_paths) ||
2034             !strv_isempty(context->exec_paths) ||
2035             !strv_isempty(context->no_exec_paths))
2036                 return true;
2037
2038         if (context->n_bind_mounts > 0)
2039                 return true;
2040
2041         if (context->n_temporary_filesystems > 0)
2042                 return true;
2043
2044         if (context->n_mount_images > 0)
2045                 return true;
2046
2047         if (context->n_extension_images > 0)
2048                 return true;
2049
2050         if (!strv_isempty(context->extension_directories))
2051                 return true;
2052
2053         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2054                 return true;
2055
2056         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2057                 return true;
2058
2059         if (context->private_devices ||
2060             context->private_mounts ||
2061             context->protect_system != PROTECT_SYSTEM_NO ||
2062             context->protect_home != PROTECT_HOME_NO ||
2063             context->protect_kernel_tunables ||
2064             context->protect_kernel_modules ||
2065             context->protect_kernel_logs ||
2066             context->protect_control_groups ||
2067             context->protect_proc != PROTECT_PROC_DEFAULT ||
2068             context->proc_subset != PROC_SUBSET_ALL ||
2069             context->private_ipc ||
2070             context->ipc_namespace_path)
2071                 return true;
2072
2073         if (context->root_directory) {
2074                 if (exec_context_get_effective_mount_apivfs(context))
2075                         return true;
2076
2077                 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2078                         if (params && !params->prefix[t])
2079                                 continue;
2080
2081                         if (context->directories[t].n_items > 0)
2082                                 return true;
2083                 }
2084         }
2085
2086         if (context->dynamic_user &&
2087             (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2088              context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2089              context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2090                 return true;
2091
2092         if (context->log_namespace)
2093                 return true;
2094
2095         return false;
2096 }
2097
2098 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2099         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2100         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2101         _cleanup_close_ int unshare_ready_fd = -1;
2102         _cleanup_(sigkill_waitp) pid_t pid = 0;
2103         uint64_t c = 1;
2104         ssize_t n;
2105         int r;
2106
2107         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2108          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2109          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2110          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2111          * which waits for the parent to create the new user namespace while staying in the original namespace. The
2112          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2113          * continues execution normally.
2114          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2115          * does not need CAP_SETUID to write the single line mapping to itself. */
2116
2117         /* Can only set up multiple mappings with CAP_SETUID. */
2118         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2119                 r = asprintf(&uid_map,
2120                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2121                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2122                              ouid, ouid, uid, uid);
2123         else
2124                 r = asprintf(&uid_map,
2125                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2126                              ouid, ouid);
2127
2128         if (r < 0)
2129                 return -ENOMEM;
2130
2131         /* Can only set up multiple mappings with CAP_SETGID. */
2132         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2133                 r = asprintf(&gid_map,
2134                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2135                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2136                              ogid, ogid, gid, gid);
2137         else
2138                 r = asprintf(&gid_map,
2139                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2140                              ogid, ogid);
2141
2142         if (r < 0)
2143                 return -ENOMEM;
2144
2145         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2146          * namespace. */
2147         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2148         if (unshare_ready_fd < 0)
2149                 return -errno;
2150
2151         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2152          * failed. */
2153         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2154                 return -errno;
2155
2156         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2157         if (r < 0)
2158                 return r;
2159         if (r == 0) {
2160                 _cleanup_close_ int fd = -1;
2161                 const char *a;
2162                 pid_t ppid;
2163
2164                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2165                  * here, after the parent opened its own user namespace. */
2166
2167                 ppid = getppid();
2168                 errno_pipe[0] = safe_close(errno_pipe[0]);
2169
2170                 /* Wait until the parent unshared the user namespace */
2171                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2172                         r = -errno;
2173                         goto child_fail;
2174                 }
2175
2176                 /* Disable the setgroups() system call in the child user namespace, for good. */
2177                 a = procfs_file_alloca(ppid, "setgroups");
2178                 fd = open(a, O_WRONLY|O_CLOEXEC);
2179                 if (fd < 0) {
2180                         if (errno != ENOENT) {
2181                                 r = -errno;
2182                                 goto child_fail;
2183                         }
2184
2185                         /* If the file is missing the kernel is too old, let's continue anyway. */
2186                 } else {
2187                         if (write(fd, "deny\n", 5) < 0) {
2188                                 r = -errno;
2189                                 goto child_fail;
2190                         }
2191
2192                         fd = safe_close(fd);
2193                 }
2194
2195                 /* First write the GID map */
2196                 a = procfs_file_alloca(ppid, "gid_map");
2197                 fd = open(a, O_WRONLY|O_CLOEXEC);
2198                 if (fd < 0) {
2199                         r = -errno;
2200                         goto child_fail;
2201                 }
2202                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2203                         r = -errno;
2204                         goto child_fail;
2205                 }
2206                 fd = safe_close(fd);
2207
2208                 /* The write the UID map */
2209                 a = procfs_file_alloca(ppid, "uid_map");
2210                 fd = open(a, O_WRONLY|O_CLOEXEC);
2211                 if (fd < 0) {
2212                         r = -errno;
2213                         goto child_fail;
2214                 }
2215                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2216                         r = -errno;
2217                         goto child_fail;
2218                 }
2219
2220                 _exit(EXIT_SUCCESS);
2221
2222         child_fail:
2223                 (void) write(errno_pipe[1], &r, sizeof(r));
2224                 _exit(EXIT_FAILURE);
2225         }
2226
2227         errno_pipe[1] = safe_close(errno_pipe[1]);
2228
2229         if (unshare(CLONE_NEWUSER) < 0)
2230                 return -errno;
2231
2232         /* Let the child know that the namespace is ready now */
2233         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2234                 return -errno;
2235
2236         /* Try to read an error code from the child */
2237         n = read(errno_pipe[0], &r, sizeof(r));
2238         if (n < 0)
2239                 return -errno;
2240         if (n == sizeof(r)) { /* an error code was sent to us */
2241                 if (r < 0)
2242                         return r;
2243                 return -EIO;
2244         }
2245         if (n != 0) /* on success we should have read 0 bytes */
2246                 return -EIO;
2247
2248         r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2249         if (r < 0)
2250                 return r;
2251         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2252                 return -EIO;
2253
2254         return 0;
2255 }
2256
2257 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2258         if (!context->dynamic_user)
2259                 return false;
2260
2261         if (type == EXEC_DIRECTORY_CONFIGURATION)
2262                 return false;
2263
2264         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2265                 return false;
2266
2267         return true;
2268 }
2269
2270 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271         _cleanup_free_ char *src_abs = NULL;
2272         int r;
2273
2274         assert(source);
2275
2276         src_abs = path_join(root, source);
2277         if (!src_abs)
2278                 return -ENOMEM;
2279
2280         STRV_FOREACH(dst, symlinks) {
2281                 _cleanup_free_ char *dst_abs = NULL;
2282
2283                 dst_abs = path_join(root, *dst);
2284                 if (!dst_abs)
2285                         return -ENOMEM;
2286
2287                 r = mkdir_parents_label(dst_abs, 0755);
2288                 if (r < 0)
2289                         return r;
2290
2291                 r = symlink_idempotent(src_abs, dst_abs, true);
2292                 if (r < 0)
2293                         return r;
2294         }
2295
2296         return 0;
2297 }
2298
2299 static int setup_exec_directory(
2300                 const ExecContext *context,
2301                 const ExecParameters *params,
2302                 uid_t uid,
2303                 gid_t gid,
2304                 ExecDirectoryType type,
2305                 bool needs_mount_namespace,
2306                 int *exit_status) {
2307
2308         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2309                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2310                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2311                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2312                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2313                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2314         };
2315         int r;
2316
2317         assert(context);
2318         assert(params);
2319         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2320         assert(exit_status);
2321
2322         if (!params->prefix[type])
2323                 return 0;
2324
2325         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2326                 if (!uid_is_valid(uid))
2327                         uid = 0;
2328                 if (!gid_is_valid(gid))
2329                         gid = 0;
2330         }
2331
2332         for (size_t i = 0; i < context->directories[type].n_items; i++) {
2333                 _cleanup_free_ char *p = NULL, *pp = NULL;
2334
2335                 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2336                 if (!p) {
2337                         r = -ENOMEM;
2338                         goto fail;
2339                 }
2340
2341                 r = mkdir_parents_label(p, 0755);
2342                 if (r < 0)
2343                         goto fail;
2344
2345                 if (exec_directory_is_private(context, type)) {
2346                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2347                          * case we want to avoid leaving a directory around fully accessible that is owned by
2348                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2349                          * trick used by container managers to prohibit host users to get access to files of
2350                          * the same UID in containers: we place everything inside a directory that has an
2351                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2352                          * for unprivileged host code. We then use fs namespacing to make this directory
2353                          * permeable for the service itself.
2354                          *
2355                          * Specifically: for a service which wants a special directory "foo/" we first create
2356                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2357                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2358                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2359                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2360                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2361                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2362                          * for the service and making sure it only gets access to the dirs it needs but no
2363                          * others. Tricky? Yes, absolutely, but it works!
2364                          *
2365                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2366                          * to be owned by the service itself.
2367                          *
2368                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2369                          * for sharing files or sockets with other services. */
2370
2371                         pp = path_join(params->prefix[type], "private");
2372                         if (!pp) {
2373                                 r = -ENOMEM;
2374                                 goto fail;
2375                         }
2376
2377                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2378                         r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2379                         if (r < 0)
2380                                 goto fail;
2381
2382                         if (!path_extend(&pp, context->directories[type].items[i].path)) {
2383                                 r = -ENOMEM;
2384                                 goto fail;
2385                         }
2386
2387                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2388                         r = mkdir_parents_label(pp, 0755);
2389                         if (r < 0)
2390                                 goto fail;
2391
2392                         if (is_dir(p, false) > 0 &&
2393                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2394
2395                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2396                                  * it over. Most likely the service has been upgraded from one that didn't use
2397                                  * DynamicUser=1, to one that does. */
2398
2399                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2400                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2401                                          exec_directory_type_to_string(type), p, pp);
2402
2403                                 if (rename(p, pp) < 0) {
2404                                         r = -errno;
2405                                         goto fail;
2406                                 }
2407                         } else {
2408                                 /* Otherwise, create the actual directory for the service */
2409
2410                                 r = mkdir_label(pp, context->directories[type].mode);
2411                                 if (r < 0 && r != -EEXIST)
2412                                         goto fail;
2413                         }
2414
2415                         /* And link it up from the original place. Note that if a mount namespace is going to be
2416                          * used, then this symlink remains on the host, and a new one for the child namespace will
2417                          * be created later. */
2418                         r = symlink_idempotent(pp, p, true);
2419                         if (r < 0)
2420                                 goto fail;
2421
2422                 } else {
2423                         _cleanup_free_ char *target = NULL;
2424
2425                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2426                             readlink_and_make_absolute(p, &target) >= 0) {
2427                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2428
2429                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2430                                  * by DynamicUser=1 (see above)?
2431                                  *
2432                                  * We do this for all directory types except for ConfigurationDirectory=,
2433                                  * since they all support the private/ symlink logic at least in some
2434                                  * configurations, see above. */
2435
2436                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2437                                 if (r < 0)
2438                                         goto fail;
2439
2440                                 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2441                                 if (!q) {
2442                                         r = -ENOMEM;
2443                                         goto fail;
2444                                 }
2445
2446                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2447                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2448                                 if (r < 0)
2449                                         goto fail;
2450
2451                                 if (path_equal(q_resolved, target_resolved)) {
2452
2453                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2454                                          * but is no longer. Let's move the directory back up. */
2455
2456                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2457                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2458                                                  exec_directory_type_to_string(type), q, p);
2459
2460                                         if (unlink(p) < 0) {
2461                                                 r = -errno;
2462                                                 goto fail;
2463                                         }
2464
2465                                         if (rename(q, p) < 0) {
2466                                                 r = -errno;
2467                                                 goto fail;
2468                                         }
2469                                 }
2470                         }
2471
2472                         r = mkdir_label(p, context->directories[type].mode);
2473                         if (r < 0) {
2474                                 if (r != -EEXIST)
2475                                         goto fail;
2476
2477                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2478                                         struct stat st;
2479
2480                                         /* Don't change the owner/access mode of the configuration directory,
2481                                          * as in the common case it is not written to by a service, and shall
2482                                          * not be writable. */
2483
2484                                         if (stat(p, &st) < 0) {
2485                                                 r = -errno;
2486                                                 goto fail;
2487                                         }
2488
2489                                         /* Still complain if the access mode doesn't match */
2490                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2491                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2492                                                             "(File system: %o %sMode: %o)",
2493                                                             exec_directory_type_to_string(type), context->directories[type].items[i].path,
2494                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2495
2496                                         continue;
2497                                 }
2498                         }
2499                 }
2500
2501                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2502                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2503                  * current UID/GID ownership.) */
2504                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2505                 if (r < 0)
2506                         goto fail;
2507
2508                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2509                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2510                  * assignments to exist. */
2511                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2512                 if (r < 0)
2513                         goto fail;
2514         }
2515
2516         /* If we are not going to run in a namespace, set up the symlinks - otherwise
2517          * they are set up later, to allow configuring empty var/run/etc. */
2518         if (!needs_mount_namespace)
2519                 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2520                         r = create_many_symlinks(params->prefix[type],
2521                                                  context->directories[type].items[i].path,
2522                                                  context->directories[type].items[i].symlinks);
2523                         if (r < 0)
2524                                 goto fail;
2525                 }
2526
2527         return 0;
2528
2529 fail:
2530         *exit_status = exit_status_table[type];
2531         return r;
2532 }
2533
2534 static int write_credential(
2535                 int dfd,
2536                 const char *id,
2537                 const void *data,
2538                 size_t size,
2539                 uid_t uid,
2540                 bool ownership_ok) {
2541
2542         _cleanup_(unlink_and_freep) char *tmp = NULL;
2543         _cleanup_close_ int fd = -1;
2544         int r;
2545
2546         r = tempfn_random_child("", "cred", &tmp);
2547         if (r < 0)
2548                 return r;
2549
2550         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2551         if (fd < 0) {
2552                 tmp = mfree(tmp);
2553                 return -errno;
2554         }
2555
2556         r = loop_write(fd, data, size, /* do_poll = */ false);
2557         if (r < 0)
2558                 return r;
2559
2560         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2561                 return -errno;
2562
2563         if (uid_is_valid(uid) && uid != getuid()) {
2564                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2565                 if (r < 0) {
2566                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2567                                 return r;
2568
2569                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2570                                             * to express: that the user gets read access and nothing
2571                                             * else. But if the backing fs can't support that (e.g. ramfs)
2572                                             * then we can use file ownership instead. But that's only safe if
2573                                             * we can then re-mount the whole thing read-only, so that the
2574                                             * user can no longer chmod() the file to gain write access. */
2575                                 return r;
2576
2577                         if (fchown(fd, uid, GID_INVALID) < 0)
2578                                 return -errno;
2579                 }
2580         }
2581
2582         if (renameat(dfd, tmp, dfd, id) < 0)
2583                 return -errno;
2584
2585         tmp = mfree(tmp);
2586         return 0;
2587 }
2588
2589 static char **credential_search_path(
2590                 const ExecParameters *params,
2591                 bool encrypted) {
2592
2593         _cleanup_strv_free_ char **l = NULL;
2594
2595         assert(params);
2596
2597         /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2598          * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2599          * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2600
2601         if (encrypted) {
2602                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2603                         return NULL;
2604
2605                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2606                         return NULL;
2607         }
2608
2609         if (params->received_credentials_directory)
2610                 if (strv_extend(&l, params->received_credentials_directory) < 0)
2611                         return NULL;
2612
2613         if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2614                 return NULL;
2615
2616         if (DEBUG_LOGGING) {
2617                 _cleanup_free_ char *t = strv_join(l, ":");
2618
2619                 log_debug("Credential search path is: %s", t);
2620         }
2621
2622         return TAKE_PTR(l);
2623 }
2624
2625 static int load_credential(
2626                 const ExecContext *context,
2627                 const ExecParameters *params,
2628                 const char *id,
2629                 const char *path,
2630                 bool encrypted,
2631                 const char *unit,
2632                 int read_dfd,
2633                 int write_dfd,
2634                 uid_t uid,
2635                 bool ownership_ok,
2636                 uint64_t *left) {
2637
2638         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2639         _cleanup_strv_free_ char **search_path = NULL;
2640         _cleanup_(erase_and_freep) char *data = NULL;
2641         _cleanup_free_ char *bindname = NULL;
2642         const char *source = NULL;
2643         bool missing_ok = true;
2644         size_t size, add, maxsz;
2645         int r;
2646
2647         assert(context);
2648         assert(params);
2649         assert(id);
2650         assert(path);
2651         assert(unit);
2652         assert(write_dfd >= 0);
2653         assert(left);
2654
2655         if (read_dfd >= 0) {
2656                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2657                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2658                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2659                  * open it. */
2660
2661                 if (!filename_is_valid(path)) /* safety check */
2662                         return -EINVAL;
2663
2664                 missing_ok = true;
2665                 source = path;
2666
2667         } else if (path_is_absolute(path)) {
2668                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2669                  * sockets */
2670
2671                 if (!path_is_valid(path)) /* safety check */
2672                         return -EINVAL;
2673
2674                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2675
2676                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2677                  * via the source socket address in case we read off an AF_UNIX socket. */
2678                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2679                         return -ENOMEM;
2680
2681                 missing_ok = false;
2682                 source = path;
2683
2684         } else if (credential_name_valid(path)) {
2685                 /* If this is a relative path, take it as credential name relative to the credentials
2686                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2687                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2688
2689                 search_path = credential_search_path(params, encrypted);
2690                 if (!search_path)
2691                         return -ENOMEM;
2692
2693                 missing_ok = true;
2694         } else
2695                 source = NULL;
2696
2697         if (encrypted)
2698                 flags |= READ_FULL_FILE_UNBASE64;
2699
2700         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2701
2702         if (search_path) {
2703                 STRV_FOREACH(d, search_path) {
2704                         _cleanup_free_ char *j = NULL;
2705
2706                         j = path_join(*d, path);
2707                         if (!j)
2708                                 return -ENOMEM;
2709
2710                         r = read_full_file_full(
2711                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2712                                         UINT64_MAX,
2713                                         maxsz,
2714                                         flags,
2715                                         NULL,
2716                                         &data, &size);
2717                         if (r != -ENOENT)
2718                                 break;
2719                 }
2720         } else if (source)
2721                 r = read_full_file_full(
2722                                 read_dfd, source,
2723                                 UINT64_MAX,
2724                                 maxsz,
2725                                 flags,
2726                                 bindname,
2727                                 &data, &size);
2728         else
2729                 r = -ENOENT;
2730
2731         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2732                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2733                  * will get clear errors if we don't pass such a missing credential on as they
2734                  * themselves will get ENOENT when trying to read them, which should not be much
2735                  * worse than when we handle the error here and make it fatal.
2736                  *
2737                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2738                  * we are fine, too. */
2739                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2740                 return 0;
2741         }
2742         if (r < 0)
2743                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2744
2745         if (encrypted) {
2746                 _cleanup_free_ void *plaintext = NULL;
2747                 size_t plaintext_size = 0;
2748
2749                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2750                 if (r < 0)
2751                         return r;
2752
2753                 free_and_replace(data, plaintext);
2754                 size = plaintext_size;
2755         }
2756
2757         add = strlen(id) + size;
2758         if (add > *left)
2759                 return -E2BIG;
2760
2761         r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2762         if (r < 0)
2763                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2764
2765         *left -= add;
2766         return 0;
2767 }
2768
/* Argument bundle handed to load_cred_recurse_dir_cb() via the userdata pointer of recurse_dir(). The
 * callback passes the fields on to load_credential() for every suitable directory entry. */
struct load_cred_args {
        const ExecContext *context;
        const ExecParameters *params;
        bool encrypted;                 /* forwarded as the 'encrypted' argument of load_credential() */
        const char *unit;
        int dfd;                        /* directory fd the credentials are written to; also used to detect
                                         * credential IDs that already exist */
        uid_t uid;
        bool ownership_ok;
        uint64_t *left;                 /* remaining size budget, shared by all load_credential() calls */
};
2779
2780 static int load_cred_recurse_dir_cb(
2781                 RecurseDirEvent event,
2782                 const char *path,
2783                 int dir_fd,
2784                 int inode_fd,
2785                 const struct dirent *de,
2786                 const struct statx *sx,
2787                 void *userdata) {
2788
2789         struct load_cred_args *args = ASSERT_PTR(userdata);
2790         _cleanup_free_ char *sub_id = NULL;
2791         int r;
2792
2793         if (event != RECURSE_DIR_ENTRY)
2794                 return RECURSE_DIR_CONTINUE;
2795
2796         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2797                 return RECURSE_DIR_CONTINUE;
2798
2799         sub_id = strreplace(path, "/", "_");
2800         if (!sub_id)
2801                 return -ENOMEM;
2802
2803         if (!credential_name_valid(sub_id))
2804                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2805
2806         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2807                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2808                 return RECURSE_DIR_CONTINUE;
2809         }
2810         if (errno != ENOENT)
2811                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2812
2813         r = load_credential(
2814                         args->context,
2815                         args->params,
2816                         sub_id,
2817                         de->d_name,
2818                         args->encrypted,
2819                         args->unit,
2820                         dir_fd,
2821                         args->dfd,
2822                         args->uid,
2823                         args->ownership_ok,
2824                         args->left);
2825         if (r < 0)
2826                 return r;
2827
2828         return RECURSE_DIR_CONTINUE;
2829 }
2830
2831 static int acquire_credentials(
2832                 const ExecContext *context,
2833                 const ExecParameters *params,
2834                 const char *unit,
2835                 const char *p,
2836                 uid_t uid,
2837                 bool ownership_ok) {
2838
2839         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2840         _cleanup_close_ int dfd = -1;
2841         ExecLoadCredential *lc;
2842         ExecSetCredential *sc;
2843         int r;
2844
2845         assert(context);
2846         assert(p);
2847
2848         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2849         if (dfd < 0)
2850                 return -errno;
2851
2852         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2853         HASHMAP_FOREACH(lc, context->load_credentials) {
2854                 _cleanup_close_ int sub_fd = -1;
2855
2856                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2857                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2858                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
2859                  * propagate a credential passed to us from further up. */
2860
2861                 if (path_is_absolute(lc->path)) {
2862                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2863                         if (sub_fd < 0 && !IN_SET(errno,
2864                                                   ENOTDIR,  /* Not a directory */
2865                                                   ENOENT))  /* Doesn't exist? */
2866                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2867                 }
2868
2869                 if (sub_fd < 0)
2870                         /* Regular file (incl. a credential passed in from higher up) */
2871                         r = load_credential(
2872                                         context,
2873                                         params,
2874                                         lc->id,
2875                                         lc->path,
2876                                         lc->encrypted,
2877                                         unit,
2878                                         -1,
2879                                         dfd,
2880                                         uid,
2881                                         ownership_ok,
2882                                         &left);
2883                 else
2884                         /* Directory */
2885                         r = recurse_dir(
2886                                         sub_fd,
2887                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2888                                         /* statx_mask= */ 0,
2889                                         /* n_depth_max= */ UINT_MAX,
2890                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2891                                         load_cred_recurse_dir_cb,
2892                                         &(struct load_cred_args) {
2893                                                 .context = context,
2894                                                 .params = params,
2895                                                 .encrypted = lc->encrypted,
2896                                                 .unit = unit,
2897                                                 .dfd = dfd,
2898                                                 .uid = uid,
2899                                                 .ownership_ok = ownership_ok,
2900                                                 .left = &left,
2901                                         });
2902                 if (r < 0)
2903                         return r;
2904         }
2905
2906         /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2907          * them, so that they can act as a "default" if the same credential is specified multiple times. */
2908         HASHMAP_FOREACH(sc, context->set_credentials) {
2909                 _cleanup_(erase_and_freep) void *plaintext = NULL;
2910                 const char *data;
2911                 size_t size, add;
2912
2913                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2914                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2915                  * slow and involved, hence it's nice to be able to skip that if the credential already
2916                  * exists anyway. */
2917                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2918                         continue;
2919                 if (errno != ENOENT)
2920                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2921
2922                 if (sc->encrypted) {
2923                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2924                         if (r < 0)
2925                                 return r;
2926
2927                         data = plaintext;
2928                 } else {
2929                         data = sc->data;
2930                         size = sc->size;
2931                 }
2932
2933                 add = strlen(sc->id) + size;
2934                 if (add > left)
2935                         return -E2BIG;
2936
2937                 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2938                 if (r < 0)
2939                         return r;
2940
2941                 left -= add;
2942         }
2943
2944         if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2945                 return -errno;
2946
2947         /* After we created all keys with the right perms, also make sure the credential store as a whole is
2948          * accessible */
2949
2950         if (uid_is_valid(uid) && uid != getuid()) {
2951                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2952                 if (r < 0) {
2953                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2954                                 return r;
2955
2956                         if (!ownership_ok)
2957                                 return r;
2958
2959                         if (fchown(dfd, uid, GID_INVALID) < 0)
2960                                 return -errno;
2961                 }
2962         }
2963
2964         return 0;
2965 }
2966
2967 static int setup_credentials_internal(
2968                 const ExecContext *context,
2969                 const ExecParameters *params,
2970                 const char *unit,
2971                 const char *final,        /* This is where the credential store shall eventually end up at */
2972                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
2973                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
2974                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2975                 uid_t uid) {
2976
2977         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2978                                    * if we mounted something; false if we definitely can't mount anything */
2979         bool final_mounted;
2980         const char *where;
2981
2982         assert(context);
2983         assert(final);
2984         assert(workspace);
2985
2986         if (reuse_workspace) {
2987                 r = path_is_mount_point(workspace, NULL, 0);
2988                 if (r < 0)
2989                         return r;
2990                 if (r > 0)
2991                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2992                 else
2993                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2994         } else
2995                 workspace_mounted = -1; /* ditto */
2996
2997         r = path_is_mount_point(final, NULL, 0);
2998         if (r < 0)
2999                 return r;
3000         if (r > 0) {
3001                 /* If the final place already has something mounted, we use that. If the workspace also has
3002                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
3003                  * different). */
3004                 final_mounted = true;
3005
3006                 if (workspace_mounted < 0) {
3007                         /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3008                          * the final version to the workspace, and make it writable, so that we can make
3009                          * changes */
3010
3011                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3012                         if (r < 0)
3013                                 return r;
3014
3015                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3016                         if (r < 0)
3017                                 return r;
3018
3019                         workspace_mounted = true;
3020                 }
3021         } else
3022                 final_mounted = false;
3023
3024         if (workspace_mounted < 0) {
3025                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3026                 for (int try = 0;; try++) {
3027
3028                         if (try == 0) {
3029                                 /* Try "ramfs" first, since it's not swap backed */
3030                                 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3031                                 if (r >= 0) {
3032                                         workspace_mounted = true;
3033                                         break;
3034                                 }
3035
3036                         } else if (try == 1) {
3037                                 _cleanup_free_ char *opts = NULL;
3038
3039                                 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3040                                         return -ENOMEM;
3041
3042                                 /* Fall back to "tmpfs" otherwise */
3043                                 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3044                                 if (r >= 0) {
3045                                         workspace_mounted = true;
3046                                         break;
3047                                 }
3048
3049                         } else {
3050                                 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3051                                 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3052                                 if (r < 0) {
3053                                         if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3054                                                 return r;
3055
3056                                         if (must_mount) /* If we it's not OK to use the plain directory
3057                                                          * fallback, propagate all errors too */
3058                                                 return r;
3059
3060                                         /* If we lack privileges to bind mount stuff, then let's gracefully
3061                                          * proceed for compat with container envs, and just use the final dir
3062                                          * as is. */
3063
3064                                         workspace_mounted = false;
3065                                         break;
3066                                 }
3067
3068                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3069                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3070                                 if (r < 0)
3071                                         return r;
3072
3073                                 workspace_mounted = true;
3074                                 break;
3075                         }
3076                 }
3077         }
3078
3079         assert(!must_mount || workspace_mounted > 0);
3080         where = workspace_mounted ? workspace : final;
3081
3082         (void) label_fix_full(AT_FDCWD, where, final, 0);
3083
3084         r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3085         if (r < 0)
3086                 return r;
3087
3088         if (workspace_mounted) {
3089                 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3090                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3091                 if (r < 0)
3092                         return r;
3093
3094                 /* And mount it to the final place, read-only */
3095                 if (final_mounted)
3096                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3097                 else
3098                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3099                 if (r < 0)
3100                         return r;
3101         } else {
3102                 _cleanup_free_ char *parent = NULL;
3103
3104                 /* If we do not have our own mount put used the plain directory fallback, then we need to
3105                  * open access to the top-level credential directory and the per-service directory now */
3106
3107                 r = path_extract_directory(final, &parent);
3108                 if (r < 0)
3109                         return r;
3110                 if (chmod(parent, 0755) < 0)
3111                         return -errno;
3112         }
3113
3114         return 0;
3115 }
3116
3117 static int setup_credentials(
3118                 const ExecContext *context,
3119                 const ExecParameters *params,
3120                 const char *unit,
3121                 uid_t uid) {
3122
3123         _cleanup_free_ char *p = NULL, *q = NULL;
3124         int r;
3125
3126         assert(context);
3127         assert(params);
3128
3129         if (!exec_context_has_credentials(context))
3130                 return 0;
3131
3132         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3133                 return -EINVAL;
3134
3135         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3136          * and the subdir we mount over with a read-only file system readable by the service's user */
3137         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3138         if (!q)
3139                 return -ENOMEM;
3140
3141         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3142         if (r < 0 && r != -EEXIST)
3143                 return r;
3144
3145         p = path_join(q, unit);
3146         if (!p)
3147                 return -ENOMEM;
3148
3149         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3150         if (r < 0 && r != -EEXIST)
3151                 return r;
3152
3153         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3154         if (r < 0) {
3155                 _cleanup_free_ char *t = NULL, *u = NULL;
3156
3157                 /* If this is not a privilege or support issue then propagate the error */
3158                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3159                         return r;
3160
3161                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3162                  * it into place, so that users can't access half-initialized credential stores. */
3163                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3164                 if (!t)
3165                         return -ENOMEM;
3166
3167                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3168                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3169                  * after it is fully set up */
3170                 u = path_join(t, unit);
3171                 if (!u)
3172                         return -ENOMEM;
3173
3174                 FOREACH_STRING(i, t, u) {
3175                         r = mkdir_label(i, 0700);
3176                         if (r < 0 && r != -EEXIST)
3177                                 return r;
3178                 }
3179
3180                 r = setup_credentials_internal(
3181                                 context,
3182                                 params,
3183                                 unit,
3184                                 p,       /* final mount point */
3185                                 u,       /* temporary workspace to overmount */
3186                                 true,    /* reuse the workspace if it is already a mount */
3187                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
3188                                 uid);
3189
3190                 (void) rmdir(u); /* remove the workspace again if we can. */
3191
3192                 if (r < 0)
3193                         return r;
3194
3195         } else if (r == 0) {
3196
3197                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3198                  * we can use the same directory for all cases, after turning off propagation. Question
3199                  * though is: where do we turn off propagation exactly, and where do we place the workspace
3200                  * directory? We need some place that is guaranteed to be a mount point in the host, and
3201                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3202                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
3203                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3204                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3205                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3206                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3207                  * propagation on the former, and then overmount the latter.
3208                  *
3209                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3210                  * for this purpose, but there are few other candidates that work equally well for us, and
3211                  * given that the we do this in a privately namespaced short-lived single-threaded process
3212                  * that no one else sees this should be OK to do. */
3213
3214                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3215                 if (r < 0)
3216                         goto child_fail;
3217
3218                 r = setup_credentials_internal(
3219                                 context,
3220                                 params,
3221                                 unit,
3222                                 p,           /* final mount point */
3223                                 "/dev/shm",  /* temporary workspace to overmount */
3224                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3225                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
3226                                 uid);
3227                 if (r < 0)
3228                         goto child_fail;
3229
3230                 _exit(EXIT_SUCCESS);
3231
3232         child_fail:
3233                 _exit(EXIT_FAILURE);
3234         }
3235
3236         return 0;
3237 }
3238
3239 #if ENABLE_SMACK
3240 static int setup_smack(
3241                 const Manager *manager,
3242                 const ExecContext *context,
3243                 int executable_fd) {
3244         int r;
3245
3246         assert(context);
3247         assert(executable_fd >= 0);
3248
3249         if (context->smack_process_label) {
3250                 r = mac_smack_apply_pid(0, context->smack_process_label);
3251                 if (r < 0)
3252                         return r;
3253         } else if (manager->default_smack_process_label) {
3254                 _cleanup_free_ char *exec_label = NULL;
3255
3256                 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3257                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
3258                         return r;
3259
3260                 r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
3261                 if (r < 0)
3262                         return r;
3263         }
3264
3265         return 0;
3266 }
3267 #endif
3268
3269 static int compile_bind_mounts(
3270                 const ExecContext *context,
3271                 const ExecParameters *params,
3272                 BindMount **ret_bind_mounts,
3273                 size_t *ret_n_bind_mounts,
3274                 char ***ret_empty_directories) {
3275
3276         _cleanup_strv_free_ char **empty_directories = NULL;
3277         BindMount *bind_mounts;
3278         size_t n, h = 0;
3279         int r;
3280
3281         assert(context);
3282         assert(params);
3283         assert(ret_bind_mounts);
3284         assert(ret_n_bind_mounts);
3285         assert(ret_empty_directories);
3286
3287         n = context->n_bind_mounts;
3288         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3289                 if (!params->prefix[t])
3290                         continue;
3291
3292                 n += context->directories[t].n_items;
3293         }
3294
3295         if (n <= 0) {
3296                 *ret_bind_mounts = NULL;
3297                 *ret_n_bind_mounts = 0;
3298                 *ret_empty_directories = NULL;
3299                 return 0;
3300         }
3301
3302         bind_mounts = new(BindMount, n);
3303         if (!bind_mounts)
3304                 return -ENOMEM;
3305
3306         for (size_t i = 0; i < context->n_bind_mounts; i++) {
3307                 BindMount *item = context->bind_mounts + i;
3308                 char *s, *d;
3309
3310                 s = strdup(item->source);
3311                 if (!s) {
3312                         r = -ENOMEM;
3313                         goto finish;
3314                 }
3315
3316                 d = strdup(item->destination);
3317                 if (!d) {
3318                         free(s);
3319                         r = -ENOMEM;
3320                         goto finish;
3321                 }
3322
3323                 bind_mounts[h++] = (BindMount) {
3324                         .source = s,
3325                         .destination = d,
3326                         .read_only = item->read_only,
3327                         .recursive = item->recursive,
3328                         .ignore_enoent = item->ignore_enoent,
3329                 };
3330         }
3331
3332         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3333                 if (!params->prefix[t])
3334                         continue;
3335
3336                 if (context->directories[t].n_items == 0)
3337                         continue;
3338
3339                 if (exec_directory_is_private(context, t) &&
3340                     !exec_context_with_rootfs(context)) {
3341                         char *private_root;
3342
3343                         /* So this is for a dynamic user, and we need to make sure the process can access its own
3344                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3345                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3346
3347                         private_root = path_join(params->prefix[t], "private");
3348                         if (!private_root) {
3349                                 r = -ENOMEM;
3350                                 goto finish;
3351                         }
3352
3353                         r = strv_consume(&empty_directories, private_root);
3354                         if (r < 0)
3355                                 goto finish;
3356                 }
3357
3358                 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3359                         char *s, *d;
3360
3361                         if (exec_directory_is_private(context, t))
3362                                 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3363                         else
3364                                 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3365                         if (!s) {
3366                                 r = -ENOMEM;
3367                                 goto finish;
3368                         }
3369
3370                         if (exec_directory_is_private(context, t) &&
3371                             exec_context_with_rootfs(context))
3372                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3373                                  * directory is not created on the root directory. So, let's bind-mount the directory
3374                                  * on the 'non-private' place. */
3375                                 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3376                         else
3377                                 d = strdup(s);
3378                         if (!d) {
3379                                 free(s);
3380                                 r = -ENOMEM;
3381                                 goto finish;
3382                         }
3383
3384                         bind_mounts[h++] = (BindMount) {
3385                                 .source = s,
3386                                 .destination = d,
3387                                 .read_only = false,
3388                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3389                                 .recursive = true,
3390                                 .ignore_enoent = false,
3391                         };
3392                 }
3393         }
3394
3395         assert(h == n);
3396
3397         *ret_bind_mounts = bind_mounts;
3398         *ret_n_bind_mounts = n;
3399         *ret_empty_directories = TAKE_PTR(empty_directories);
3400
3401         return (int) n;
3402
3403 finish:
3404         bind_mount_free_many(bind_mounts, h);
3405         return r;
3406 }
3407
3408 /* ret_symlinks will contain a list of pairs src:dest that describes
3409  * the symlinks to create later on. For example, the symlinks needed
3410  * to safely give private directories to DynamicUser=1 users. */
3411 static int compile_symlinks(
3412                 const ExecContext *context,
3413                 const ExecParameters *params,
3414                 char ***ret_symlinks) {
3415
3416         _cleanup_strv_free_ char **symlinks = NULL;
3417         int r;
3418
3419         assert(context);
3420         assert(params);
3421         assert(ret_symlinks);
3422
3423         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3424                 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3425                         _cleanup_free_ char *private_path = NULL, *path = NULL;
3426
3427                         STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3428                                 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3429
3430                                 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3431                                 dst_abs = path_join(params->prefix[dt], *symlink);
3432                                 if (!src_abs || !dst_abs)
3433                                         return -ENOMEM;
3434
3435                                 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3436                                 if (r < 0)
3437                                         return r;
3438                         }
3439
3440                         if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
3441                                 continue;
3442
3443                         private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3444                         if (!private_path)
3445                                 return -ENOMEM;
3446
3447                         path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3448                         if (!path)
3449                                 return -ENOMEM;
3450
3451                         r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3452                         if (r < 0)
3453                                 return r;
3454                 }
3455         }
3456
3457         *ret_symlinks = TAKE_PTR(symlinks);
3458
3459         return 0;
3460 }
3461
3462 static bool insist_on_sandboxing(
3463                 const ExecContext *context,
3464                 const char *root_dir,
3465                 const char *root_image,
3466                 const BindMount *bind_mounts,
3467                 size_t n_bind_mounts) {
3468
3469         assert(context);
3470         assert(n_bind_mounts == 0 || bind_mounts);
3471
3472         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3473          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3474          * rearrange stuff in a way we cannot ignore gracefully. */
3475
3476         if (context->n_temporary_filesystems > 0)
3477                 return true;
3478
3479         if (root_dir || root_image)
3480                 return true;
3481
3482         if (context->n_mount_images > 0)
3483                 return true;
3484
3485         if (context->dynamic_user)
3486                 return true;
3487
3488         if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3489                 return true;
3490
3491         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3492          * essential. */
3493         for (size_t i = 0; i < n_bind_mounts; i++)
3494                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3495                         return true;
3496
3497         if (context->log_namespace)
3498                 return true;
3499
3500         return false;
3501 }
3502
3503 static int apply_mount_namespace(
3504                 const Unit *u,
3505                 ExecCommandFlags command_flags,
3506                 const ExecContext *context,
3507                 const ExecParameters *params,
3508                 const ExecRuntime *runtime,
3509                 char **error_path) {
3510
3511         _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3512         const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3513         const char *root_dir = NULL, *root_image = NULL;
3514         _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3515                         *extension_dir = NULL;
3516         NamespaceInfo ns_info;
3517         bool needs_sandboxing;
3518         BindMount *bind_mounts = NULL;
3519         size_t n_bind_mounts = 0;
3520         int r;
3521
3522         assert(context);
3523
3524         if (params->flags & EXEC_APPLY_CHROOT) {
3525                 root_image = context->root_image;
3526
3527                 if (!root_image)
3528                         root_dir = context->root_directory;
3529         }
3530
3531         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3532         if (r < 0)
3533                 return r;
3534
3535         /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3536         r = compile_symlinks(context, params, &symlinks);
3537         if (r < 0)
3538                 goto finalize;
3539
3540         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3541         if (needs_sandboxing) {
3542                 /* The runtime struct only contains the parent of the private /tmp,
3543                  * which is non-accessible to world users. Inside of it there's a /tmp
3544                  * that is sticky, and that's the one we want to use here.
3545                  * This does not apply when we are using /run/systemd/empty as fallback. */
3546
3547                 if (context->private_tmp && runtime) {
3548                         if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3549                                 tmp_dir = runtime->tmp_dir;
3550                         else if (runtime->tmp_dir)
3551                                 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3552
3553                         if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3554                                 var_tmp_dir = runtime->var_tmp_dir;
3555                         else if (runtime->var_tmp_dir)
3556                                 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3557                 }
3558
3559                 ns_info = (NamespaceInfo) {
3560                         .ignore_protect_paths = false,
3561                         .private_dev = context->private_devices,
3562                         .protect_control_groups = context->protect_control_groups,
3563                         .protect_kernel_tunables = context->protect_kernel_tunables,
3564                         .protect_kernel_modules = context->protect_kernel_modules,
3565                         .protect_kernel_logs = context->protect_kernel_logs,
3566                         .protect_hostname = context->protect_hostname,
3567                         .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3568                         .private_mounts = context->private_mounts,
3569                         .protect_home = context->protect_home,
3570                         .protect_system = context->protect_system,
3571                         .protect_proc = context->protect_proc,
3572                         .proc_subset = context->proc_subset,
3573                         .private_ipc = context->private_ipc || context->ipc_namespace_path,
3574                         /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3575                         .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3576                 };
3577         } else if (!context->dynamic_user && root_dir)
3578                 /*
3579                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3580                  * sandbox info, otherwise enforce it, don't ignore protected paths and
3581                  * fail if we are enable to apply the sandbox inside the mount namespace.
3582                  */
3583                 ns_info = (NamespaceInfo) {
3584                         .ignore_protect_paths = true,
3585                 };
3586         else
3587                 ns_info = (NamespaceInfo) {};
3588
3589         if (context->mount_flags == MS_SHARED)
3590                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3591
3592         if (exec_context_has_credentials(context) &&
3593             params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3594             FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3595                 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3596                 if (!creds_path) {
3597                         r = -ENOMEM;
3598                         goto finalize;
3599                 }
3600         }
3601
3602         if (MANAGER_IS_SYSTEM(u->manager)) {
3603                 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3604                 if (!propagate_dir) {
3605                         r = -ENOMEM;
3606                         goto finalize;
3607                 }
3608
3609                 incoming_dir = strdup("/run/systemd/incoming");
3610                 if (!incoming_dir) {
3611                         r = -ENOMEM;
3612                         goto finalize;
3613                 }
3614
3615                 extension_dir = strdup("/run/systemd/unit-extensions");
3616                 if (!extension_dir) {
3617                         r = -ENOMEM;
3618                         goto finalize;
3619                 }
3620         } else
3621                 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3622                         r = -ENOMEM;
3623                         goto finalize;
3624                 }
3625
3626         r = setup_namespace(root_dir, root_image, context->root_image_options,
3627                             &ns_info, context->read_write_paths,
3628                             needs_sandboxing ? context->read_only_paths : NULL,
3629                             needs_sandboxing ? context->inaccessible_paths : NULL,
3630                             needs_sandboxing ? context->exec_paths : NULL,
3631                             needs_sandboxing ? context->no_exec_paths : NULL,
3632                             empty_directories,
3633                             symlinks,
3634                             bind_mounts,
3635                             n_bind_mounts,
3636                             context->temporary_filesystems,
3637                             context->n_temporary_filesystems,
3638                             context->mount_images,
3639                             context->n_mount_images,
3640                             tmp_dir,
3641                             var_tmp_dir,
3642                             creds_path,
3643                             context->log_namespace,
3644                             context->mount_flags,
3645                             context->root_hash, context->root_hash_size, context->root_hash_path,
3646                             context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3647                             context->root_verity,
3648                             context->extension_images,
3649                             context->n_extension_images,
3650                             context->extension_directories,
3651                             propagate_dir,
3652                             incoming_dir,
3653                             extension_dir,
3654                             root_dir || root_image ? params->notify_socket : NULL,
3655                             error_path);
3656
3657         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3658          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3659          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3660          * completely different execution environment. */
3661         if (r == -ENOANO) {
3662                 if (insist_on_sandboxing(
3663                                     context,
3664                                     root_dir, root_image,
3665                                     bind_mounts,
3666                                     n_bind_mounts)) {
3667                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3668                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3669                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3670
3671                         r = -EOPNOTSUPP;
3672                 } else {
3673                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3674                         r = 0;
3675                 }
3676         }
3677
3678 finalize:
3679         bind_mount_free_many(bind_mounts, n_bind_mounts);
3680         return r;
3681 }
3682
3683 static int apply_working_directory(
3684                 const ExecContext *context,
3685                 const ExecParameters *params,
3686                 const char *home,
3687                 int *exit_status) {
3688
3689         const char *d, *wd;
3690
3691         assert(context);
3692         assert(exit_status);
3693
3694         if (context->working_directory_home) {
3695
3696                 if (!home) {
3697                         *exit_status = EXIT_CHDIR;
3698                         return -ENXIO;
3699                 }
3700
3701                 wd = home;
3702
3703         } else
3704                 wd = empty_to_root(context->working_directory);
3705
3706         if (params->flags & EXEC_APPLY_CHROOT)
3707                 d = wd;
3708         else
3709                 d = prefix_roota(context->root_directory, wd);
3710
3711         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3712                 *exit_status = EXIT_CHDIR;
3713                 return -errno;
3714         }
3715
3716         return 0;
3717 }
3718
3719 static int apply_root_directory(
3720                 const ExecContext *context,
3721                 const ExecParameters *params,
3722                 const bool needs_mount_ns,
3723                 int *exit_status) {
3724
3725         assert(context);
3726         assert(exit_status);
3727
3728         if (params->flags & EXEC_APPLY_CHROOT)
3729                 if (!needs_mount_ns && context->root_directory)
3730                         if (chroot(context->root_directory) < 0) {
3731                                 *exit_status = EXIT_CHROOT;
3732                                 return -errno;
3733                         }
3734
3735         return 0;
3736 }
3737
3738 static int setup_keyring(
3739                 const Unit *u,
3740                 const ExecContext *context,
3741                 const ExecParameters *p,
3742                 uid_t uid, gid_t gid) {
3743
3744         key_serial_t keyring;
3745         int r = 0;
3746         uid_t saved_uid;
3747         gid_t saved_gid;
3748
3749         assert(u);
3750         assert(context);
3751         assert(p);
3752
3753         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3754          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3755          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3756          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3757          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3758          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3759
3760         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3761                 return 0;
3762
3763         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3764          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3765          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3766          * & group is just as nasty as acquiring a reference to the user keyring. */
3767
3768         saved_uid = getuid();
3769         saved_gid = getgid();
3770
3771         if (gid_is_valid(gid) && gid != saved_gid) {
3772                 if (setregid(gid, -1) < 0)
3773                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3774         }
3775
3776         if (uid_is_valid(uid) && uid != saved_uid) {
3777                 if (setreuid(uid, -1) < 0) {
3778                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3779                         goto out;
3780                 }
3781         }
3782
3783         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3784         if (keyring == -1) {
3785                 if (errno == ENOSYS)
3786                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3787                 else if (ERRNO_IS_PRIVILEGE(errno))
3788                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3789                 else if (errno == EDQUOT)
3790                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3791                 else
3792                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3793
3794                 goto out;
3795         }
3796
3797         /* When requested link the user keyring into the session keyring. */
3798         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3799
3800                 if (keyctl(KEYCTL_LINK,
3801                            KEY_SPEC_USER_KEYRING,
3802                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3803                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3804                         goto out;
3805                 }
3806         }
3807
3808         /* Restore uid/gid back */
3809         if (uid_is_valid(uid) && uid != saved_uid) {
3810                 if (setreuid(saved_uid, -1) < 0) {
3811                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3812                         goto out;
3813                 }
3814         }
3815
3816         if (gid_is_valid(gid) && gid != saved_gid) {
3817                 if (setregid(saved_gid, -1) < 0)
3818                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3819         }
3820
3821         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3822         if (!sd_id128_is_null(u->invocation_id)) {
3823                 key_serial_t key;
3824
3825                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3826                 if (key == -1)
3827                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3828                 else {
3829                         if (keyctl(KEYCTL_SETPERM, key,
3830                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3831                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3832                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3833                 }
3834         }
3835
3836 out:
3837         /* Revert back uid & gid for the last time, and exit */
3838         /* no extra logging, as only the first already reported error matters */
3839         if (getuid() != saved_uid)
3840                 (void) setreuid(saved_uid, -1);
3841
3842         if (getgid() != saved_gid)
3843                 (void) setregid(saved_gid, -1);
3844
3845         return r;
3846 }
3847
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to 'array', in order,
 * advancing the element counter *n for each one appended. The caller must make sure the array has
 * room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t slot = 0; slot < 2; slot++) {
                if (pair[slot] < 0)
                        continue;

                array[*n] = pair[slot];
                (*n)++;
        }
}
3858
3859 static int close_remaining_fds(
3860                 const ExecParameters *params,
3861                 const ExecRuntime *runtime,
3862                 const DynamicCreds *dcreds,
3863                 int user_lookup_fd,
3864                 int socket_fd,
3865                 const int *fds, size_t n_fds) {
3866
3867         size_t n_dont_close = 0;
3868         int dont_close[n_fds + 12];
3869
3870         assert(params);
3871
3872         if (params->stdin_fd >= 0)
3873                 dont_close[n_dont_close++] = params->stdin_fd;
3874         if (params->stdout_fd >= 0)
3875                 dont_close[n_dont_close++] = params->stdout_fd;
3876         if (params->stderr_fd >= 0)
3877                 dont_close[n_dont_close++] = params->stderr_fd;
3878
3879         if (socket_fd >= 0)
3880                 dont_close[n_dont_close++] = socket_fd;
3881         if (n_fds > 0) {
3882                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3883                 n_dont_close += n_fds;
3884         }
3885
3886         if (runtime) {
3887                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3888                 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3889         }
3890
3891         if (dcreds) {
3892                 if (dcreds->user)
3893                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3894                 if (dcreds->group)
3895                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3896         }
3897
3898         if (user_lookup_fd >= 0)
3899                 dont_close[n_dont_close++] = user_lookup_fd;
3900
3901         return close_all_fds(dont_close, n_dont_close);
3902 }
3903
3904 static int send_user_lookup(
3905                 Unit *unit,
3906                 int user_lookup_fd,
3907                 uid_t uid,
3908                 gid_t gid) {
3909
3910         assert(unit);
3911
3912         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3913          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3914          * specified. */
3915
3916         if (user_lookup_fd < 0)
3917                 return 0;
3918
3919         if (!uid_is_valid(uid) && !gid_is_valid(gid))
3920                 return 0;
3921
3922         if (writev(user_lookup_fd,
3923                (struct iovec[]) {
3924                            IOVEC_INIT(&uid, sizeof(uid)),
3925                            IOVEC_INIT(&gid, sizeof(gid)),
3926                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3927                 return -errno;
3928
3929         return 0;
3930 }
3931
3932 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3933         int r;
3934
3935         assert(c);
3936         assert(home);
3937         assert(buf);
3938
3939         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3940
3941         if (*home)
3942                 return 0;
3943
3944         if (!c->working_directory_home)
3945                 return 0;
3946
3947         r = get_home_dir(buf);
3948         if (r < 0)
3949                 return r;
3950
3951         *home = *buf;
3952         return 1;
3953 }
3954
3955 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3956         _cleanup_strv_free_ char ** list = NULL;
3957         int r;
3958
3959         assert(c);
3960         assert(p);
3961         assert(ret);
3962
3963         assert(c->dynamic_user);
3964
3965         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3966          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3967          * directories. */
3968
3969         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3970                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3971                         continue;
3972
3973                 if (!p->prefix[t])
3974                         continue;
3975
3976                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3977                         char *e;
3978
3979                         if (exec_directory_is_private(c, t))
3980                                 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3981                         else
3982                                 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3983                         if (!e)
3984                                 return -ENOMEM;
3985
3986                         r = strv_consume(&list, e);
3987                         if (r < 0)
3988                                 return r;
3989                 }
3990         }
3991
3992         *ret = TAKE_PTR(list);
3993
3994         return 0;
3995 }
3996
3997 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3998         bool using_subcgroup;
3999         char *p;
4000
4001         assert(params);
4002         assert(ret);
4003
4004         if (!params->cgroup_path)
4005                 return -EINVAL;
4006
4007         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4008          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4009          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4010          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4011          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4012          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4013          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4014          * flag, which is only passed for the former statements, not for the latter. */
4015
4016         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4017         if (using_subcgroup)
4018                 p = path_join(params->cgroup_path, ".control");
4019         else
4020                 p = strdup(params->cgroup_path);
4021         if (!p)
4022                 return -ENOMEM;
4023
4024         *ret = p;
4025         return using_subcgroup;
4026 }
4027
4028 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4029         _cleanup_(cpu_set_reset) CPUSet s = {};
4030         int r;
4031
4032         assert(c);
4033         assert(ret);
4034
4035         if (!c->numa_policy.nodes.set) {
4036                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4037                 return 0;
4038         }
4039
4040         r = numa_to_cpu_set(&c->numa_policy, &s);
4041         if (r < 0)
4042                 return r;
4043
4044         cpu_set_reset(ret);
4045
4046         return cpu_set_add_all(ret, &s);
4047 }
4048
4049 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4050         assert(c);
4051
4052         return c->cpu_affinity_from_numa;
4053 }
4054
4055 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4056         int r;
4057
4058         assert(fds);
4059         assert(n_fds);
4060         assert(*n_fds < fds_size);
4061         assert(ret_fd);
4062
4063         if (fd < 0) {
4064                 *ret_fd = -1;
4065                 return 0;
4066         }
4067
4068         if (fd < 3 + (int) *n_fds) {
4069                 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4070                  * the fds we pass to the process (or which are closed only during execve). */
4071
4072                 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4073                 if (r < 0)
4074                         return -errno;
4075
4076                 close_and_replace(fd, r);
4077         }
4078
4079         *ret_fd = fds[*n_fds] = fd;
4080         (*n_fds) ++;
4081         return 1;
4082 }
4083
4084 static int exec_child(
4085                 Unit *unit,
4086                 const ExecCommand *command,
4087                 const ExecContext *context,
4088                 const ExecParameters *params,
4089                 ExecRuntime *runtime,
4090                 DynamicCreds *dcreds,
4091                 int socket_fd,
4092                 const int named_iofds[static 3],
4093                 int *fds,
4094                 size_t n_socket_fds,
4095                 size_t n_storage_fds,
4096                 char **files_env,
4097                 int user_lookup_fd,
4098                 int *exit_status) {
4099
4100         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4101         int r, ngids = 0, exec_fd;
4102         _cleanup_free_ gid_t *supplementary_gids = NULL;
4103         const char *username = NULL, *groupname = NULL;
4104         _cleanup_free_ char *home_buffer = NULL;
4105         const char *home = NULL, *shell = NULL;
4106         char **final_argv = NULL;
4107         dev_t journal_stream_dev = 0;
4108         ino_t journal_stream_ino = 0;
4109         bool userns_set_up = false;
4110         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4111                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
4112                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
4113                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
4114 #if HAVE_SELINUX
4115         _cleanup_free_ char *mac_selinux_context_net = NULL;
4116         bool use_selinux = false;
4117 #endif
4118 #if ENABLE_SMACK
4119         bool use_smack = false;
4120 #endif
4121 #if HAVE_APPARMOR
4122         bool use_apparmor = false;
4123 #endif
4124         uid_t saved_uid = getuid();
4125         gid_t saved_gid = getgid();
4126         uid_t uid = UID_INVALID;
4127         gid_t gid = GID_INVALID;
4128         size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4129                n_keep_fds; /* total number of fds not to close */
4130         int secure_bits;
4131         _cleanup_free_ gid_t *gids_after_pam = NULL;
4132         int ngids_after_pam = 0;
4133
4134         assert(unit);
4135         assert(command);
4136         assert(context);
4137         assert(params);
4138         assert(exit_status);
4139
4140         /* Explicitly test for CVE-2021-4034 inspired invocations */
4141         assert(command->path);
4142         assert(!strv_isempty(command->argv));
4143
4144         rename_process_from_path(command->path);
4145
4146         /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4147          * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4148          * both of which will be demoted to SIG_DFL. */
4149         (void) default_signals(SIGNALS_CRASH_HANDLER,
4150                                SIGNALS_IGNORE);
4151
4152         if (context->ignore_sigpipe)
4153                 (void) ignore_signals(SIGPIPE);
4154
4155         r = reset_signal_mask();
4156         if (r < 0) {
4157                 *exit_status = EXIT_SIGNAL_MASK;
4158                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4159         }
4160
4161         if (params->idle_pipe)
4162                 do_idle_pipe_dance(params->idle_pipe);
4163
4164         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4165          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4166          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4167          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4168
4169         log_forget_fds();
4170         log_set_open_when_needed(true);
4171
4172         /* In case anything used libc syslog(), close this here, too */
4173         closelog();
4174
4175         int keep_fds[n_fds + 3];
4176         memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4177         n_keep_fds = n_fds;
4178
4179         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4180         if (r < 0) {
4181                 *exit_status = EXIT_FDS;
4182                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4183         }
4184
4185 #if HAVE_LIBBPF
4186         if (unit->manager->restrict_fs) {
4187                 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4188                 if (bpf_map_fd < 0) {
4189                         *exit_status = EXIT_FDS;
4190                         return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4191                 }
4192
4193                 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4194                 if (r < 0) {
4195                         *exit_status = EXIT_FDS;
4196                         return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4197                 }
4198         }
4199 #endif
4200
4201         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4202         if (r < 0) {
4203                 *exit_status = EXIT_FDS;
4204                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4205         }
4206
4207         if (!context->same_pgrp &&
4208             setsid() < 0) {
4209                 *exit_status = EXIT_SETSID;
4210                 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4211         }
4212
4213         exec_context_tty_reset(context, params);
4214
4215         if (unit_shall_confirm_spawn(unit)) {
4216                 _cleanup_free_ char *cmdline = NULL;
4217
4218                 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4219                 if (!cmdline) {
4220                         *exit_status = EXIT_MEMORY;
4221                         return log_oom();
4222                 }
4223
4224                 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4225                 if (r != CONFIRM_EXECUTE) {
4226                         if (r == CONFIRM_PRETEND_SUCCESS) {
4227                                 *exit_status = EXIT_SUCCESS;
4228                                 return 0;
4229                         }
4230                         *exit_status = EXIT_CONFIRM;
4231                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4232                                                     "Execution cancelled by the user");
4233                 }
4234         }
4235
4236         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4237          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4238          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4239          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4240          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4241         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4242             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4243                 *exit_status = EXIT_MEMORY;
4244                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4245         }
4246
4247         if (context->dynamic_user && dcreds) {
4248                 _cleanup_strv_free_ char **suggested_paths = NULL;
4249
4250                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4251                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4252                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4253                         *exit_status = EXIT_USER;
4254                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4255                 }
4256
4257                 r = compile_suggested_paths(context, params, &suggested_paths);
4258                 if (r < 0) {
4259                         *exit_status = EXIT_MEMORY;
4260                         return log_oom();
4261                 }
4262
4263                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4264                 if (r < 0) {
4265                         *exit_status = EXIT_USER;
4266                         if (r == -EILSEQ)
4267                                 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4268                                                             "Failed to update dynamic user credentials: User or group with specified name already exists.");
4269                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4270                 }
4271
4272                 if (!uid_is_valid(uid)) {
4273                         *exit_status = EXIT_USER;
4274                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4275                 }
4276
4277                 if (!gid_is_valid(gid)) {
4278                         *exit_status = EXIT_USER;
4279                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4280                 }
4281
4282                 if (dcreds->user)
4283                         username = dcreds->user->name;
4284
4285         } else {
4286                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4287                 if (r < 0) {
4288                         *exit_status = EXIT_USER;
4289                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4290                 }
4291
4292                 r = get_fixed_group(context, &groupname, &gid);
4293                 if (r < 0) {
4294                         *exit_status = EXIT_GROUP;
4295                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4296                 }
4297         }
4298
4299         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4300         r = get_supplementary_groups(context, username, groupname, gid,
4301                                      &supplementary_gids, &ngids);
4302         if (r < 0) {
4303                 *exit_status = EXIT_GROUP;
4304                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4305         }
4306
4307         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4308         if (r < 0) {
4309                 *exit_status = EXIT_USER;
4310                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4311         }
4312
4313         user_lookup_fd = safe_close(user_lookup_fd);
4314
4315         r = acquire_home(context, uid, &home, &home_buffer);
4316         if (r < 0) {
4317                 *exit_status = EXIT_CHDIR;
4318                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4319         }
4320
4321         /* If a socket is connected to STDIN/STDOUT/STDERR, we
4322          * must sure to drop O_NONBLOCK */
4323         if (socket_fd >= 0)
4324                 (void) fd_nonblock(socket_fd, false);
4325
4326         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4327          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4328         if (params->cgroup_path) {
4329                 _cleanup_free_ char *p = NULL;
4330
4331                 r = exec_parameters_get_cgroup_path(params, &p);
4332                 if (r < 0) {
4333                         *exit_status = EXIT_CGROUP;
4334                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4335                 }
4336
4337                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4338                 if (r == -EUCLEAN) {
4339                         *exit_status = EXIT_CGROUP;
4340                         return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4341                                                     "because the cgroup or one of its parents or "
4342                                                     "siblings is in the threaded mode: %m", p);
4343                 }
4344                 if (r < 0) {
4345                         *exit_status = EXIT_CGROUP;
4346                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4347                 }
4348         }
4349
4350         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4351                 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4352                 if (r < 0) {
4353                         *exit_status = EXIT_NETWORK;
4354                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4355                 }
4356         }
4357
4358         if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4359                 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4360                 if (r < 0) {
4361                         *exit_status = EXIT_NAMESPACE;
4362                         return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4363                 }
4364         }
4365
4366         r = setup_input(context, params, socket_fd, named_iofds);
4367         if (r < 0) {
4368                 *exit_status = EXIT_STDIN;
4369                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4370         }
4371
4372         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4373         if (r < 0) {
4374                 *exit_status = EXIT_STDOUT;
4375                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4376         }
4377
4378         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4379         if (r < 0) {
4380                 *exit_status = EXIT_STDERR;
4381                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4382         }
4383
4384         if (context->oom_score_adjust_set) {
4385                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4386                  * prohibit write access to this file, and we shouldn't trip up over that. */
4387                 r = set_oom_score_adjust(context->oom_score_adjust);
4388                 if (ERRNO_IS_PRIVILEGE(r))
4389                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4390                 else if (r < 0) {
4391                         *exit_status = EXIT_OOM_ADJUST;
4392                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4393                 }
4394         }
4395
4396         if (context->coredump_filter_set) {
4397                 r = set_coredump_filter(context->coredump_filter);
4398                 if (ERRNO_IS_PRIVILEGE(r))
4399                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4400                 else if (r < 0)
4401                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4402         }
4403
4404         if (context->nice_set) {
4405                 r = setpriority_closest(context->nice);
4406                 if (r < 0)
4407                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4408         }
4409
4410         if (context->cpu_sched_set) {
4411                 struct sched_param param = {
4412                         .sched_priority = context->cpu_sched_priority,
4413                 };
4414
4415                 r = sched_setscheduler(0,
4416                                        context->cpu_sched_policy |
4417                                        (context->cpu_sched_reset_on_fork ?
4418                                         SCHED_RESET_ON_FORK : 0),
4419                                        &param);
4420                 if (r < 0) {
4421                         *exit_status = EXIT_SETSCHEDULER;
4422                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4423                 }
4424         }
4425
4426         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4427                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4428                 const CPUSet *cpu_set;
4429
4430                 if (context->cpu_affinity_from_numa) {
4431                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4432                         if (r < 0) {
4433                                 *exit_status = EXIT_CPUAFFINITY;
4434                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4435                         }
4436
4437                         cpu_set = &converted_cpu_set;
4438                 } else
4439                         cpu_set = &context->cpu_set;
4440
4441                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4442                         *exit_status = EXIT_CPUAFFINITY;
4443                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4444                 }
4445         }
4446
4447         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4448                 r = apply_numa_policy(&context->numa_policy);
4449                 if (r == -EOPNOTSUPP)
4450                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4451                 else if (r < 0) {
4452                         *exit_status = EXIT_NUMA_POLICY;
4453                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4454                 }
4455         }
4456
4457         if (context->ioprio_set)
4458                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4459                         *exit_status = EXIT_IOPRIO;
4460                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4461                 }
4462
4463         if (context->timer_slack_nsec != NSEC_INFINITY)
4464                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4465                         *exit_status = EXIT_TIMERSLACK;
4466                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4467                 }
4468
4469         if (context->personality != PERSONALITY_INVALID) {
4470                 r = safe_personality(context->personality);
4471                 if (r < 0) {
4472                         *exit_status = EXIT_PERSONALITY;
4473                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4474                 }
4475         }
4476
4477         if (context->utmp_id) {
4478                 const char *line = context->tty_path ?
4479                         (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4480                         NULL;
4481                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4482                                       line,
4483                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
4484                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4485                                       USER_PROCESS,
4486                                       username);
4487         }
4488
4489         if (uid_is_valid(uid)) {
4490                 r = chown_terminal(STDIN_FILENO, uid);
4491                 if (r < 0) {
4492                         *exit_status = EXIT_STDIN;
4493                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4494                 }
4495         }
4496
4497         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4498          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4499          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4500          * touch a single hierarchy too. */
4501         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4502                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4503                 if (r < 0) {
4504                         *exit_status = EXIT_CGROUP;
4505                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4506                 }
4507         }
4508
4509         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4510
4511         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4512                 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4513                 if (r < 0)
4514                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4515         }
4516
4517         if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4518                 r = setup_credentials(context, params, unit->id, uid);
4519                 if (r < 0) {
4520                         *exit_status = EXIT_CREDENTIALS;
4521                         return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4522                 }
4523         }
4524
4525         r = build_environment(
4526                         unit,
4527                         context,
4528                         params,
4529                         n_fds,
4530                         home,
4531                         username,
4532                         shell,
4533                         journal_stream_dev,
4534                         journal_stream_ino,
4535                         &our_env);
4536         if (r < 0) {
4537                 *exit_status = EXIT_MEMORY;
4538                 return log_oom();
4539         }
4540
4541         r = build_pass_environment(context, &pass_env);
4542         if (r < 0) {
4543                 *exit_status = EXIT_MEMORY;
4544                 return log_oom();
4545         }
4546
4547         /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4548          * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4549          * not specify PATH but the unit has ExecSearchPath. */
4550         if (!strv_isempty(context->exec_search_path)) {
4551                 _cleanup_free_ char *joined = NULL;
4552
4553                 joined = strv_join(context->exec_search_path, ":");
4554                 if (!joined) {
4555                         *exit_status = EXIT_MEMORY;
4556                         return log_oom();
4557                 }
4558
4559                 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4560                 if (r < 0) {
4561                         *exit_status = EXIT_MEMORY;
4562                         return log_oom();
4563                 }
4564         }
4565
4566         accum_env = strv_env_merge(params->environment,
4567                                    our_env,
4568                                    joined_exec_search_path,
4569                                    pass_env,
4570                                    context->environment,
4571                                    files_env);
4572         if (!accum_env) {
4573                 *exit_status = EXIT_MEMORY;
4574                 return log_oom();
4575         }
4576         accum_env = strv_env_clean(accum_env);
4577
4578         (void) umask(context->umask);
4579
4580         r = setup_keyring(unit, context, params, uid, gid);
4581         if (r < 0) {
4582                 *exit_status = EXIT_KEYRING;
4583                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4584         }
4585
4586         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4587          * from it. */
4588         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4589
4590         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4591          * for it, and the kernel doesn't actually support ambient caps. */
4592         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4593
4594         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4595          * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4596          * desired. */
4597         if (needs_ambient_hack)
4598                 needs_setuid = false;
4599         else
4600                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4601
4602         if (needs_sandboxing) {
4603                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4604                  * /sys being present. The actual MAC context application will happen later, as late as
4605                  * possible, to avoid impacting our own code paths. */
4606
4607 #if HAVE_SELINUX
4608                 use_selinux = mac_selinux_use();
4609 #endif
4610 #if ENABLE_SMACK
4611                 use_smack = mac_smack_use();
4612 #endif
4613 #if HAVE_APPARMOR
4614                 use_apparmor = mac_apparmor_use();
4615 #endif
4616         }
4617
4618         if (needs_sandboxing) {
4619                 int which_failed;
4620
4621                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4622                  * is set here. (See below.) */
4623
4624                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4625                 if (r < 0) {
4626                         *exit_status = EXIT_LIMITS;
4627                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4628                 }
4629         }
4630
4631         if (needs_setuid && context->pam_name && username) {
4632                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4633                  * wins here. (See above.) */
4634
4635                 /* All fds passed in the fds array will be closed in the pam child process. */
4636                 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4637                 if (r < 0) {
4638                         *exit_status = EXIT_PAM;
4639                         return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4640                 }
4641
4642                 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4643                 if (ngids_after_pam < 0) {
4644                         *exit_status = EXIT_MEMORY;
4645                         return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4646                 }
4647         }
4648
4649         if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4650                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4651                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4652                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4653
4654                 userns_set_up = true;
4655                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4656                 if (r < 0) {
4657                         *exit_status = EXIT_USER;
4658                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4659                 }
4660         }
4661
4662         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4663
4664                 if (ns_type_supported(NAMESPACE_NET)) {
4665                         r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4666                         if (r == -EPERM)
4667                                 log_unit_warning_errno(unit, r,
4668                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4669                         else if (r < 0) {
4670                                 *exit_status = EXIT_NETWORK;
4671                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4672                         }
4673                 } else if (context->network_namespace_path) {
4674                         *exit_status = EXIT_NETWORK;
4675                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4676                                                     "NetworkNamespacePath= is not supported, refusing.");
4677                 } else
4678                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4679         }
4680
4681         if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4682
4683                 if (ns_type_supported(NAMESPACE_IPC)) {
4684                         r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4685                         if (r == -EPERM)
4686                                 log_unit_warning_errno(unit, r,
4687                                                        "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4688                         else if (r < 0) {
4689                                 *exit_status = EXIT_NAMESPACE;
4690                                 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4691                         }
4692                 } else if (context->ipc_namespace_path) {
4693                         *exit_status = EXIT_NAMESPACE;
4694                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4695                                                     "IPCNamespacePath= is not supported, refusing.");
4696                 } else
4697                         log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4698         }
4699
4700         if (needs_mount_namespace) {
4701                 _cleanup_free_ char *error_path = NULL;
4702
4703                 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4704                 if (r < 0) {
4705                         *exit_status = EXIT_NAMESPACE;
4706                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4707                                                     error_path ? ": " : "", strempty(error_path));
4708                 }
4709         }
4710
4711         if (needs_sandboxing) {
4712                 r = apply_protect_hostname(unit, context, exit_status);
4713                 if (r < 0)
4714                         return r;
4715         }
4716
4717         /* Drop groups as early as possible.
4718          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4719          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4720         if (needs_setuid) {
4721                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4722                 int ngids_to_enforce = 0;
4723
4724                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4725                                                    ngids,
4726                                                    gids_after_pam,
4727                                                    ngids_after_pam,
4728                                                    &gids_to_enforce);
4729                 if (ngids_to_enforce < 0) {
4730                         *exit_status = EXIT_MEMORY;
4731                         return log_unit_error_errno(unit,
4732                                                     ngids_to_enforce,
4733                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
4734                 }
4735
4736                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4737                 if (r < 0) {
4738                         *exit_status = EXIT_GROUP;
4739                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4740                 }
4741         }
4742
4743         /* If the user namespace was not set up above, try to do it now.
4744          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4745          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4746          * case of mount namespaces being less privileged when the mount point list is copied from a
4747          * different user namespace). */
4748
4749         if (needs_sandboxing && context->private_users && !userns_set_up) {
4750                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4751                 if (r < 0) {
4752                         *exit_status = EXIT_USER;
4753                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4754                 }
4755         }
4756
4757         /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4758          * shall execute. */
4759
4760         _cleanup_free_ char *executable = NULL;
4761         _cleanup_close_ int executable_fd = -1;
4762         r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4763         if (r < 0) {
4764                 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4765                         log_unit_struct_errno(unit, LOG_INFO, r,
4766                                               "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4767                                               LOG_UNIT_INVOCATION_ID(unit),
4768                                               LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4769                                                                command->path),
4770                                               "EXECUTABLE=%s", command->path);
4771                         return 0;
4772                 }
4773
4774                 *exit_status = EXIT_EXEC;
4775
4776                 return log_unit_struct_errno(unit, LOG_INFO, r,
4777                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4778                                              LOG_UNIT_INVOCATION_ID(unit),
4779                                              LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4780                                                               command->path),
4781                                              "EXECUTABLE=%s", command->path);
4782         }
4783
4784         r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4785         if (r < 0) {
4786                 *exit_status = EXIT_FDS;
4787                 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4788         }
4789
4790 #if HAVE_SELINUX
4791         if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4792                 int fd = -1;
4793
4794                 if (socket_fd >= 0)
4795                         fd = socket_fd;
4796                 else if (params->n_socket_fds == 1)
4797                         /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4798                          * use context from that fd to compute the label. */
4799                         fd = params->fds[0];
4800
4801                 if (fd >= 0) {
4802                         r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4803                         if (r < 0) {
4804                                 if (!context->selinux_context_ignore) {
4805                                         *exit_status = EXIT_SELINUX_CONTEXT;
4806                                         return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4807                                 }
4808                                 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4809                         }
4810                 }
4811         }
4812 #endif
4813
4814         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4815          * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4816          * however if we have it as we want to keep it open until the final execve(). */
4817
4818         r = close_all_fds(keep_fds, n_keep_fds);
4819         if (r >= 0)
4820                 r = shift_fds(fds, n_fds);
4821         if (r >= 0)
4822                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4823         if (r < 0) {
4824                 *exit_status = EXIT_FDS;
4825                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4826         }
4827
4828         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4829          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4830          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4831          * came this far. */
4832
4833         secure_bits = context->secure_bits;
4834
4835         if (needs_sandboxing) {
4836                 uint64_t bset;
4837
4838                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4839                  * requested. (Note this is placed after the general resource limit initialization, see
4840                  * above, in order to take precedence.) */
4841                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4842                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4843                                 *exit_status = EXIT_LIMITS;
4844                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4845                         }
4846                 }
4847
4848 #if ENABLE_SMACK
4849                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4850                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4851                 if (use_smack) {
4852                         r = setup_smack(unit->manager, context, executable_fd);
4853                         if (r < 0 && !context->smack_process_label_ignore) {
4854                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4855                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4856                         }
4857                 }
4858 #endif
4859
4860                 bset = context->capability_bounding_set;
4861                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4862                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4863                  * instead of us doing that */
4864                 if (needs_ambient_hack)
4865                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
4866                                 (UINT64_C(1) << CAP_SETUID) |
4867                                 (UINT64_C(1) << CAP_SETGID);
4868
4869                 if (!cap_test_all(bset)) {
4870                         r = capability_bounding_set_drop(bset, false);
4871                         if (r < 0) {
4872                                 *exit_status = EXIT_CAPABILITIES;
4873                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4874                         }
4875                 }
4876
4877                 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4878                  * keep-caps set.
4879                  * To be able to raise the ambient capabilities after setresuid() they have to be
4880                  * added to the inherited set and keep caps has to be set (done in enforce_user()).
4881                  * After setresuid() the ambient capabilities can be raised as they are present in
4882                  * the permitted and inheritable set. However it is possible that someone wants to
4883                  * set ambient capabilities without changing the user, so we also set the ambient
4884                  * capabilities here.
4885                  * The requested ambient capabilities are raised in the inheritable set if the
4886                  * second argument is true. */
4887                 if (!needs_ambient_hack) {
4888                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
4889                         if (r < 0) {
4890                                 *exit_status = EXIT_CAPABILITIES;
4891                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4892                         }
4893                 }
4894         }
4895
4896         /* chroot to root directory first, before we lose the ability to chroot */
4897         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4898         if (r < 0)
4899                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4900
4901         if (needs_setuid) {
4902                 if (uid_is_valid(uid)) {
4903                         r = enforce_user(context, uid);
4904                         if (r < 0) {
4905                                 *exit_status = EXIT_USER;
4906                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4907                         }
4908
4909                         if (!needs_ambient_hack &&
4910                             context->capability_ambient_set != 0) {
4911
4912                                 /* Raise the ambient capabilities after user change. */
4913                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4914                                 if (r < 0) {
4915                                         *exit_status = EXIT_CAPABILITIES;
4916                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4917                                 }
4918                         }
4919                 }
4920         }
4921
4922         /* Apply working directory here, because the working directory might be on NFS and only the user running
4923          * this service might have the correct privilege to change to the working directory */
4924         r = apply_working_directory(context, params, home, exit_status);
4925         if (r < 0)
4926                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4927
4928         if (needs_sandboxing) {
4929                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4930                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4931                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4932                  * are restricted. */
4933
4934 #if HAVE_SELINUX
4935                 if (use_selinux) {
4936                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4937
4938                         if (exec_context) {
4939                                 r = setexeccon(exec_context);
4940                                 if (r < 0) {
4941                                         if (!context->selinux_context_ignore) {
4942                                                 *exit_status = EXIT_SELINUX_CONTEXT;
4943                                                 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4944                                         }
4945                                         log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4946                                 }
4947                         }
4948                 }
4949 #endif
4950
4951 #if HAVE_APPARMOR
4952                 if (use_apparmor && context->apparmor_profile) {
4953                         r = aa_change_onexec(context->apparmor_profile);
4954                         if (r < 0 && !context->apparmor_profile_ignore) {
4955                                 *exit_status = EXIT_APPARMOR_PROFILE;
4956                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4957                         }
4958                 }
4959 #endif
4960
4961                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4962                  * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4963                  * CAP_SETPCAP. */
4964                 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4965                         /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4966                          * effective set here.
4967                          * The effective set is overwritten during execve with the following values:
4968                          * - ambient set (for non-root processes)
4969                          * - (inheritable | bounding) set (for root processes)
4970                          *
4971                          * Hence there is no security impact to raise it in the effective set before execve
4972                          */
4973                         r = capability_gain_cap_setpcap(NULL);
4974                         if (r < 0) {
4975                                 *exit_status = EXIT_CAPABILITIES;
4976                                 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4977                         }
4978                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4979                                 *exit_status = EXIT_SECUREBITS;
4980                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4981                         }
4982                 }
4983
4984                 if (context_has_no_new_privileges(context))
4985                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4986                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4987                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4988                         }
4989
4990 #if HAVE_SECCOMP
4991                 r = apply_address_families(unit, context);
4992                 if (r < 0) {
4993                         *exit_status = EXIT_ADDRESS_FAMILIES;
4994                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4995                 }
4996
4997                 r = apply_memory_deny_write_execute(unit, context);
4998                 if (r < 0) {
4999                         *exit_status = EXIT_SECCOMP;
5000                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5001                 }
5002
5003                 r = apply_restrict_realtime(unit, context);
5004                 if (r < 0) {
5005                         *exit_status = EXIT_SECCOMP;
5006                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5007                 }
5008
5009                 r = apply_restrict_suid_sgid(unit, context);
5010                 if (r < 0) {
5011                         *exit_status = EXIT_SECCOMP;
5012                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5013                 }
5014
5015                 r = apply_restrict_namespaces(unit, context);
5016                 if (r < 0) {
5017                         *exit_status = EXIT_SECCOMP;
5018                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5019                 }
5020
5021                 r = apply_protect_sysctl(unit, context);
5022                 if (r < 0) {
5023                         *exit_status = EXIT_SECCOMP;
5024                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5025                 }
5026
5027                 r = apply_protect_kernel_modules(unit, context);
5028                 if (r < 0) {
5029                         *exit_status = EXIT_SECCOMP;
5030                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5031                 }
5032
5033                 r = apply_protect_kernel_logs(unit, context);
5034                 if (r < 0) {
5035                         *exit_status = EXIT_SECCOMP;
5036                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5037                 }
5038
5039                 r = apply_protect_clock(unit, context);
5040                 if (r < 0) {
5041                         *exit_status = EXIT_SECCOMP;
5042                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5043                 }
5044
5045                 r = apply_private_devices(unit, context);
5046                 if (r < 0) {
5047                         *exit_status = EXIT_SECCOMP;
5048                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5049                 }
5050
5051                 r = apply_syscall_archs(unit, context);
5052                 if (r < 0) {
5053                         *exit_status = EXIT_SECCOMP;
5054                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5055                 }
5056
5057                 r = apply_lock_personality(unit, context);
5058                 if (r < 0) {
5059                         *exit_status = EXIT_SECCOMP;
5060                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5061                 }
5062
5063                 r = apply_syscall_log(unit, context);
5064                 if (r < 0) {
5065                         *exit_status = EXIT_SECCOMP;
5066                         return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5067                 }
5068
5069                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5070                  * by the filter as little as possible. */
5071                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5072                 if (r < 0) {
5073                         *exit_status = EXIT_SECCOMP;
5074                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5075                 }
5076 #endif
5077
5078 #if HAVE_LIBBPF
5079                 r = apply_restrict_filesystems(unit, context);
5080                 if (r < 0) {
5081                         *exit_status = EXIT_BPF;
5082                         return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5083                 }
5084 #endif
5085
5086         }
5087
5088         if (!strv_isempty(context->unset_environment)) {
5089                 char **ee = NULL;
5090
5091                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5092                 if (!ee) {
5093                         *exit_status = EXIT_MEMORY;
5094                         return log_oom();
5095                 }
5096
5097                 strv_free_and_replace(accum_env, ee);
5098         }
5099
5100         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5101                 replaced_argv = replace_env_argv(command->argv, accum_env);
5102                 if (!replaced_argv) {
5103                         *exit_status = EXIT_MEMORY;
5104                         return log_oom();
5105                 }
5106                 final_argv = replaced_argv;
5107         } else
5108                 final_argv = command->argv;
5109
5110         if (DEBUG_LOGGING) {
5111                 _cleanup_free_ char *line = NULL;
5112
5113                 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5114                 if (!line) {
5115                         *exit_status = EXIT_MEMORY;
5116                         return log_oom();
5117                 }
5118
5119                 log_unit_struct(unit, LOG_DEBUG,
5120                                 "EXECUTABLE=%s", executable,
5121                                 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5122         }
5123
5124         if (exec_fd >= 0) {
5125                 uint8_t hot = 1;
5126
5127                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5128                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5129
5130                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5131                         *exit_status = EXIT_EXEC;
5132                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5133                 }
5134         }
5135
5136         r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5137
5138         if (exec_fd >= 0) {
5139                 uint8_t hot = 0;
5140
5141                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5142                  * that POLLHUP on it no longer means execve() succeeded. */
5143
5144                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5145                         *exit_status = EXIT_EXEC;
5146                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5147                 }
5148         }
5149
5150         *exit_status = EXIT_EXEC;
5151         return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5152 }
5153
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below
 * already needs them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5156
/* Forks off a child process which configures itself according to 'context' and 'params' and then runs
 * 'command'. The child side is implemented by exec_child(); the child never returns from this function
 * but always terminates via _exit().
 *
 * Before forking, the named stdio file descriptors and the environment files are resolved in the parent,
 * and, if exec_parameters_get_cgroup_path() reports that a child cgroup is in use, that cgroup is created.
 * After forking, the parent attaches the new process to that cgroup (if any), records the start in
 * command->exec_status and stores the PID in *ret.
 *
 * Returns 0 on success, or a negative errno-style error, in which case *ret is not written. */
int exec_spawn(Unit *unit,
               ExecCommand *command,
               const ExecContext *context,
               const ExecParameters *params,
               ExecRuntime *runtime,
               DynamicCreds *dcreds,
               pid_t *ret) {

        int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
        _cleanup_free_ char *subcgroup_path = NULL;
        _cleanup_strv_free_ char **files_env = NULL;
        size_t n_storage_fds = 0, n_socket_fds = 0;
        _cleanup_free_ char *line = NULL;
        pid_t pid;

        assert(unit);
        assert(command);
        assert(context);
        assert(ret);
        assert(params);
        /* An fd array is only mandatory if at least one fd is announced */
        assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));

        /* If any of stdin/stdout/stderr shall be connected to the socket, exactly one socket fd must have
         * been passed. In that case it is handed to the child as 'socket_fd', and no fd array is passed on
         * ('fds' stays NULL and both counters stay 0). Otherwise the complete fd array is passed through. */
        if (context->std_input == EXEC_INPUT_SOCKET ||
            context->std_output == EXEC_OUTPUT_SOCKET ||
            context->std_error == EXEC_OUTPUT_SOCKET) {

                if (params->n_socket_fds > 1)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");

                if (params->n_socket_fds == 0)
                        return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");

                socket_fd = params->fds[0];
        } else {
                socket_fd = -1;
                fds = params->fds;
                n_socket_fds = params->n_socket_fds;
                n_storage_fds = params->n_storage_fds;
        }

        /* Resolve the fds that stdin/stdout/stderr refer to by name, if any */
        r = exec_context_named_iofds(context, params, named_iofds);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");

        /* Read the environment files in the parent, so that failures are reported before we fork */
        r = exec_context_load_environment(unit, context, &files_env);
        if (r < 0)
                return log_unit_error_errno(unit, r, "Failed to load environment files: %m");

        /* Printable form of the command line, used for the debug message below only */
        line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
        if (!line)
                return log_oom();

        /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
           and, until the next SELinux policy changes, we save further reloads in future children. */
        mac_selinux_maybe_reload();

        log_unit_struct(unit, LOG_DEBUG,
                        LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
                        "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
                                                           the mount namespace in the child, but we want to log
                                                           from the parent, so we need to use the (possibly
                                                           inaccurate) path here. */
                        LOG_UNIT_INVOCATION_ID(unit));

        if (params->cgroup_path) {
                r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
                if (r < 0)
                        return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
                if (r > 0) { /* We are using a child cgroup */
                        r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
                        if (r < 0)
                                return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);

                        /* Normally we would not propagate the oomd xattrs to children but since we created this
                         * sub-cgroup internally we should do it. */
                        cgroup_oomd_xattr_apply(unit, subcgroup_path);
                }
        }

        pid = fork();
        if (pid < 0)
                return log_unit_error_errno(unit, errno, "Failed to fork: %m");

        if (pid == 0) {
                /* Child side. exec_child() only returns if the command was not executed; on failure it
                 * reports the step that failed via 'exit_status', which becomes our exit code. */
                int exit_status = EXIT_SUCCESS;

                r = exec_child(unit,
                               command,
                               context,
                               params,
                               runtime,
                               dcreds,
                               socket_fd,
                               named_iofds,
                               fds,
                               n_socket_fds,
                               n_storage_fds,
                               files_env,
                               unit->manager->user_lookup_fds[1],
                               &exit_status);

                if (r < 0) {
                        const char *status =
                                exit_status_to_string(exit_status,
                                                      EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);

                        log_unit_struct_errno(unit, LOG_ERR, r,
                                              "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
                                              LOG_UNIT_INVOCATION_ID(unit),
                                              LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
                                                               status, command->path),
                                              "EXECUTABLE=%s", command->path);
                }

                /* Leave via _exit() so that the child never continues in the parent's code path */
                _exit(exit_status);
        }

        /* Parent side from here on */
        log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);

        /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
         * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
         * process will be killed too). */
        if (subcgroup_path)
                (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);

        exec_status_start(&command->exec_status, pid);

        *ret = pid;
        return 0;
}
5287
5288 void exec_context_init(ExecContext *c) {
5289         assert(c);
5290
5291         c->umask = 0022;
5292         c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5293         c->cpu_sched_policy = SCHED_OTHER;
5294         c->syslog_priority = LOG_DAEMON|LOG_INFO;
5295         c->syslog_level_prefix = true;
5296         c->ignore_sigpipe = true;
5297         c->timer_slack_nsec = NSEC_INFINITY;
5298         c->personality = PERSONALITY_INVALID;
5299         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5300                 c->directories[t].mode = 0755;
5301         c->timeout_clean_usec = USEC_INFINITY;
5302         c->capability_bounding_set = CAP_ALL;
5303         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5304         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5305         c->log_level_max = -1;
5306 #if HAVE_SECCOMP
5307         c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5308 #endif
5309         c->tty_rows = UINT_MAX;
5310         c->tty_cols = UINT_MAX;
5311         numa_policy_reset(&c->numa_policy);
5312 }
5313
/* Releases everything the ExecContext owns and resets the corresponding fields (pointers to NULL, sizes
 * and counters to 0). The ExecContext object itself is not freed. */
void exec_context_done(ExecContext *c) {
        assert(c);

        /* Environment configuration */
        c->environment = strv_free(c->environment);
        c->environment_files = strv_free(c->environment_files);
        c->pass_environment = strv_free(c->pass_environment);
        c->unset_environment = strv_free(c->unset_environment);

        rlimit_free_all(c->rlimit);

        /* Per-stream (stdin, stdout, stderr) fd names and file paths */
        for (size_t l = 0; l < 3; l++) {
                c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
                c->stdio_file[l] = mfree(c->stdio_file[l]);
        }

        /* Working/root directory, root image and its verity data, extensions, TTY, identity */
        c->working_directory = mfree(c->working_directory);
        c->root_directory = mfree(c->root_directory);
        c->root_image = mfree(c->root_image);
        c->root_image_options = mount_options_free_all(c->root_image_options);
        c->root_hash = mfree(c->root_hash);
        c->root_hash_size = 0;
        c->root_hash_path = mfree(c->root_hash_path);
        c->root_hash_sig = mfree(c->root_hash_sig);
        c->root_hash_sig_size = 0;
        c->root_hash_sig_path = mfree(c->root_hash_sig_path);
        c->root_verity = mfree(c->root_verity);
        c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
        c->extension_directories = strv_free(c->extension_directories);
        c->tty_path = mfree(c->tty_path);
        c->syslog_identifier = mfree(c->syslog_identifier);
        c->user = mfree(c->user);
        c->group = mfree(c->group);

        c->supplementary_groups = strv_free(c->supplementary_groups);

        c->pam_name = mfree(c->pam_name);

        /* Path lists */
        c->read_only_paths = strv_free(c->read_only_paths);
        c->read_write_paths = strv_free(c->read_write_paths);
        c->inaccessible_paths = strv_free(c->inaccessible_paths);
        c->exec_paths = strv_free(c->exec_paths);
        c->no_exec_paths = strv_free(c->no_exec_paths);
        c->exec_search_path = strv_free(c->exec_search_path);

        /* Bind mounts and temporary file systems: the arrays are freed via their helpers, pointer and
         * counter are reset by hand afterwards */
        bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
        c->bind_mounts = NULL;
        c->n_bind_mounts = 0;
        temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
        c->temporary_filesystems = NULL;
        c->n_temporary_filesystems = 0;
        c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);

        cpu_set_reset(&c->cpu_set);
        numa_policy_reset(&c->numa_policy);

        /* utmp identifier and security labels */
        c->utmp_id = mfree(c->utmp_id);
        c->selinux_context = mfree(c->selinux_context);
        c->apparmor_profile = mfree(c->apparmor_profile);
        c->smack_process_label = mfree(c->smack_process_label);

        c->restrict_filesystems = set_free(c->restrict_filesystems);

        /* System call and address family filter data */
        c->syscall_filter = hashmap_free(c->syscall_filter);
        c->syscall_archs = set_free(c->syscall_archs);
        c->address_families = set_free(c->address_families);

        for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
                exec_directory_done(&c->directories[t]);

        /* Logging settings; -1 is the same value exec_context_init() assigns to log_level_max */
        c->log_level_max = -1;

        exec_context_free_log_extra_fields(c);

        c->log_ratelimit_interval_usec = 0;
        c->log_ratelimit_burst = 0;

        c->stdin_data = mfree(c->stdin_data);
        c->stdin_data_size = 0;

        c->network_namespace_path = mfree(c->network_namespace_path);
        c->ipc_namespace_path = mfree(c->ipc_namespace_path);

        c->log_namespace = mfree(c->log_namespace);

        /* Credentials */
        c->load_credentials = hashmap_free(c->load_credentials);
        c->set_credentials = hashmap_free(c->set_credentials);
}
5401
5402 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5403         assert(c);
5404
5405         if (!runtime_prefix)
5406                 return 0;
5407
5408         for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5409                 _cleanup_free_ char *p = NULL;
5410
5411                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5412                         p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5413                 else
5414                         p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5415                 if (!p)
5416                         return -ENOMEM;
5417
5418                 /* We execute this synchronously, since we need to be sure this is gone when we start the
5419                  * service next. */
5420                 (void) rm_rf(p, REMOVE_ROOT);
5421
5422                 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5423                         _cleanup_free_ char *symlink_abs = NULL;
5424
5425                         if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5426                                 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5427                         else
5428                                 symlink_abs = path_join(runtime_prefix, *symlink);
5429                         if (!symlink_abs)
5430                                 return -ENOMEM;
5431
5432                         (void) unlink(symlink_abs);
5433                 }
5434
5435         }
5436
5437         return 0;
5438 }
5439
5440 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5441         _cleanup_free_ char *p = NULL;
5442
5443         assert(c);
5444
5445         if (!runtime_prefix || !unit)
5446                 return 0;
5447
5448         p = path_join(runtime_prefix, "credentials", unit);
5449         if (!p)
5450                 return -ENOMEM;
5451
5452         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5453          * unmount it, and afterwards remove the mount point */
5454         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5455         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5456
5457         return 0;
5458 }
5459
5460 static void exec_command_done(ExecCommand *c) {
5461         assert(c);
5462
5463         c->path = mfree(c->path);
5464         c->argv = strv_free(c->argv);
5465 }
5466
5467 void exec_command_done_array(ExecCommand *c, size_t n) {
5468         for (size_t i = 0; i < n; i++)
5469                 exec_command_done(c+i);
5470 }
5471
5472 ExecCommand* exec_command_free_list(ExecCommand *c) {
5473         ExecCommand *i;
5474
5475         while ((i = c)) {
5476                 LIST_REMOVE(command, c, i);
5477                 exec_command_done(i);
5478                 free(i);
5479         }
5480
5481         return NULL;
5482 }
5483
5484 void exec_command_free_array(ExecCommand **c, size_t n) {
5485         for (size_t i = 0; i < n; i++)
5486                 c[i] = exec_command_free_list(c[i]);
5487 }
5488
5489 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5490         for (size_t i = 0; i < n; i++)
5491                 exec_status_reset(&c[i].exec_status);
5492 }
5493
5494 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5495         for (size_t i = 0; i < n; i++)
5496                 LIST_FOREACH(command, z, c[i])
5497                         exec_status_reset(&z->exec_status);
5498 }
5499
/* Context handed to invalid_env() as 'userdata' while the assignments read from an environment file are
 * cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file the assignment was read from */
} InvalidEnvInfo;
5504
5505 static void invalid_env(const char *p, void *userdata) {
5506         InvalidEnvInfo *info = userdata;
5507
5508         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5509 }
5510
5511 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5512         assert(c);
5513
5514         switch (fd_index) {
5515
5516         case STDIN_FILENO:
5517                 if (c->std_input != EXEC_INPUT_NAMED_FD)
5518                         return NULL;
5519
5520                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5521
5522         case STDOUT_FILENO:
5523                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5524                         return NULL;
5525
5526                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5527
5528         case STDERR_FILENO:
5529                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5530                         return NULL;
5531
5532                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5533
5534         default:
5535                 return NULL;
5536         }
5537 }
5538
5539 static int exec_context_named_iofds(
5540                 const ExecContext *c,
5541                 const ExecParameters *p,
5542                 int named_iofds[static 3]) {
5543
5544         size_t targets;
5545         const char* stdio_fdname[3];
5546         size_t n_fds;
5547
5548         assert(c);
5549         assert(p);
5550         assert(named_iofds);
5551
5552         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5553                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5554                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
5555
5556         for (size_t i = 0; i < 3; i++)
5557                 stdio_fdname[i] = exec_context_fdname(c, i);
5558
5559         n_fds = p->n_storage_fds + p->n_socket_fds;
5560
5561         for (size_t i = 0; i < n_fds  && targets > 0; i++)
5562                 if (named_iofds[STDIN_FILENO] < 0 &&
5563                     c->std_input == EXEC_INPUT_NAMED_FD &&
5564                     stdio_fdname[STDIN_FILENO] &&
5565                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5566
5567                         named_iofds[STDIN_FILENO] = p->fds[i];
5568                         targets--;
5569
5570                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5571                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
5572                            stdio_fdname[STDOUT_FILENO] &&
5573                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5574
5575                         named_iofds[STDOUT_FILENO] = p->fds[i];
5576                         targets--;
5577
5578                 } else if (named_iofds[STDERR_FILENO] < 0 &&
5579                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
5580                            stdio_fdname[STDERR_FILENO] &&
5581                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5582
5583                         named_iofds[STDERR_FILENO] = p->fds[i];
5584                         targets--;
5585                 }
5586
5587         return targets == 0 ? 0 : -ENOENT;
5588 }
5589
5590 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5591         _cleanup_strv_free_ char **v = NULL;
5592         int r;
5593
5594         assert(c);
5595         assert(ret);
5596
5597         STRV_FOREACH(i, c->environment_files) {
5598                 _cleanup_globfree_ glob_t pglob = {};
5599                 bool ignore = false;
5600                 char *fn = *i;
5601
5602                 if (fn[0] == '-') {
5603                         ignore = true;
5604                         fn++;
5605                 }
5606
5607                 if (!path_is_absolute(fn)) {
5608                         if (ignore)
5609                                 continue;
5610                         return -EINVAL;
5611                 }
5612
5613                 /* Filename supports globbing, take all matching files */
5614                 r = safe_glob(fn, 0, &pglob);
5615                 if (r < 0) {
5616                         if (ignore)
5617                                 continue;
5618                         return r;
5619                 }
5620
5621                 /* When we don't match anything, -ENOENT should be returned */
5622                 assert(pglob.gl_pathc > 0);
5623
5624                 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5625                         _cleanup_strv_free_ char **p = NULL;
5626
5627                         r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5628                         if (r < 0) {
5629                                 if (ignore)
5630                                         continue;
5631                                 return r;
5632                         }
5633
5634                         /* Log invalid environment variables with filename */
5635                         if (p) {
5636                                 InvalidEnvInfo info = {
5637                                         .unit = unit,
5638                                         .path = pglob.gl_pathv[n]
5639                                 };
5640
5641                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
5642                         }
5643
5644                         if (!v)
5645                                 v = TAKE_PTR(p);
5646                         else {
5647                                 char **m = strv_env_merge(v, p);
5648                                 if (!m)
5649                                         return -ENOMEM;
5650
5651                                 strv_free_and_replace(v, m);
5652                         }
5653                 }
5654         }
5655
5656         *ret = TAKE_PTR(v);
5657
5658         return 0;
5659 }
5660
5661 static bool tty_may_match_dev_console(const char *tty) {
5662         _cleanup_free_ char *resolved = NULL;
5663
5664         if (!tty)
5665                 return true;
5666
5667         tty = skip_dev_prefix(tty);
5668
5669         /* trivial identity? */
5670         if (streq(tty, "console"))
5671                 return true;
5672
5673         if (resolve_dev_console(&resolved) < 0)
5674                 return true; /* if we could not resolve, assume it may */
5675
5676         /* "tty0" means the active VC, so it may be the same sometimes */
5677         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5678 }
5679
5680 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5681         assert(ec);
5682
5683         return ec->tty_reset ||
5684                 ec->tty_vhangup ||
5685                 ec->tty_vt_disallocate ||
5686                 is_terminal_input(ec->std_input) ||
5687                 is_terminal_output(ec->std_output) ||
5688                 is_terminal_output(ec->std_error);
5689 }
5690
5691 bool exec_context_may_touch_console(const ExecContext *ec) {
5692
5693         return exec_context_may_touch_tty(ec) &&
5694                tty_may_match_dev_console(exec_context_tty_path(ec));
5695 }
5696
5697 static void strv_fprintf(FILE *f, char **l) {
5698         assert(f);
5699
5700         STRV_FOREACH(g, l)
5701                 fprintf(f, " %s", *g);
5702 }
5703
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing at all is written if
 * the string vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5715
5716 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5717         int r;
5718
5719         assert(c);
5720         assert(f);
5721
5722         prefix = strempty(prefix);
5723
5724         fprintf(f,
5725                 "%sUMask: %04o\n"
5726                 "%sWorkingDirectory: %s\n"
5727                 "%sRootDirectory: %s\n"
5728                 "%sNonBlocking: %s\n"
5729                 "%sPrivateTmp: %s\n"
5730                 "%sPrivateDevices: %s\n"
5731                 "%sProtectKernelTunables: %s\n"
5732                 "%sProtectKernelModules: %s\n"
5733                 "%sProtectKernelLogs: %s\n"
5734                 "%sProtectClock: %s\n"
5735                 "%sProtectControlGroups: %s\n"
5736                 "%sPrivateNetwork: %s\n"
5737                 "%sPrivateUsers: %s\n"
5738                 "%sProtectHome: %s\n"
5739                 "%sProtectSystem: %s\n"
5740                 "%sMountAPIVFS: %s\n"
5741                 "%sIgnoreSIGPIPE: %s\n"
5742                 "%sMemoryDenyWriteExecute: %s\n"
5743                 "%sRestrictRealtime: %s\n"
5744                 "%sRestrictSUIDSGID: %s\n"
5745                 "%sKeyringMode: %s\n"
5746                 "%sProtectHostname: %s\n"
5747                 "%sProtectProc: %s\n"
5748                 "%sProcSubset: %s\n",
5749                 prefix, c->umask,
5750                 prefix, empty_to_root(c->working_directory),
5751                 prefix, empty_to_root(c->root_directory),
5752                 prefix, yes_no(c->non_blocking),
5753                 prefix, yes_no(c->private_tmp),
5754                 prefix, yes_no(c->private_devices),
5755                 prefix, yes_no(c->protect_kernel_tunables),
5756                 prefix, yes_no(c->protect_kernel_modules),
5757                 prefix, yes_no(c->protect_kernel_logs),
5758                 prefix, yes_no(c->protect_clock),
5759                 prefix, yes_no(c->protect_control_groups),
5760                 prefix, yes_no(c->private_network),
5761                 prefix, yes_no(c->private_users),
5762                 prefix, protect_home_to_string(c->protect_home),
5763                 prefix, protect_system_to_string(c->protect_system),
5764                 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5765                 prefix, yes_no(c->ignore_sigpipe),
5766                 prefix, yes_no(c->memory_deny_write_execute),
5767                 prefix, yes_no(c->restrict_realtime),
5768                 prefix, yes_no(c->restrict_suid_sgid),
5769                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5770                 prefix, yes_no(c->protect_hostname),
5771                 prefix, protect_proc_to_string(c->protect_proc),
5772                 prefix, proc_subset_to_string(c->proc_subset));
5773
5774         if (c->root_image)
5775                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5776
5777         if (c->root_image_options) {
5778                 fprintf(f, "%sRootImageOptions:", prefix);
5779                 LIST_FOREACH(mount_options, o, c->root_image_options)
5780                         if (!isempty(o->options))
5781                                 fprintf(f, " %s:%s",
5782                                         partition_designator_to_string(o->partition_designator),
5783                                         o->options);
5784                 fprintf(f, "\n");
5785         }
5786
5787         if (c->root_hash) {
5788                 _cleanup_free_ char *encoded = NULL;
5789                 encoded = hexmem(c->root_hash, c->root_hash_size);
5790                 if (encoded)
5791                         fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5792         }
5793
5794         if (c->root_hash_path)
5795                 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5796
5797         if (c->root_hash_sig) {
5798                 _cleanup_free_ char *encoded = NULL;
5799                 ssize_t len;
5800                 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5801                 if (len)
5802                         fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5803         }
5804
5805         if (c->root_hash_sig_path)
5806                 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5807
5808         if (c->root_verity)
5809                 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5810
5811         STRV_FOREACH(e, c->environment)
5812                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5813
5814         STRV_FOREACH(e, c->environment_files)
5815                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5816
5817         STRV_FOREACH(e, c->pass_environment)
5818                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5819
5820         STRV_FOREACH(e, c->unset_environment)
5821                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5822
5823         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5824
5825         for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5826                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5827
5828                 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5829                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5830
5831                         STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5832                                 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5833                 }
5834         }
5835
5836         fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5837
5838         if (c->nice_set)
5839                 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5840
5841         if (c->oom_score_adjust_set)
5842                 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5843
5844         if (c->coredump_filter_set)
5845                 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5846
5847         for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5848                 if (c->rlimit[i]) {
5849                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5850                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5851                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5852                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5853                 }
5854
5855         if (c->ioprio_set) {
5856                 _cleanup_free_ char *class_str = NULL;
5857
5858                 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5859                 if (r >= 0)
5860                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5861
5862                 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5863         }
5864
5865         if (c->cpu_sched_set) {
5866                 _cleanup_free_ char *policy_str = NULL;
5867
5868                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5869                 if (r >= 0)
5870                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5871
5872                 fprintf(f,
5873                         "%sCPUSchedulingPriority: %i\n"
5874                         "%sCPUSchedulingResetOnFork: %s\n",
5875                         prefix, c->cpu_sched_priority,
5876                         prefix, yes_no(c->cpu_sched_reset_on_fork));
5877         }
5878
5879         if (c->cpu_set.set) {
5880                 _cleanup_free_ char *affinity = NULL;
5881
5882                 affinity = cpu_set_to_range_string(&c->cpu_set);
5883                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5884         }
5885
5886         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5887                 _cleanup_free_ char *nodes = NULL;
5888
5889                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5890                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5891                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5892         }
5893
5894         if (c->timer_slack_nsec != NSEC_INFINITY)
5895                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5896
5897         fprintf(f,
5898                 "%sStandardInput: %s\n"
5899                 "%sStandardOutput: %s\n"
5900                 "%sStandardError: %s\n",
5901                 prefix, exec_input_to_string(c->std_input),
5902                 prefix, exec_output_to_string(c->std_output),
5903                 prefix, exec_output_to_string(c->std_error));
5904
5905         if (c->std_input == EXEC_INPUT_NAMED_FD)
5906                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5907         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5908                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5909         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5910                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5911
5912         if (c->std_input == EXEC_INPUT_FILE)
5913                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5914         if (c->std_output == EXEC_OUTPUT_FILE)
5915                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5916         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5917                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5918         if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5919                 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5920         if (c->std_error == EXEC_OUTPUT_FILE)
5921                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5922         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5923                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5924         if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5925                 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5926
5927         if (c->tty_path)
5928                 fprintf(f,
5929                         "%sTTYPath: %s\n"
5930                         "%sTTYReset: %s\n"
5931                         "%sTTYVHangup: %s\n"
5932                         "%sTTYVTDisallocate: %s\n"
5933                         "%sTTYRows: %u\n"
5934                         "%sTTYColumns: %u\n",
5935                         prefix, c->tty_path,
5936                         prefix, yes_no(c->tty_reset),
5937                         prefix, yes_no(c->tty_vhangup),
5938                         prefix, yes_no(c->tty_vt_disallocate),
5939                         prefix, c->tty_rows,
5940                         prefix, c->tty_cols);
5941
5942         if (IN_SET(c->std_output,
5943                    EXEC_OUTPUT_KMSG,
5944                    EXEC_OUTPUT_JOURNAL,
5945                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5946                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5947             IN_SET(c->std_error,
5948                    EXEC_OUTPUT_KMSG,
5949                    EXEC_OUTPUT_JOURNAL,
5950                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
5951                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5952
5953                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5954
5955                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5956                 if (r >= 0)
5957                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5958
5959                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5960                 if (r >= 0)
5961                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5962         }
5963
5964         if (c->log_level_max >= 0) {
5965                 _cleanup_free_ char *t = NULL;
5966
5967                 (void) log_level_to_string_alloc(c->log_level_max, &t);
5968
5969                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5970         }
5971
5972         if (c->log_ratelimit_interval_usec > 0)
5973                 fprintf(f,
5974                         "%sLogRateLimitIntervalSec: %s\n",
5975                         prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5976
5977         if (c->log_ratelimit_burst > 0)
5978                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5979
5980         for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5981                 fprintf(f, "%sLogExtraFields: ", prefix);
5982                 fwrite(c->log_extra_fields[j].iov_base,
5983                        1, c->log_extra_fields[j].iov_len,
5984                        f);
5985                 fputc('\n', f);
5986         }
5987
5988         if (c->log_namespace)
5989                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5990
5991         if (c->secure_bits) {
5992                 _cleanup_free_ char *str = NULL;
5993
5994                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5995                 if (r >= 0)
5996                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5997         }
5998
5999         if (c->capability_bounding_set != CAP_ALL) {
6000                 _cleanup_free_ char *str = NULL;
6001
6002                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6003                 if (r >= 0)
6004                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6005         }
6006
6007         if (c->capability_ambient_set != 0) {
6008                 _cleanup_free_ char *str = NULL;
6009
6010                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6011                 if (r >= 0)
6012                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6013         }
6014
6015         if (c->user)
6016                 fprintf(f, "%sUser: %s\n", prefix, c->user);
6017         if (c->group)
6018                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6019
6020         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6021
6022         strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6023
6024         if (c->pam_name)
6025                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6026
6027         strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6028         strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6029         strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6030         strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6031         strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6032         strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6033
6034         for (size_t i = 0; i < c->n_bind_mounts; i++)
6035                 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6036                         c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6037                         c->bind_mounts[i].ignore_enoent ? "-": "",
6038                         c->bind_mounts[i].source,
6039                         c->bind_mounts[i].destination,
6040                         c->bind_mounts[i].recursive ? "rbind" : "norbind");
6041
6042         for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6043                 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6044
6045                 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6046                         t->path,
6047                         isempty(t->options) ? "" : ":",
6048                         strempty(t->options));
6049         }
6050
6051         if (c->utmp_id)
6052                 fprintf(f,
6053                         "%sUtmpIdentifier: %s\n",
6054                         prefix, c->utmp_id);
6055
6056         if (c->selinux_context)
6057                 fprintf(f,
6058                         "%sSELinuxContext: %s%s\n",
6059                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6060
6061         if (c->apparmor_profile)
6062                 fprintf(f,
6063                         "%sAppArmorProfile: %s%s\n",
6064                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6065
6066         if (c->smack_process_label)
6067                 fprintf(f,
6068                         "%sSmackProcessLabel: %s%s\n",
6069                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6070
6071         if (c->personality != PERSONALITY_INVALID)
6072                 fprintf(f,
6073                         "%sPersonality: %s\n",
6074                         prefix, strna(personality_to_string(c->personality)));
6075
6076         fprintf(f,
6077                 "%sLockPersonality: %s\n",
6078                 prefix, yes_no(c->lock_personality));
6079
6080         if (c->syscall_filter) {
6081                 fprintf(f,
6082                         "%sSystemCallFilter: ",
6083                         prefix);
6084
6085                 if (!c->syscall_allow_list)
6086                         fputc('~', f);
6087
6088 #if HAVE_SECCOMP
6089                 void *id, *val;
6090                 bool first = true;
6091                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6092                         _cleanup_free_ char *name = NULL;
6093                         const char *errno_name = NULL;
6094                         int num = PTR_TO_INT(val);
6095
6096                         if (first)
6097                                 first = false;
6098                         else
6099                                 fputc(' ', f);
6100
6101                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6102                         fputs(strna(name), f);
6103
6104                         if (num >= 0) {
6105                                 errno_name = seccomp_errno_or_action_to_string(num);
6106                                 if (errno_name)
6107                                         fprintf(f, ":%s", errno_name);
6108                                 else
6109                                         fprintf(f, ":%d", num);
6110                         }
6111                 }
6112 #endif
6113
6114                 fputc('\n', f);
6115         }
6116
6117         if (c->syscall_archs) {
6118                 fprintf(f,
6119                         "%sSystemCallArchitectures:",
6120                         prefix);
6121
6122 #if HAVE_SECCOMP
6123                 void *id;
6124                 SET_FOREACH(id, c->syscall_archs)
6125                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6126 #endif
6127                 fputc('\n', f);
6128         }
6129
6130         if (exec_context_restrict_namespaces_set(c)) {
6131                 _cleanup_free_ char *s = NULL;
6132
6133                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6134                 if (r >= 0)
6135                         fprintf(f, "%sRestrictNamespaces: %s\n",
6136                                 prefix, strna(s));
6137         }
6138
6139 #if HAVE_LIBBPF
6140         if (exec_context_restrict_filesystems_set(c)) {
6141                 char *fs;
6142                 SET_FOREACH(fs, c->restrict_filesystems)
6143                         fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6144         }
6145 #endif
6146
6147         if (c->network_namespace_path)
6148                 fprintf(f,
6149                         "%sNetworkNamespacePath: %s\n",
6150                         prefix, c->network_namespace_path);
6151
6152         if (c->syscall_errno > 0) {
6153                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6154
6155 #if HAVE_SECCOMP
6156                 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6157                 if (errno_name)
6158                         fputs(errno_name, f);
6159                 else
6160                         fprintf(f, "%d", c->syscall_errno);
6161 #endif
6162                 fputc('\n', f);
6163         }
6164
6165         for (size_t i = 0; i < c->n_mount_images; i++) {
6166                 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6167                         c->mount_images[i].ignore_enoent ? "-": "",
6168                         c->mount_images[i].source,
6169                         c->mount_images[i].destination);
6170                 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6171                         fprintf(f, ":%s:%s",
6172                                 partition_designator_to_string(o->partition_designator),
6173                                 strempty(o->options));
6174                 fprintf(f, "\n");
6175         }
6176
6177         for (size_t i = 0; i < c->n_extension_images; i++) {
6178                 fprintf(f, "%sExtensionImages: %s%s", prefix,
6179                         c->extension_images[i].ignore_enoent ? "-": "",
6180                         c->extension_images[i].source);
6181                 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6182                         fprintf(f, ":%s:%s",
6183                                 partition_designator_to_string(o->partition_designator),
6184                                 strempty(o->options));
6185                 fprintf(f, "\n");
6186         }
6187
6188         strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6189 }
6190
6191 bool exec_context_maintains_privileges(const ExecContext *c) {
6192         assert(c);
6193
6194         /* Returns true if the process forked off would run under
6195          * an unchanged UID or as root. */
6196
6197         if (!c->user)
6198                 return true;
6199
6200         if (streq(c->user, "root") || streq(c->user, "0"))
6201                 return true;
6202
6203         return false;
6204 }
6205
6206 int exec_context_get_effective_ioprio(const ExecContext *c) {
6207         int p;
6208
6209         assert(c);
6210
6211         if (c->ioprio_set)
6212                 return c->ioprio;
6213
6214         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6215         if (p < 0)
6216                 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6217
6218         return ioprio_normalize(p);
6219 }
6220
6221 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6222         assert(c);
6223
6224         /* Explicit setting wins */
6225         if (c->mount_apivfs_set)
6226                 return c->mount_apivfs;
6227
6228         /* Default to "yes" if root directory or image are specified */
6229         if (exec_context_with_rootfs(c))
6230                 return true;
6231
6232         return false;
6233 }
6234
6235 void exec_context_free_log_extra_fields(ExecContext *c) {
6236         assert(c);
6237
6238         for (size_t l = 0; l < c->n_log_extra_fields; l++)
6239                 free(c->log_extra_fields[l].iov_base);
6240         c->log_extra_fields = mfree(c->log_extra_fields);
6241         c->n_log_extra_fields = 0;
6242 }
6243
6244 void exec_context_revert_tty(ExecContext *c) {
6245         _cleanup_close_ int fd = -1;
6246         const char *path;
6247         struct stat st;
6248         int r;
6249
6250         assert(c);
6251
6252         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6253         exec_context_tty_reset(c, NULL);
6254
6255         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6256          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6257          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6258         if (!exec_context_may_touch_tty(c))
6259                 return;
6260
6261         path = exec_context_tty_path(c);
6262         if (!path)
6263                 return;
6264
6265         fd = open(path, O_PATH|O_CLOEXEC);
6266         if (fd < 0)
6267                 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6268                                              "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6269                                              path);
6270
6271         if (fstat(fd, &st) < 0)
6272                 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6273
6274         /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6275          * if things are a character device, since a proper check either means we'd have to open the TTY and
6276          * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6277          * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6278          * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6279         if (!S_ISCHR(st.st_mode))
6280                 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6281
6282         r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6283         if (r < 0)
6284                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6285 }
6286
6287 int exec_context_get_clean_directories(
6288                 ExecContext *c,
6289                 char **prefix,
6290                 ExecCleanMask mask,
6291                 char ***ret) {
6292
6293         _cleanup_strv_free_ char **l = NULL;
6294         int r;
6295
6296         assert(c);
6297         assert(prefix);
6298         assert(ret);
6299
6300         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6301                 if (!FLAGS_SET(mask, 1U << t))
6302                         continue;
6303
6304                 if (!prefix[t])
6305                         continue;
6306
6307                 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6308                         char *j;
6309
6310                         j = path_join(prefix[t], c->directories[t].items[i].path);
6311                         if (!j)
6312                                 return -ENOMEM;
6313
6314                         r = strv_consume(&l, j);
6315                         if (r < 0)
6316                                 return r;
6317
6318                         /* Also remove private directories unconditionally. */
6319                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
6320                                 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6321                                 if (!j)
6322                                         return -ENOMEM;
6323
6324                                 r = strv_consume(&l, j);
6325                                 if (r < 0)
6326                                         return r;
6327                         }
6328
6329                         STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6330                                 j = path_join(prefix[t], *symlink);
6331                                 if (!j)
6332                                         return -ENOMEM;
6333
6334                                 r = strv_consume(&l, j);
6335                                 if (r < 0)
6336                                         return r;
6337                         }
6338                 }
6339         }
6340
6341         *ret = TAKE_PTR(l);
6342         return 0;
6343 }
6344
6345 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6346         ExecCleanMask mask = 0;
6347
6348         assert(c);
6349         assert(ret);
6350
6351         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6352                 if (c->directories[t].n_items > 0)
6353                         mask |= 1U << t;
6354
6355         *ret = mask;
6356         return 0;
6357 }
6358
6359 void exec_status_start(ExecStatus *s, pid_t pid) {
6360         assert(s);
6361
6362         *s = (ExecStatus) {
6363                 .pid = pid,
6364         };
6365
6366         dual_timestamp_get(&s->start_timestamp);
6367 }
6368
6369 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6370         assert(s);
6371
6372         if (s->pid != pid)
6373                 *s = (ExecStatus) {
6374                         .pid = pid,
6375                 };
6376
6377         dual_timestamp_get(&s->exit_timestamp);
6378
6379         s->code = code;
6380         s->status = status;
6381
6382         if (context && context->utmp_id)
6383                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6384 }
6385
6386 void exec_status_reset(ExecStatus *s) {
6387         assert(s);
6388
6389         *s = (ExecStatus) {};
6390 }
6391
6392 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6393         assert(s);
6394         assert(f);
6395
6396         if (s->pid <= 0)
6397                 return;
6398
6399         prefix = strempty(prefix);
6400
6401         fprintf(f,
6402                 "%sPID: "PID_FMT"\n",
6403                 prefix, s->pid);
6404
6405         if (dual_timestamp_is_set(&s->start_timestamp))
6406                 fprintf(f,
6407                         "%sStart Timestamp: %s\n",
6408                         prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6409
6410         if (dual_timestamp_is_set(&s->exit_timestamp))
6411                 fprintf(f,
6412                         "%sExit Timestamp: %s\n"
6413                         "%sExit Code: %s\n"
6414                         "%sExit Status: %i\n",
6415                         prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6416                         prefix, sigchld_code_to_string(s->code),
6417                         prefix, s->status);
6418 }
6419
6420 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6421         _cleanup_free_ char *cmd = NULL;
6422         const char *prefix2;
6423
6424         assert(c);
6425         assert(f);
6426
6427         prefix = strempty(prefix);
6428         prefix2 = strjoina(prefix, "\t");
6429
6430         cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6431
6432         fprintf(f,
6433                 "%sCommand Line: %s\n",
6434                 prefix, strnull(cmd));
6435
6436         exec_status_dump(&c->exec_status, f, prefix2);
6437 }
6438
6439 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6440         assert(f);
6441
6442         prefix = strempty(prefix);
6443
6444         LIST_FOREACH(command, i, c)
6445                 exec_command_dump(i, f, prefix);
6446 }
6447
6448 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6449         ExecCommand *end;
6450
6451         assert(l);
6452         assert(e);
6453
6454         if (*l) {
6455                 /* It's kind of important, that we keep the order here */
6456                 LIST_FIND_TAIL(command, *l, end);
6457                 LIST_INSERT_AFTER(command, *l, end, e);
6458         } else
6459               *l = e;
6460 }
6461
6462 int exec_command_set(ExecCommand *c, const char *path, ...) {
6463         va_list ap;
6464         char **l, *p;
6465
6466         assert(c);
6467         assert(path);
6468
6469         va_start(ap, path);
6470         l = strv_new_ap(path, ap);
6471         va_end(ap);
6472
6473         if (!l)
6474                 return -ENOMEM;
6475
6476         p = strdup(path);
6477         if (!p) {
6478                 strv_free(l);
6479                 return -ENOMEM;
6480         }
6481
6482         free_and_replace(c->path, p);
6483
6484         return strv_free_and_replace(c->argv, l);
6485 }
6486
6487 int exec_command_append(ExecCommand *c, const char *path, ...) {
6488         _cleanup_strv_free_ char **l = NULL;
6489         va_list ap;
6490         int r;
6491
6492         assert(c);
6493         assert(path);
6494
6495         va_start(ap, path);
6496         l = strv_new_ap(path, ap);
6497         va_end(ap);
6498
6499         if (!l)
6500                 return -ENOMEM;
6501
6502         r = strv_extend_strv(&c->argv, l, false);
6503         if (r < 0)
6504                 return r;
6505
6506         return 0;
6507 }
6508
6509 static void *remove_tmpdir_thread(void *p) {
6510         _cleanup_free_ char *path = p;
6511
6512         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6513         return NULL;
6514 }
6515
6516 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6517         int r;
6518
6519         if (!rt)
6520                 return NULL;
6521
6522         if (rt->manager)
6523                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6524
6525         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6526
6527         if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6528                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6529
6530                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6531                 if (r < 0)
6532                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6533                 else
6534                         rt->tmp_dir = NULL;
6535         }
6536
6537         if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6538                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6539
6540                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6541                 if (r < 0)
6542                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6543                 else
6544                         rt->var_tmp_dir = NULL;
6545         }
6546
6547         rt->id = mfree(rt->id);
6548         rt->tmp_dir = mfree(rt->tmp_dir);
6549         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6550         safe_close_pair(rt->netns_storage_socket);
6551         safe_close_pair(rt->ipcns_storage_socket);
6552         return mfree(rt);
6553 }
6554
6555 static void exec_runtime_freep(ExecRuntime **rt) {
6556         (void) exec_runtime_free(*rt, false);
6557 }
6558
6559 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6560         _cleanup_free_ char *id_copy = NULL;
6561         ExecRuntime *n;
6562
6563         assert(ret);
6564
6565         id_copy = strdup(id);
6566         if (!id_copy)
6567                 return -ENOMEM;
6568
6569         n = new(ExecRuntime, 1);
6570         if (!n)
6571                 return -ENOMEM;
6572
6573         *n = (ExecRuntime) {
6574                 .id = TAKE_PTR(id_copy),
6575                 .netns_storage_socket = { -1, -1 },
6576                 .ipcns_storage_socket = { -1, -1 },
6577         };
6578
6579         *ret = n;
6580         return 0;
6581 }
6582
6583 static int exec_runtime_add(
6584                 Manager *m,
6585                 const char *id,
6586                 char **tmp_dir,
6587                 char **var_tmp_dir,
6588                 int netns_storage_socket[2],
6589                 int ipcns_storage_socket[2],
6590                 ExecRuntime **ret) {
6591
6592         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6593         int r;
6594
6595         assert(m);
6596         assert(id);
6597
6598         /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6599
6600         r = exec_runtime_allocate(&rt, id);
6601         if (r < 0)
6602                 return r;
6603
6604         r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6605         if (r < 0)
6606                 return r;
6607
6608         assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6609         rt->tmp_dir = TAKE_PTR(*tmp_dir);
6610         rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6611
6612         if (netns_storage_socket) {
6613                 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6614                 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6615         }
6616
6617         if (ipcns_storage_socket) {
6618                 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6619                 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6620         }
6621
6622         rt->manager = m;
6623
6624         if (ret)
6625                 *ret = rt;
6626         /* do not remove created ExecRuntime object when the operation succeeds. */
6627         TAKE_PTR(rt);
6628         return 0;
6629 }
6630
6631 static int exec_runtime_make(
6632                 Manager *m,
6633                 const ExecContext *c,
6634                 const char *id,
6635                 ExecRuntime **ret) {
6636
6637         _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6638         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6639         int r;
6640
6641         assert(m);
6642         assert(c);
6643         assert(id);
6644
6645         /* It is not necessary to create ExecRuntime object. */
6646         if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6647                 *ret = NULL;
6648                 return 0;
6649         }
6650
6651         if (c->private_tmp &&
6652             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6653               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6654                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6655                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6656                 if (r < 0)
6657                         return r;
6658         }
6659
6660         if (c->private_network || c->network_namespace_path) {
6661                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6662                         return -errno;
6663         }
6664
6665         if (c->private_ipc || c->ipc_namespace_path) {
6666                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6667                         return -errno;
6668         }
6669
6670         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6671         if (r < 0)
6672                 return r;
6673
6674         return 1;
6675 }
6676
6677 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6678         ExecRuntime *rt;
6679         int r;
6680
6681         assert(m);
6682         assert(id);
6683         assert(ret);
6684
6685         rt = hashmap_get(m->exec_runtime_by_id, id);
6686         if (rt)
6687                 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6688                 goto ref;
6689
6690         if (!create) {
6691                 *ret = NULL;
6692                 return 0;
6693         }
6694
6695         /* If not found, then create a new object. */
6696         r = exec_runtime_make(m, c, id, &rt);
6697         if (r < 0)
6698                 return r;
6699         if (r == 0) {
6700                 /* When r == 0, it is not necessary to create ExecRuntime object. */
6701                 *ret = NULL;
6702                 return 0;
6703         }
6704
6705 ref:
6706         /* increment reference counter. */
6707         rt->n_ref++;
6708         *ret = rt;
6709         return 1;
6710 }
6711
6712 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6713         if (!rt)
6714                 return NULL;
6715
6716         assert(rt->n_ref > 0);
6717
6718         rt->n_ref--;
6719         if (rt->n_ref > 0)
6720                 return NULL;
6721
6722         return exec_runtime_free(rt, destroy);
6723 }
6724
6725 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6726         ExecRuntime *rt;
6727
6728         assert(m);
6729         assert(f);
6730         assert(fds);
6731
6732         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6733                 fprintf(f, "exec-runtime=%s", rt->id);
6734
6735                 if (rt->tmp_dir)
6736                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6737
6738                 if (rt->var_tmp_dir)
6739                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6740
6741                 if (rt->netns_storage_socket[0] >= 0) {
6742                         int copy;
6743
6744                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6745                         if (copy < 0)
6746                                 return copy;
6747
6748                         fprintf(f, " netns-socket-0=%i", copy);
6749                 }
6750
6751                 if (rt->netns_storage_socket[1] >= 0) {
6752                         int copy;
6753
6754                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6755                         if (copy < 0)
6756                                 return copy;
6757
6758                         fprintf(f, " netns-socket-1=%i", copy);
6759                 }
6760
6761                 if (rt->ipcns_storage_socket[0] >= 0) {
6762                         int copy;
6763
6764                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6765                         if (copy < 0)
6766                                 return copy;
6767
6768                         fprintf(f, " ipcns-socket-0=%i", copy);
6769                 }
6770
6771                 if (rt->ipcns_storage_socket[1] >= 0) {
6772                         int copy;
6773
6774                         copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6775                         if (copy < 0)
6776                                 return copy;
6777
6778                         fprintf(f, " ipcns-socket-1=%i", copy);
6779                 }
6780
6781                 fputc('\n', f);
6782         }
6783
6784         return 0;
6785 }
6786
6787 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6788         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6789         ExecRuntime *rt;
6790         int r;
6791
6792         /* This is for the migration from old (v237 or earlier) deserialization text.
6793          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6794          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6795          * so or not from the serialized text, then we always creates a new object owned by this. */
6796
6797         assert(u);
6798         assert(key);
6799         assert(value);
6800
6801         /* Manager manages ExecRuntime objects by the unit id.
6802          * So, we omit the serialized text when the unit does not have id (yet?)... */
6803         if (isempty(u->id)) {
6804                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6805                 return 0;
6806         }
6807
6808         if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6809                 return log_oom();
6810
6811         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6812         if (!rt) {
6813                 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6814                         return log_oom();
6815
6816                 rt = rt_create;
6817         }
6818
6819         if (streq(key, "tmp-dir")) {
6820                 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6821                         return -ENOMEM;
6822
6823         } else if (streq(key, "var-tmp-dir")) {
6824                 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6825                         return -ENOMEM;
6826
6827         } else if (streq(key, "netns-socket-0")) {
6828                 int fd;
6829
6830                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6831                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6832                         return 0;
6833                 }
6834
6835                 safe_close(rt->netns_storage_socket[0]);
6836                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6837
6838         } else if (streq(key, "netns-socket-1")) {
6839                 int fd;
6840
6841                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6842                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6843                         return 0;
6844                 }
6845
6846                 safe_close(rt->netns_storage_socket[1]);
6847                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6848
6849         } else
6850                 return 0;
6851
6852         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6853         if (rt_create) {
6854                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6855                 if (r < 0) {
6856                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6857                         return 0;
6858                 }
6859
6860                 rt_create->manager = u->manager;
6861
6862                 /* Avoid cleanup */
6863                 TAKE_PTR(rt_create);
6864         }
6865
6866         return 1;
6867 }
6868
6869 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6870         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6871         char *id = NULL;
6872         int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6873         const char *p, *v = ASSERT_PTR(value);
6874         size_t n;
6875
6876         assert(m);
6877         assert(fds);
6878
6879         n = strcspn(v, " ");
6880         id = strndupa_safe(v, n);
6881         if (v[n] != ' ')
6882                 goto finalize;
6883         p = v + n + 1;
6884
6885         v = startswith(p, "tmp-dir=");
6886         if (v) {
6887                 n = strcspn(v, " ");
6888                 tmp_dir = strndup(v, n);
6889                 if (!tmp_dir)
6890                         return log_oom();
6891                 if (v[n] != ' ')
6892                         goto finalize;
6893                 p = v + n + 1;
6894         }
6895
6896         v = startswith(p, "var-tmp-dir=");
6897         if (v) {
6898                 n = strcspn(v, " ");
6899                 var_tmp_dir = strndup(v, n);
6900                 if (!var_tmp_dir)
6901                         return log_oom();
6902                 if (v[n] != ' ')
6903                         goto finalize;
6904                 p = v + n + 1;
6905         }
6906
6907         v = startswith(p, "netns-socket-0=");
6908         if (v) {
6909                 char *buf;
6910
6911                 n = strcspn(v, " ");
6912                 buf = strndupa_safe(v, n);
6913
6914                 r = safe_atoi(buf, &netns_fdpair[0]);
6915                 if (r < 0)
6916                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6917                 if (!fdset_contains(fds, netns_fdpair[0]))
6918                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6919                                                "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6920                 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6921                 if (v[n] != ' ')
6922                         goto finalize;
6923                 p = v + n + 1;
6924         }
6925
6926         v = startswith(p, "netns-socket-1=");
6927         if (v) {
6928                 char *buf;
6929
6930                 n = strcspn(v, " ");
6931                 buf = strndupa_safe(v, n);
6932
6933                 r = safe_atoi(buf, &netns_fdpair[1]);
6934                 if (r < 0)
6935                         return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6936                 if (!fdset_contains(fds, netns_fdpair[1]))
6937                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6938                                                "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6939                 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6940                 if (v[n] != ' ')
6941                         goto finalize;
6942                 p = v + n + 1;
6943         }
6944
6945         v = startswith(p, "ipcns-socket-0=");
6946         if (v) {
6947                 char *buf;
6948
6949                 n = strcspn(v, " ");
6950                 buf = strndupa_safe(v, n);
6951
6952                 r = safe_atoi(buf, &ipcns_fdpair[0]);
6953                 if (r < 0)
6954                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6955                 if (!fdset_contains(fds, ipcns_fdpair[0]))
6956                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6957                                                "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6958                 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6959                 if (v[n] != ' ')
6960                         goto finalize;
6961                 p = v + n + 1;
6962         }
6963
6964         v = startswith(p, "ipcns-socket-1=");
6965         if (v) {
6966                 char *buf;
6967
6968                 n = strcspn(v, " ");
6969                 buf = strndupa_safe(v, n);
6970
6971                 r = safe_atoi(buf, &ipcns_fdpair[1]);
6972                 if (r < 0)
6973                         return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6974                 if (!fdset_contains(fds, ipcns_fdpair[1]))
6975                         return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6976                                                "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6977                 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6978         }
6979
6980 finalize:
6981         r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6982         if (r < 0)
6983                 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6984         return 0;
6985 }
6986
6987 void exec_runtime_vacuum(Manager *m) {
6988         ExecRuntime *rt;
6989
6990         assert(m);
6991
6992         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6993
6994         HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6995                 if (rt->n_ref > 0)
6996                         continue;
6997
6998                 (void) exec_runtime_free(rt, false);
6999         }
7000 }
7001
7002 void exec_params_clear(ExecParameters *p) {
7003         if (!p)
7004                 return;
7005
7006         p->environment = strv_free(p->environment);
7007         p->fd_names = strv_free(p->fd_names);
7008         p->fds = mfree(p->fds);
7009         p->exec_fd = safe_close(p->exec_fd);
7010 }
7011
7012 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7013         if (!sc)
7014                 return NULL;
7015
7016         free(sc->id);
7017         free(sc->data);
7018         return mfree(sc);
7019 }
7020
7021 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7022         if (!lc)
7023                 return NULL;
7024
7025         free(lc->id);
7026         free(lc->path);
7027         return mfree(lc);
7028 }
7029
7030 void exec_directory_done(ExecDirectory *d) {
7031         if (!d)
7032                 return;
7033
7034         for (size_t i = 0; i < d->n_items; i++) {
7035                 free(d->items[i].path);
7036                 strv_free(d->items[i].symlinks);
7037         }
7038
7039         d->items = mfree(d->items);
7040         d->n_items = 0;
7041         d->mode = 0755;
7042 }
7043
7044 int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
7045         _cleanup_strv_free_ char **s = NULL;
7046         _cleanup_free_ char *p = NULL;
7047
7048         assert(d);
7049         assert(n);
7050         assert(path);
7051
7052         p = strdup(path);
7053         if (!p)
7054                 return -ENOMEM;
7055
7056         if (symlinks) {
7057                 s = strv_copy(symlinks);
7058                 if (!s)
7059                         return -ENOMEM;
7060         }
7061
7062         if (!GREEDY_REALLOC(*d, *n + 1))
7063                 return -ENOMEM;
7064
7065         (*d)[(*n) ++] = (ExecDirectoryItem) {
7066                 .path = TAKE_PTR(p),
7067                 .symlinks = TAKE_PTR(s),
7068         };
7069
7070         return 0;
7071 }
7072
7073 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7074 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7075
7076 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7077         [EXEC_INPUT_NULL] = "null",
7078         [EXEC_INPUT_TTY] = "tty",
7079         [EXEC_INPUT_TTY_FORCE] = "tty-force",
7080         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7081         [EXEC_INPUT_SOCKET] = "socket",
7082         [EXEC_INPUT_NAMED_FD] = "fd",
7083         [EXEC_INPUT_DATA] = "data",
7084         [EXEC_INPUT_FILE] = "file",
7085 };
7086
7087 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7088
7089 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7090         [EXEC_OUTPUT_INHERIT] = "inherit",
7091         [EXEC_OUTPUT_NULL] = "null",
7092         [EXEC_OUTPUT_TTY] = "tty",
7093         [EXEC_OUTPUT_KMSG] = "kmsg",
7094         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7095         [EXEC_OUTPUT_JOURNAL] = "journal",
7096         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7097         [EXEC_OUTPUT_SOCKET] = "socket",
7098         [EXEC_OUTPUT_NAMED_FD] = "fd",
7099         [EXEC_OUTPUT_FILE] = "file",
7100         [EXEC_OUTPUT_FILE_APPEND] = "append",
7101         [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7102 };
7103
7104 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7105
7106 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7107         [EXEC_UTMP_INIT] = "init",
7108         [EXEC_UTMP_LOGIN] = "login",
7109         [EXEC_UTMP_USER] = "user",
7110 };
7111
7112 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7113
7114 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7115         [EXEC_PRESERVE_NO] = "no",
7116         [EXEC_PRESERVE_YES] = "yes",
7117         [EXEC_PRESERVE_RESTART] = "restart",
7118 };
7119
7120 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7121
7122 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7123 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7124         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7125         [EXEC_DIRECTORY_STATE] = "StateDirectory",
7126         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7127         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7128         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7129 };
7130
7131 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7132
7133 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7134 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7135         [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
7136         [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
7137         [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
7138         [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
7139         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7140 };
7141
7142 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7143
7144 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7145  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7146  * directories, specifically .timer units with their timestamp touch file. */
7147 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7148         [EXEC_DIRECTORY_RUNTIME] = "runtime",
7149         [EXEC_DIRECTORY_STATE] = "state",
7150         [EXEC_DIRECTORY_CACHE] = "cache",
7151         [EXEC_DIRECTORY_LOGS] = "logs",
7152         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7153 };
7154
7155 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7156
7157 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7158  * the service payload in. */
7159 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7160         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7161         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7162         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7163         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7164         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7165 };
7166
7167 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7168
7169 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7170         [EXEC_KEYRING_INHERIT] = "inherit",
7171         [EXEC_KEYRING_PRIVATE] = "private",
7172         [EXEC_KEYRING_SHARED] = "shared",
7173 };
7174
7175 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);