src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cgroup-setup.h"
  52 #include "cpu-set-util.h"
  53 #include "def.h"
  54 #include "env-file.h"
  55 #include "env-util.h"
  56 #include "errno-list.h"
  57 #include "execute.h"
  58 #include "exit-status.h"
  59 #include "fd-util.h"
  60 #include "format-util.h"
  61 #include "fs-util.h"
  62 #include "glob-util.h"
  63 #include "io-util.h"
  64 #include "ioprio.h"
  65 #include "label.h"
  66 #include "log.h"
  67 #include "macro.h"
  68 #include "manager.h"
  69 #include "memory-util.h"
  70 #include "missing.h"
  71 #include "mkdir.h"
  72 #include "namespace.h"
  73 #include "parse-util.h"
  74 #include "path-util.h"
  75 #include "process-util.h"
  76 #include "rlimit-util.h"
  77 #include "rm-rf.h"
  78 #if HAVE_SECCOMP
  79 #include "seccomp-util.h"
  80 #endif
  81 #include "securebits-util.h"
  82 #include "selinux-util.h"
  83 #include "signal-util.h"
  84 #include "smack-util.h"
  85 #include "socket-util.h"
  86 #include "special.h"
  87 #include "stat-util.h"
  88 #include "string-table.h"
  89 #include "string-util.h"
  90 #include "strv.h"
  91 #include "syslog-util.h"
  92 #include "terminal-util.h"
  93 #include "umask-util.h"
  94 #include "unit.h"
  95 #include "user-util.h"
  96 #include "utmp-wtmp.h"
  97
  98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 100
 101 #define SNDBUF_SIZE (8*1024*1024)
 102
 103 static int shift_fds(int fds[], size_t n_fds) {
 104         int start, restart_from;
 105
 106         if (n_fds <= 0)
 107                 return 0;
 108
 109         /* Modifies the fds array! (sorts it) */
 110
 111         assert(fds);
 112
 113         start = 0;
 114         for (;;) {
 115                 int i;
 116
 117                 restart_from = -1;
 118
 119                 for (i = start; i < (int) n_fds; i++) {
 120                         int nfd;
 121
 122                         /* Already at right index? */
 123                         if (fds[i] == i+3)
 124                                 continue;
 125
 126                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 127                         if (nfd < 0)
 128                                 return -errno;
 129
 130                         safe_close(fds[i]);
 131                         fds[i] = nfd;
 132
 133                         /* Hmm, the fd we wanted isn't free? Then
 134                          * let's remember that and try again from here */
 135                         if (nfd != i+3 && restart_from < 0)
 136                                 restart_from = i;
 137                 }
 138
 139                 if (restart_from < 0)
 140                         break;
 141
 142                 start = restart_from;
 143         }
 144
 145         return 0;
 146 }
 147
 148 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 149         size_t i, n_fds;
 150         int r;
 151
 152         n_fds = n_socket_fds + n_storage_fds;
 153         if (n_fds <= 0)
 154                 return 0;
 155
 156         assert(fds);
 157
 158         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 159          * O_NONBLOCK only applies to socket activation though. */
 160
 161         for (i = 0; i < n_fds; i++) {
 162
 163                 if (i < n_socket_fds) {
 164                         r = fd_nonblock(fds[i], nonblock);
 165                         if (r < 0)
 166                                 return r;
 167                 }
 168
 169                 /* We unconditionally drop FD_CLOEXEC from the fds,
 170                  * since after all we want to pass these fds to our
 171                  * children */
 172
 173                 r = fd_cloexec(fds[i], false);
 174                 if (r < 0)
 175                         return r;
 176         }
 177
 178         return 0;
 179 }
 180
 181 static const char *exec_context_tty_path(const ExecContext *context) {
 182         assert(context);
 183
 184         if (context->stdio_as_fds)
 185                 return NULL;
 186
 187         if (context->tty_path)
 188                 return context->tty_path;
 189
 190         return "/dev/console";
 191 }
 192
 193 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 194         const char *path;
 195
 196         assert(context);
 197
 198         path = exec_context_tty_path(context);
 199
 200         if (context->tty_vhangup) {
 201                 if (p && p->stdin_fd >= 0)
 202                         (void) terminal_vhangup_fd(p->stdin_fd);
 203                 else if (path)
 204                         (void) terminal_vhangup(path);
 205         }
 206
 207         if (context->tty_reset) {
 208                 if (p && p->stdin_fd >= 0)
 209                         (void) reset_terminal_fd(p->stdin_fd, true);
 210                 else if (path)
 211                         (void) reset_terminal(path);
 212         }
 213
 214         if (context->tty_vt_disallocate && path)
 215                 (void) vt_disallocate(path);
 216 }
 217
 218 static bool is_terminal_input(ExecInput i) {
 219         return IN_SET(i,
 220                       EXEC_INPUT_TTY,
 221                       EXEC_INPUT_TTY_FORCE,
 222                       EXEC_INPUT_TTY_FAIL);
 223 }
 224
 225 static bool is_terminal_output(ExecOutput o) {
 226         return IN_SET(o,
 227                       EXEC_OUTPUT_TTY,
 228                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 229                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 230                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 231 }
 232
 233 static bool is_syslog_output(ExecOutput o) {
 234         return IN_SET(o,
 235                       EXEC_OUTPUT_SYSLOG,
 236                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 237 }
 238
 239 static bool is_kmsg_output(ExecOutput o) {
 240         return IN_SET(o,
 241                       EXEC_OUTPUT_KMSG,
 242                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 243 }
 244
 245 static bool exec_context_needs_term(const ExecContext *c) {
 246         assert(c);
 247
 248         /* Return true if the execution context suggests we should set $TERM to something useful. */
 249
 250         if (is_terminal_input(c->std_input))
 251                 return true;
 252
 253         if (is_terminal_output(c->std_output))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_error))
 257                 return true;
 258
 259         return !!c->tty_path;
 260 }
 261
 262 static int open_null_as(int flags, int nfd) {
 263         int fd;
 264
 265         assert(nfd >= 0);
 266
 267         fd = open("/dev/null", flags|O_NOCTTY);
 268         if (fd < 0)
 269                 return -errno;
 270
 271         return move_fd(fd, nfd, false);
 272 }
 273
 274 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 275         static const union sockaddr_union sa = {
 276                 .un.sun_family = AF_UNIX,
 277                 .un.sun_path = "/run/systemd/journal/stdout",
 278         };
 279         uid_t olduid = UID_INVALID;
 280         gid_t oldgid = GID_INVALID;
 281         int r;
 282
 283         if (gid_is_valid(gid)) {
 284                 oldgid = getgid();
 285
 286                 if (setegid(gid) < 0)
 287                         return -errno;
 288         }
 289
 290         if (uid_is_valid(uid)) {
 291                 olduid = getuid();
 292
 293                 if (seteuid(uid) < 0) {
 294                         r = -errno;
 295                         goto restore_gid;
 296                 }
 297         }
 298
 299         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 300
 301         /* If we fail to restore the uid or gid, things will likely
 302            fail later on. This should only happen if an LSM interferes. */
 303
 304         if (uid_is_valid(uid))
 305                 (void) seteuid(olduid);
 306
 307  restore_gid:
 308         if (gid_is_valid(gid))
 309                 (void) setegid(oldgid);
 310
 311         return r;
 312 }
 313
 314 static int connect_logger_as(
 315                 const Unit *unit,
 316                 const ExecContext *context,
 317                 const ExecParameters *params,
 318                 ExecOutput output,
 319                 const char *ident,
 320                 int nfd,
 321                 uid_t uid,
 322                 gid_t gid) {
 323
 324         _cleanup_close_ int fd = -1;
 325         int r;
 326
 327         assert(context);
 328         assert(params);
 329         assert(output < _EXEC_OUTPUT_MAX);
 330         assert(ident);
 331         assert(nfd >= 0);
 332
 333         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 334         if (fd < 0)
 335                 return -errno;
 336
 337         r = connect_journal_socket(fd, uid, gid);
 338         if (r < 0)
 339                 return r;
 340
 341         if (shutdown(fd, SHUT_RD) < 0)
 342                 return -errno;
 343
 344         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 345
 346         if (dprintf(fd,
 347                 "%s\n"
 348                 "%s\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n",
 354                 context->syslog_identifier ?: ident,
 355                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 356                 context->syslog_priority,
 357                 !!context->syslog_level_prefix,
 358                 is_syslog_output(output),
 359                 is_kmsg_output(output),
 360                 is_terminal_output(output)) < 0)
 361                 return -errno;
 362
 363         return move_fd(TAKE_FD(fd), nfd, false);
 364 }
 365
 366 static int open_terminal_as(const char *path, int flags, int nfd) {
 367         int fd;
 368
 369         assert(path);
 370         assert(nfd >= 0);
 371
 372         fd = open_terminal(path, flags | O_NOCTTY);
 373         if (fd < 0)
 374                 return fd;
 375
 376         return move_fd(fd, nfd, false);
 377 }
 378
 379 static int acquire_path(const char *path, int flags, mode_t mode) {
 380         union sockaddr_union sa = {};
 381         _cleanup_close_ int fd = -1;
 382         int r, salen;
 383
 384         assert(path);
 385
 386         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 387                 flags |= O_CREAT;
 388
 389         fd = open(path, flags|O_NOCTTY, mode);
 390         if (fd >= 0)
 391                 return TAKE_FD(fd);
 392
 393         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 394                 return -errno;
 395         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 396                 return -ENXIO;
 397
 398         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 399
 400         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 401         if (fd < 0)
 402                 return -errno;
 403
 404         salen = sockaddr_un_set_path(&sa.un, path);
 405         if (salen < 0)
 406                 return salen;
 407
 408         if (connect(fd, &sa.sa, salen) < 0)
 409                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 410                                                            * indication that his wasn't an AF_UNIX socket after all */
 411
 412         if ((flags & O_ACCMODE) == O_RDONLY)
 413                 r = shutdown(fd, SHUT_WR);
 414         else if ((flags & O_ACCMODE) == O_WRONLY)
 415                 r = shutdown(fd, SHUT_RD);
 416         else
 417                 return TAKE_FD(fd);
 418         if (r < 0)
 419                 return -errno;
 420
 421         return TAKE_FD(fd);
 422 }
 423
 424 static int fixup_input(
 425                 const ExecContext *context,
 426                 int socket_fd,
 427                 bool apply_tty_stdin) {
 428
 429         ExecInput std_input;
 430
 431         assert(context);
 432
 433         std_input = context->std_input;
 434
 435         if (is_terminal_input(std_input) && !apply_tty_stdin)
 436                 return EXEC_INPUT_NULL;
 437
 438         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         return std_input;
 445 }
 446
 447 static int fixup_output(ExecOutput std_output, int socket_fd) {
 448
 449         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 450                 return EXEC_OUTPUT_INHERIT;
 451
 452         return std_output;
 453 }
 454
 455 static int setup_input(
 456                 const ExecContext *context,
 457                 const ExecParameters *params,
 458                 int socket_fd,
 459                 const int named_iofds[static 3]) {
 460
 461         ExecInput i;
 462
 463         assert(context);
 464         assert(params);
 465         assert(named_iofds);
 466
 467         if (params->stdin_fd >= 0) {
 468                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 469                         return -errno;
 470
 471                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 472                 if (isatty(STDIN_FILENO)) {
 473                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 474                         (void) reset_terminal_fd(STDIN_FILENO, true);
 475                 }
 476
 477                 return STDIN_FILENO;
 478         }
 479
 480         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 481
 482         switch (i) {
 483
 484         case EXEC_INPUT_NULL:
 485                 return open_null_as(O_RDONLY, STDIN_FILENO);
 486
 487         case EXEC_INPUT_TTY:
 488         case EXEC_INPUT_TTY_FORCE:
 489         case EXEC_INPUT_TTY_FAIL: {
 490                 int fd;
 491
 492                 fd = acquire_terminal(exec_context_tty_path(context),
 493                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 494                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 495                                                                   ACQUIRE_TERMINAL_WAIT,
 496                                       USEC_INFINITY);
 497                 if (fd < 0)
 498                         return fd;
 499
 500                 return move_fd(fd, STDIN_FILENO, false);
 501         }
 502
 503         case EXEC_INPUT_SOCKET:
 504                 assert(socket_fd >= 0);
 505
 506                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 507
 508         case EXEC_INPUT_NAMED_FD:
 509                 assert(named_iofds[STDIN_FILENO] >= 0);
 510
 511                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 512                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_DATA: {
 515                 int fd;
 516
 517                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 518                 if (fd < 0)
 519                         return fd;
 520
 521                 return move_fd(fd, STDIN_FILENO, false);
 522         }
 523
 524         case EXEC_INPUT_FILE: {
 525                 bool rw;
 526                 int fd;
 527
 528                 assert(context->stdio_file[STDIN_FILENO]);
 529
 530                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 531                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 532
 533                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 534                 if (fd < 0)
 535                         return fd;
 536
 537                 return move_fd(fd, STDIN_FILENO, false);
 538         }
 539
 540         default:
 541                 assert_not_reached("Unknown input type");
 542         }
 543 }
 544
 545 static bool can_inherit_stderr_from_stdout(
 546                 const ExecContext *context,
 547                 ExecOutput o,
 548                 ExecOutput e) {
 549
 550         assert(context);
 551
 552         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 553          * stderr fd */
 554
 555         if (e == EXEC_OUTPUT_INHERIT)
 556                 return true;
 557         if (e != o)
 558                 return false;
 559
 560         if (e == EXEC_OUTPUT_NAMED_FD)
 561                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 562
 563         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 564                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 565
 566         return true;
 567 }
 568
 569 static int setup_output(
 570                 const Unit *unit,
 571                 const ExecContext *context,
 572                 const ExecParameters *params,
 573                 int fileno,
 574                 int socket_fd,
 575                 const int named_iofds[static 3],
 576                 const char *ident,
 577                 uid_t uid,
 578                 gid_t gid,
 579                 dev_t *journal_stream_dev,
 580                 ino_t *journal_stream_ino) {
 581
 582         ExecOutput o;
 583         ExecInput i;
 584         int r;
 585
 586         assert(unit);
 587         assert(context);
 588         assert(params);
 589         assert(ident);
 590         assert(journal_stream_dev);
 591         assert(journal_stream_ino);
 592
 593         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 594
 595                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 596                         return -errno;
 597
 598                 return STDOUT_FILENO;
 599         }
 600
 601         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 602                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 603                         return -errno;
 604
 605                 return STDERR_FILENO;
 606         }
 607
 608         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 609         o = fixup_output(context->std_output, socket_fd);
 610
 611         if (fileno == STDERR_FILENO) {
 612                 ExecOutput e;
 613                 e = fixup_output(context->std_error, socket_fd);
 614
 615                 /* This expects the input and output are already set up */
 616
 617                 /* Don't change the stderr file descriptor if we inherit all
 618                  * the way and are not on a tty */
 619                 if (e == EXEC_OUTPUT_INHERIT &&
 620                     o == EXEC_OUTPUT_INHERIT &&
 621                     i == EXEC_INPUT_NULL &&
 622                     !is_terminal_input(context->std_input) &&
 623                     getppid () != 1)
 624                         return fileno;
 625
 626                 /* Duplicate from stdout if possible */
 627                 if (can_inherit_stderr_from_stdout(context, o, e))
 628                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 629
 630                 o = e;
 631
 632         } else if (o == EXEC_OUTPUT_INHERIT) {
 633                 /* If input got downgraded, inherit the original value */
 634                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 635                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 636
 637                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 638                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 639                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 640
 641                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 642                 if (getppid() != 1)
 643                         return fileno;
 644
 645                 /* We need to open /dev/null here anew, to get the right access mode. */
 646                 return open_null_as(O_WRONLY, fileno);
 647         }
 648
 649         switch (o) {
 650
 651         case EXEC_OUTPUT_NULL:
 652                 return open_null_as(O_WRONLY, fileno);
 653
 654         case EXEC_OUTPUT_TTY:
 655                 if (is_terminal_input(i))
 656                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 657
 658                 /* We don't reset the terminal if this is just about output */
 659                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 660
 661         case EXEC_OUTPUT_SYSLOG:
 662         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 663         case EXEC_OUTPUT_KMSG:
 664         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 665         case EXEC_OUTPUT_JOURNAL:
 666         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 667                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 668                 if (r < 0) {
 669                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 670                         r = open_null_as(O_WRONLY, fileno);
 671                 } else {
 672                         struct stat st;
 673
 674                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 675                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 676                          * services to detect whether they are connected to the journal or not.
 677                          *
 678                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 679                          * about STDERR as that's usually the best way to do logging. */
 680
 681                         if (fstat(fileno, &st) >= 0 &&
 682                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 683                                 *journal_stream_dev = st.st_dev;
 684                                 *journal_stream_ino = st.st_ino;
 685                         }
 686                 }
 687                 return r;
 688
 689         case EXEC_OUTPUT_SOCKET:
 690                 assert(socket_fd >= 0);
 691
 692                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 693
 694         case EXEC_OUTPUT_NAMED_FD:
 695                 assert(named_iofds[fileno] >= 0);
 696
 697                 (void) fd_nonblock(named_iofds[fileno], false);
 698                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 699
 700         case EXEC_OUTPUT_FILE:
 701         case EXEC_OUTPUT_FILE_APPEND: {
 702                 bool rw;
 703                 int fd, flags;
 704
 705                 assert(context->stdio_file[fileno]);
 706
 707                 rw = context->std_input == EXEC_INPUT_FILE &&
 708                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 709
 710                 if (rw)
 711                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 712
 713                 flags = O_WRONLY;
 714                 if (o == EXEC_OUTPUT_FILE_APPEND)
 715                         flags |= O_APPEND;
 716
 717                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 718                 if (fd < 0)
 719                         return fd;
 720
 721                 return move_fd(fd, fileno, 0);
 722         }
 723
 724         default:
 725                 assert_not_reached("Unknown error type");
 726         }
 727 }
 728
 729 static int chown_terminal(int fd, uid_t uid) {
 730         int r;
 731
 732         assert(fd >= 0);
 733
 734         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 735         if (isatty(fd) < 1) {
 736                 if (IN_SET(errno, EINVAL, ENOTTY))
 737                         return 0; /* not a tty */
 738
 739                 return -errno;
 740         }
 741
 742         /* This might fail. What matters are the results. */
 743         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 744         if (r < 0)
 745                 return r;
 746
 747         return 1;
 748 }
 749
 750 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 751         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 752         int r;
 753
 754         assert(_saved_stdin);
 755         assert(_saved_stdout);
 756
 757         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 758         if (saved_stdin < 0)
 759                 return -errno;
 760
 761         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 762         if (saved_stdout < 0)
 763                 return -errno;
 764
 765         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 766         if (fd < 0)
 767                 return fd;
 768
 769         r = chown_terminal(fd, getuid());
 770         if (r < 0)
 771                 return r;
 772
 773         r = reset_terminal_fd(fd, true);
 774         if (r < 0)
 775                 return r;
 776
 777         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 778         fd = -1;
 779         if (r < 0)
 780                 return r;
 781
 782         *_saved_stdin = saved_stdin;
 783         *_saved_stdout = saved_stdout;
 784
 785         saved_stdin = saved_stdout = -1;
 786
 787         return 0;
 788 }
 789
 790 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 791         assert(err < 0);
 792
 793         if (err == -ETIMEDOUT)
 794                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 795         else {
 796                 errno = -err;
 797                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 798         }
 799 }
 800
 801 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 802         _cleanup_close_ int fd = -1;
 803
 804         assert(vc);
 805
 806         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 807         if (fd < 0)
 808                 return;
 809
 810         write_confirm_error_fd(err, fd, u);
 811 }
 812
 813 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 814         int r = 0;
 815
 816         assert(saved_stdin);
 817         assert(saved_stdout);
 818
 819         release_terminal();
 820
 821         if (*saved_stdin >= 0)
 822                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 823                         r = -errno;
 824
 825         if (*saved_stdout >= 0)
 826                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 827                         r = -errno;
 828
 829         *saved_stdin = safe_close(*saved_stdin);
 830         *saved_stdout = safe_close(*saved_stdout);
 831
 832         return r;
 833 }
 834
 835 enum {
 836         CONFIRM_PRETEND_FAILURE = -1,
 837         CONFIRM_PRETEND_SUCCESS =  0,
 838         CONFIRM_EXECUTE = 1,
 839 };
 840
 841 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 842         int saved_stdout = -1, saved_stdin = -1, r;
 843         _cleanup_free_ char *e = NULL;
 844         char c;
 845
 846         /* For any internal errors, assume a positive response. */
 847         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 848         if (r < 0) {
 849                 write_confirm_error(r, vc, u);
 850                 return CONFIRM_EXECUTE;
 851         }
 852
 853         /* confirm_spawn might have been disabled while we were sleeping. */
 854         if (manager_is_confirm_spawn_disabled(u->manager)) {
 855                 r = 1;
 856                 goto restore_stdio;
 857         }
 858
 859         e = ellipsize(cmdline, 60, 100);
 860         if (!e) {
 861                 log_oom();
 862                 r = CONFIRM_EXECUTE;
 863                 goto restore_stdio;
 864         }
 865
 866         for (;;) {
 867                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 868                 if (r < 0) {
 869                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 870                         r = CONFIRM_EXECUTE;
 871                         goto restore_stdio;
 872                 }
 873
 874                 switch (c) {
 875                 case 'c':
 876                         printf("Resuming normal execution.\n");
 877                         manager_disable_confirm_spawn();
 878                         r = 1;
 879                         break;
 880                 case 'D':
 881                         unit_dump(u, stdout, "  ");
 882                         continue; /* ask again */
 883                 case 'f':
 884                         printf("Failing execution.\n");
 885                         r = CONFIRM_PRETEND_FAILURE;
 886                         break;
 887                 case 'h':
 888                         printf("  c - continue, proceed without asking anymore\n"
 889                                "  D - dump, show the state of the unit\n"
 890                                "  f - fail, don't execute the command and pretend it failed\n"
 891                                "  h - help\n"
 892                                "  i - info, show a short summary of the unit\n"
 893                                "  j - jobs, show jobs that are in progress\n"
 894                                "  s - skip, don't execute the command and pretend it succeeded\n"
 895                                "  y - yes, execute the command\n");
 896                         continue; /* ask again */
 897                 case 'i':
 898                         printf("  Description: %s\n"
 899                                "  Unit:        %s\n"
 900                                "  Command:     %s\n",
 901                                u->id, u->description, cmdline);
 902                         continue; /* ask again */
 903                 case 'j':
 904                         manager_dump_jobs(u->manager, stdout, "  ");
 905                         continue; /* ask again */
 906                 case 'n':
 907                         /* 'n' was removed in favor of 'f'. */
 908                         printf("Didn't understand 'n', did you mean 'f'?\n");
 909                         continue; /* ask again */
 910                 case 's':
 911                         printf("Skipping execution.\n");
 912                         r = CONFIRM_PRETEND_SUCCESS;
 913                         break;
 914                 case 'y':
 915                         r = CONFIRM_EXECUTE;
 916                         break;
 917                 default:
 918                         assert_not_reached("Unhandled choice");
 919                 }
 920                 break;
 921         }
 922
 923 restore_stdio:
 924         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 925         return r;
 926 }
 927
 928 static int get_fixed_user(const ExecContext *c, const char **user,
 929                           uid_t *uid, gid_t *gid,
 930                           const char **home, const char **shell) {
 931         int r;
 932         const char *name;
 933
 934         assert(c);
 935
 936         if (!c->user)
 937                 return 0;
 938
 939         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 940          * (i.e. are "/" or "/bin/nologin"). */
 941
 942         name = c->user;
 943         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 944         if (r < 0)
 945                 return r;
 946
 947         *user = name;
 948         return 0;
 949 }
 950
 951 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 952         int r;
 953         const char *name;
 954
 955         assert(c);
 956
 957         if (!c->group)
 958                 return 0;
 959
 960         name = c->group;
 961         r = get_group_creds(&name, gid, 0);
 962         if (r < 0)
 963                 return r;
 964
 965         *group = name;
 966         return 0;
 967 }
 968
 969 static int get_supplementary_groups(const ExecContext *c, const char *user,
 970                                     const char *group, gid_t gid,
 971                                     gid_t **supplementary_gids, int *ngids) {
 972         char **i;
 973         int r, k = 0;
 974         int ngroups_max;
 975         bool keep_groups = false;
 976         gid_t *groups = NULL;
 977         _cleanup_free_ gid_t *l_gids = NULL;
 978
 979         assert(c);
 980
 981         /*
 982          * If user is given, then lookup GID and supplementary groups list.
 983          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 984          * here and as early as possible so we keep the list of supplementary
 985          * groups of the caller.
 986          */
 987         if (user && gid_is_valid(gid) && gid != 0) {
 988                 /* First step, initialize groups from /etc/groups */
 989                 if (initgroups(user, gid) < 0)
 990                         return -errno;
 991
 992                 keep_groups = true;
 993         }
 994
 995         if (strv_isempty(c->supplementary_groups))
 996                 return 0;
 997
 998         /*
 999          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1000          * be positive, otherwise fail.
1001          */
1002         errno = 0;
1003         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1004         if (ngroups_max <= 0)
1005                 return errno_or_else(EOPNOTSUPP);
1006
1007         l_gids = new(gid_t, ngroups_max);
1008         if (!l_gids)
1009                 return -ENOMEM;
1010
1011         if (keep_groups) {
1012                 /*
1013                  * Lookup the list of groups that the user belongs to, we
1014                  * avoid NSS lookups here too for gid=0.
1015                  */
1016                 k = ngroups_max;
1017                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1018                         return -EINVAL;
1019         } else
1020                 k = 0;
1021
1022         STRV_FOREACH(i, c->supplementary_groups) {
1023                 const char *g;
1024
1025                 if (k >= ngroups_max)
1026                         return -E2BIG;
1027
1028                 g = *i;
1029                 r = get_group_creds(&g, l_gids+k, 0);
1030                 if (r < 0)
1031                         return r;
1032
1033                 k++;
1034         }
1035
1036         /*
1037          * Sets ngids to zero to drop all supplementary groups, happens
1038          * when we are under root and SupplementaryGroups= is empty.
1039          */
1040         if (k == 0) {
1041                 *ngids = 0;
1042                 return 0;
1043         }
1044
1045         /* Otherwise get the final list of supplementary groups */
1046         groups = memdup(l_gids, sizeof(gid_t) * k);
1047         if (!groups)
1048                 return -ENOMEM;
1049
1050         *supplementary_gids = groups;
1051         *ngids = k;
1052
1053         groups = NULL;
1054
1055         return 0;
1056 }
1057
/* Applies the group credentials to the calling process: first the supplementary groups (only if ngids is
 * positive), then the real/effective/saved GID (only if gid is valid). Returns 0 on success, or a negative
 * errno-style value if maybe_setgroups() or setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1076
1077 static int enforce_user(const ExecContext *context, uid_t uid) {
1078         assert(context);
1079
1080         if (!uid_is_valid(uid))
1081                 return 0;
1082
1083         /* Sets (but doesn't look up) the uid and make sure we keep the
1084          * capabilities while doing so. */
1085
1086         if (context->capability_ambient_set != 0) {
1087
1088                 /* First step: If we need to keep capabilities but
1089                  * drop privileges we need to make sure we keep our
1090                  * caps, while we drop privileges. */
1091                 if (uid != 0) {
1092                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1093
1094                         if (prctl(PR_GET_SECUREBITS) != sb)
1095                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1096                                         return -errno;
1097                 }
1098         }
1099
1100         /* Second step: actually set the uids */
1101         if (setresuid(uid, uid, uid) < 0)
1102                 return -errno;
1103
1104         /* At this point we should have all necessary capabilities but
1105            are otherwise a normal user. However, the caps might got
1106            corrupted due to the setresuid() so we need clean them up
1107            later. This is done outside of this call. */
1108
1109         return 0;
1110 }
1111
1112 #if HAVE_PAM
1113
1114 static int null_conv(
1115                 int num_msg,
1116                 const struct pam_message **msg,
1117                 struct pam_response **resp,
1118                 void *appdata_ptr) {
1119
1120         /* We don't support conversations */
1121
1122         return PAM_CONV_ERR;
1123 }
1124
1125 #endif
1126
/* Opens a PAM session for the process about to be executed and forks off a "(sd-pam)" helper process
 * that closes the session again later.
 *
 *   name       PAM service name, passed to pam_start()
 *   user       user name, passed to pam_start()
 *   uid, gid   credentials the helper child switches to via setresuid()/setresgid()
 *   tty        TTY path to register as PAM_TTY; if NULL, the TTY name of stdin is used when it can be
 *              determined
 *   env        in: environment that is pushed into PAM with pam_putenv(); out: on success replaced by
 *              the list returned by pam_getenvlist()
 *   fds/n_fds  file descriptors that get closed inside the helper child
 *
 * On failure a negative errno-style value is returned; PAM-level failures are reported as -EPERM. On
 * success the result of strv_free_and_replace() is returned (NOTE(review): presumably 0, confirm against
 * the macro definition). If built without HAVE_PAM this does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path below doesn't try to end a handle that was never set up */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Hand the environment prepared so far over to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID, so that the child can detect via getppid() whether we are still around */
        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through its return value (a positive
                                 * error number) and not via -1/errno, hence check the return value
                                 * itself. Only on success is "sig" guaranteed to be filled in. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment with the one PAM handed back */
        return strv_free_and_replace(*env, e);

fail:
        /* Two kinds of failures end up here: PAM-level ones (pam_code set) and errno-style ones (r set) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1342
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the given
 * path, truncated to its final 8 characters. If the path has no usable last component the process is
 * renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *tail;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name: it is usually the more interesting part, since the
                 * first bit might just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1373
1374 static bool context_has_address_families(const ExecContext *c) {
1375         assert(c);
1376
1377         return c->address_families_whitelist ||
1378                 !set_isempty(c->address_families);
1379 }
1380
1381 static bool context_has_syscall_filters(const ExecContext *c) {
1382         assert(c);
1383
1384         return c->syscall_whitelist ||
1385                 !hashmap_isempty(c->syscall_filter);
1386 }
1387
/* Decides whether the no-new-privileges flag is needed for this context. It is needed if it was requested
 * explicitly, or if we lack CAP_SYS_ADMIN and any of the settings checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged. The checks are done in the
         * same order as they have always been. */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime || c->restrict_suid_sgid)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality || c->protect_hostname;
}
1411
1412 #if HAVE_SECCOMP
1413
1414 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1415
1416         if (is_seccomp_available())
1417                 return false;
1418
1419         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1420         return true;
1421 }
1422
1423 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1424         uint32_t negative_action, default_action, action;
1425         int r;
1426
1427         assert(u);
1428         assert(c);
1429
1430         if (!context_has_syscall_filters(c))
1431                 return 0;
1432
1433         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1434                 return 0;
1435
1436         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1437
1438         if (c->syscall_whitelist) {
1439                 default_action = negative_action;
1440                 action = SCMP_ACT_ALLOW;
1441         } else {
1442                 default_action = SCMP_ACT_ALLOW;
1443                 action = negative_action;
1444         }
1445
1446         if (needs_ambient_hack) {
1447                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1448                 if (r < 0)
1449                         return r;
1450         }
1451
1452         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1453 }
1454
1455 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1456         assert(u);
1457         assert(c);
1458
1459         if (set_isempty(c->syscall_archs))
1460                 return 0;
1461
1462         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1463                 return 0;
1464
1465         return seccomp_restrict_archs(c->syscall_archs);
1466 }
1467
1468 static int apply_address_families(const Unit* u, const ExecContext *c) {
1469         assert(u);
1470         assert(c);
1471
1472         if (!context_has_address_families(c))
1473                 return 0;
1474
1475         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1476                 return 0;
1477
1478         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1479 }
1480
1481 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1482         assert(u);
1483         assert(c);
1484
1485         if (!c->memory_deny_write_execute)
1486                 return 0;
1487
1488         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1489                 return 0;
1490
1491         return seccomp_memory_deny_write_execute();
1492 }
1493
1494 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1495         assert(u);
1496         assert(c);
1497
1498         if (!c->restrict_realtime)
1499                 return 0;
1500
1501         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1502                 return 0;
1503
1504         return seccomp_restrict_realtime();
1505 }
1506
1507 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1508         assert(u);
1509         assert(c);
1510
1511         if (!c->restrict_suid_sgid)
1512                 return 0;
1513
1514         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1515                 return 0;
1516
1517         return seccomp_restrict_suid_sgid();
1518 }
1519
1520 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1521         assert(u);
1522         assert(c);
1523
1524         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1525          * let's protect even those systems where this is left on in the kernel. */
1526
1527         if (!c->protect_kernel_tunables)
1528                 return 0;
1529
1530         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1531                 return 0;
1532
1533         return seccomp_protect_sysctl();
1534 }
1535
1536 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1537         assert(u);
1538         assert(c);
1539
1540         /* Turn off module syscalls on ProtectKernelModules=yes */
1541
1542         if (!c->protect_kernel_modules)
1543                 return 0;
1544
1545         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1546                 return 0;
1547
1548         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1549 }
1550
1551 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1552         assert(u);
1553         assert(c);
1554
1555         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1556
1557         if (!c->private_devices)
1558                 return 0;
1559
1560         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1561                 return 0;
1562
1563         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1564 }
1565
1566 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1567         assert(u);
1568         assert(c);
1569
1570         if (!exec_context_restrict_namespaces_set(c))
1571                 return 0;
1572
1573         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1574                 return 0;
1575
1576         return seccomp_restrict_namespaces(c->restrict_namespaces);
1577 }
1578
1579 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1580         unsigned long personality;
1581         int r;
1582
1583         assert(u);
1584         assert(c);
1585
1586         if (!c->lock_personality)
1587                 return 0;
1588
1589         if (skip_seccomp_unavailable(u, "LockPersonality="))
1590                 return 0;
1591
1592         personality = c->personality;
1593
1594         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1595         if (personality == PERSONALITY_INVALID) {
1596
1597                 r = opinionated_personality(&personality);
1598                 if (r < 0)
1599                         return r;
1600         }
1601
1602         return seccomp_lock_personality(personality);
1603 }
1604
1605 #endif
1606
1607 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1608         assert(idle_pipe);
1609
1610         idle_pipe[1] = safe_close(idle_pipe[1]);
1611         idle_pipe[2] = safe_close(idle_pipe[2]);
1612
1613         if (idle_pipe[0] >= 0) {
1614                 int r;
1615
1616                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1617
1618                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1619                         ssize_t n;
1620
1621                         /* Signal systemd that we are bored and want to continue. */
1622                         n = write(idle_pipe[3], "x", 1);
1623                         if (n > 0)
1624                                 /* Wait for systemd to react to the signal above. */
1625                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1626                 }
1627
1628                 idle_pipe[0] = safe_close(idle_pipe[0]);
1629
1630         }
1631
1632         idle_pipe[3] = safe_close(idle_pipe[3]);
1633 }
1634
/* Forward declaration: build_environment() below calls this helper, whose definition is not part of this
 * section of the file. A NULL return is treated by that caller as "no variable for this directory type". */
static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1636
1637 static int build_environment(
1638                 const Unit *u,
1639                 const ExecContext *c,
1640                 const ExecParameters *p,
1641                 size_t n_fds,
1642                 const char *home,
1643                 const char *username,
1644                 const char *shell,
1645                 dev_t journal_stream_dev,
1646                 ino_t journal_stream_ino,
1647                 char ***ret) {
1648
1649         _cleanup_strv_free_ char **our_env = NULL;
1650         ExecDirectoryType t;
1651         size_t n_env = 0;
1652         char *x;
1653
1654         assert(u);
1655         assert(c);
1656         assert(p);
1657         assert(ret);
1658
1659         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1660         if (!our_env)
1661                 return -ENOMEM;
1662
1663         if (n_fds > 0) {
1664                 _cleanup_free_ char *joined = NULL;
1665
1666                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1667                         return -ENOMEM;
1668                 our_env[n_env++] = x;
1669
1670                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1671                         return -ENOMEM;
1672                 our_env[n_env++] = x;
1673
1674                 joined = strv_join(p->fd_names, ":");
1675                 if (!joined)
1676                         return -ENOMEM;
1677
1678                 x = strjoin("LISTEN_FDNAMES=", joined);
1679                 if (!x)
1680                         return -ENOMEM;
1681                 our_env[n_env++] = x;
1682         }
1683
1684         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1685                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1686                         return -ENOMEM;
1687                 our_env[n_env++] = x;
1688
1689                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1690                         return -ENOMEM;
1691                 our_env[n_env++] = x;
1692         }
1693
1694         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1695          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1696          * check the database directly. */
1697         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1698                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1699                 if (!x)
1700                         return -ENOMEM;
1701                 our_env[n_env++] = x;
1702         }
1703
1704         if (home) {
1705                 x = strjoin("HOME=", home);
1706                 if (!x)
1707                         return -ENOMEM;
1708
1709                 path_simplify(x + 5, true);
1710                 our_env[n_env++] = x;
1711         }
1712
1713         if (username) {
1714                 x = strjoin("LOGNAME=", username);
1715                 if (!x)
1716                         return -ENOMEM;
1717                 our_env[n_env++] = x;
1718
1719                 x = strjoin("USER=", username);
1720                 if (!x)
1721                         return -ENOMEM;
1722                 our_env[n_env++] = x;
1723         }
1724
1725         if (shell) {
1726                 x = strjoin("SHELL=", shell);
1727                 if (!x)
1728                         return -ENOMEM;
1729
1730                 path_simplify(x + 6, true);
1731                 our_env[n_env++] = x;
1732         }
1733
1734         if (!sd_id128_is_null(u->invocation_id)) {
1735                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1736                         return -ENOMEM;
1737
1738                 our_env[n_env++] = x;
1739         }
1740
1741         if (exec_context_needs_term(c)) {
1742                 const char *tty_path, *term = NULL;
1743
1744                 tty_path = exec_context_tty_path(c);
1745
1746                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1747                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1748                  * passes to PID 1 ends up all the way in the console login shown. */
1749
1750                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1751                         term = getenv("TERM");
1752                 if (!term)
1753                         term = default_term_for_tty(tty_path);
1754
1755                 x = strjoin("TERM=", term);
1756                 if (!x)
1757                         return -ENOMEM;
1758                 our_env[n_env++] = x;
1759         }
1760
1761         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1762                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1763                         return -ENOMEM;
1764
1765                 our_env[n_env++] = x;
1766         }
1767
1768         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1769                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1770                 const char *n;
1771
1772                 if (!p->prefix[t])
1773                         continue;
1774
1775                 if (strv_isempty(c->directories[t].paths))
1776                         continue;
1777
1778                 n = exec_directory_env_name_to_string(t);
1779                 if (!n)
1780                         continue;
1781
1782                 pre = strjoin(p->prefix[t], "/");
1783                 if (!pre)
1784                         return -ENOMEM;
1785
1786                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1787                 if (!joined)
1788                         return -ENOMEM;
1789
1790                 x = strjoin(n, "=", joined);
1791                 if (!x)
1792                         return -ENOMEM;
1793
1794                 our_env[n_env++] = x;
1795         }
1796
1797         our_env[n_env++] = NULL;
1798         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1799
1800         *ret = TAKE_PTR(our_env);
1801
1802         return 0;
1803 }
1804
1805 static int build_pass_environment(const ExecContext *c, char ***ret) {
1806         _cleanup_strv_free_ char **pass_env = NULL;
1807         size_t n_env = 0, n_bufsize = 0;
1808         char **i;
1809
1810         STRV_FOREACH(i, c->pass_environment) {
1811                 _cleanup_free_ char *x = NULL;
1812                 char *v;
1813
1814                 v = getenv(*i);
1815                 if (!v)
1816                         continue;
1817                 x = strjoin(*i, "=", v);
1818                 if (!x)
1819                         return -ENOMEM;
1820
1821                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1822                         return -ENOMEM;
1823
1824                 pass_env[n_env++] = TAKE_PTR(x);
1825                 pass_env[n_env] = NULL;
1826         }
1827
1828         *ret = TAKE_PTR(pass_env);
1829
1830         return 0;
1831 }
1832
1833 static bool exec_needs_mount_namespace(
1834                 const ExecContext *context,
1835                 const ExecParameters *params,
1836                 const ExecRuntime *runtime) {
1837
1838         assert(context);
1839         assert(params);
1840
1841         if (context->root_image)
1842                 return true;
1843
1844         if (!strv_isempty(context->read_write_paths) ||
1845             !strv_isempty(context->read_only_paths) ||
1846             !strv_isempty(context->inaccessible_paths))
1847                 return true;
1848
1849         if (context->n_bind_mounts > 0)
1850                 return true;
1851
1852         if (context->n_temporary_filesystems > 0)
1853                 return true;
1854
1855         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1856                 return true;
1857
1858         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1859                 return true;
1860
1861         if (context->private_devices ||
1862             context->private_mounts ||
1863             context->protect_system != PROTECT_SYSTEM_NO ||
1864             context->protect_home != PROTECT_HOME_NO ||
1865             context->protect_kernel_tunables ||
1866             context->protect_kernel_modules ||
1867             context->protect_control_groups)
1868                 return true;
1869
1870         if (context->root_directory) {
1871                 ExecDirectoryType t;
1872
1873                 if (context->mount_apivfs)
1874                         return true;
1875
1876                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1877                         if (!params->prefix[t])
1878                                 continue;
1879
1880                         if (!strv_isempty(context->directories[t].paths))
1881                                 return true;
1882                 }
1883         }
1884
1885         if (context->dynamic_user &&
1886             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1887              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1888              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1889                 return true;
1890
1891         return false;
1892 }
1893
1894 static int setup_private_users(uid_t uid, gid_t gid) {
1895         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1896         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1897         _cleanup_close_ int unshare_ready_fd = -1;
1898         _cleanup_(sigkill_waitp) pid_t pid = 0;
1899         uint64_t c = 1;
1900         ssize_t n;
1901         int r;
1902
1903         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1904          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1905          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1906          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1907          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1908          * continues execution normally. */
1909
1910         if (uid != 0 && uid_is_valid(uid)) {
1911                 r = asprintf(&uid_map,
1912                              "0 0 1\n"                      /* Map root → root */
1913                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1914                              uid, uid);
1915                 if (r < 0)
1916                         return -ENOMEM;
1917         } else {
1918                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1919                 if (!uid_map)
1920                         return -ENOMEM;
1921         }
1922
1923         if (gid != 0 && gid_is_valid(gid)) {
1924                 r = asprintf(&gid_map,
1925                              "0 0 1\n"                      /* Map root → root */
1926                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1927                              gid, gid);
1928                 if (r < 0)
1929                         return -ENOMEM;
1930         } else {
1931                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1932                 if (!gid_map)
1933                         return -ENOMEM;
1934         }
1935
1936         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1937          * namespace. */
1938         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1939         if (unshare_ready_fd < 0)
1940                 return -errno;
1941
1942         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1943          * failed. */
1944         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1945                 return -errno;
1946
1947         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1948         if (r < 0)
1949                 return r;
1950         if (r == 0) {
1951                 _cleanup_close_ int fd = -1;
1952                 const char *a;
1953                 pid_t ppid;
1954
1955                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1956                  * here, after the parent opened its own user namespace. */
1957
1958                 ppid = getppid();
1959                 errno_pipe[0] = safe_close(errno_pipe[0]);
1960
1961                 /* Wait until the parent unshared the user namespace */
1962                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1963                         r = -errno;
1964                         goto child_fail;
1965                 }
1966
1967                 /* Disable the setgroups() system call in the child user namespace, for good. */
1968                 a = procfs_file_alloca(ppid, "setgroups");
1969                 fd = open(a, O_WRONLY|O_CLOEXEC);
1970                 if (fd < 0) {
1971                         if (errno != ENOENT) {
1972                                 r = -errno;
1973                                 goto child_fail;
1974                         }
1975
1976                         /* If the file is missing the kernel is too old, let's continue anyway. */
1977                 } else {
1978                         if (write(fd, "deny\n", 5) < 0) {
1979                                 r = -errno;
1980                                 goto child_fail;
1981                         }
1982
1983                         fd = safe_close(fd);
1984                 }
1985
1986                 /* First write the GID map */
1987                 a = procfs_file_alloca(ppid, "gid_map");
1988                 fd = open(a, O_WRONLY|O_CLOEXEC);
1989                 if (fd < 0) {
1990                         r = -errno;
1991                         goto child_fail;
1992                 }
1993                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1994                         r = -errno;
1995                         goto child_fail;
1996                 }
1997                 fd = safe_close(fd);
1998
1999                 /* The write the UID map */
2000                 a = procfs_file_alloca(ppid, "uid_map");
2001                 fd = open(a, O_WRONLY|O_CLOEXEC);
2002                 if (fd < 0) {
2003                         r = -errno;
2004                         goto child_fail;
2005                 }
2006                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2007                         r = -errno;
2008                         goto child_fail;
2009                 }
2010
2011                 _exit(EXIT_SUCCESS);
2012
2013         child_fail:
2014                 (void) write(errno_pipe[1], &r, sizeof(r));
2015                 _exit(EXIT_FAILURE);
2016         }
2017
2018         errno_pipe[1] = safe_close(errno_pipe[1]);
2019
2020         if (unshare(CLONE_NEWUSER) < 0)
2021                 return -errno;
2022
2023         /* Let the child know that the namespace is ready now */
2024         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2025                 return -errno;
2026
2027         /* Try to read an error code from the child */
2028         n = read(errno_pipe[0], &r, sizeof(r));
2029         if (n < 0)
2030                 return -errno;
2031         if (n == sizeof(r)) { /* an error code was sent to us */
2032                 if (r < 0)
2033                         return r;
2034                 return -EIO;
2035         }
2036         if (n != 0) /* on success we should have read 0 bytes */
2037                 return -EIO;
2038
2039         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2040         pid = 0;
2041         if (r < 0)
2042                 return r;
2043         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2044                 return -EIO;
2045
2046         return 0;
2047 }
2048
2049 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2050         if (!context->dynamic_user)
2051                 return false;
2052
2053         if (type == EXEC_DIRECTORY_CONFIGURATION)
2054                 return false;
2055
2056         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2057                 return false;
2058
2059         return true;
2060 }
2061
2062 static int setup_exec_directory(
2063                 const ExecContext *context,
2064                 const ExecParameters *params,
2065                 uid_t uid,
2066                 gid_t gid,
2067                 ExecDirectoryType type,
2068                 int *exit_status) {
2069
2070         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2071                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2072                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2073                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2074                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2075                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2076         };
2077         char **rt;
2078         int r;
2079
2080         assert(context);
2081         assert(params);
2082         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2083         assert(exit_status);
2084
2085         if (!params->prefix[type])
2086                 return 0;
2087
2088         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2089                 if (!uid_is_valid(uid))
2090                         uid = 0;
2091                 if (!gid_is_valid(gid))
2092                         gid = 0;
2093         }
2094
2095         STRV_FOREACH(rt, context->directories[type].paths) {
2096                 _cleanup_free_ char *p = NULL, *pp = NULL;
2097
2098                 p = path_join(params->prefix[type], *rt);
2099                 if (!p) {
2100                         r = -ENOMEM;
2101                         goto fail;
2102                 }
2103
2104                 r = mkdir_parents_label(p, 0755);
2105                 if (r < 0)
2106                         goto fail;
2107
2108                 if (exec_directory_is_private(context, type)) {
2109                         _cleanup_free_ char *private_root = NULL;
2110
2111                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2112                          * case we want to avoid leaving a directory around fully accessible that is owned by
2113                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2114                          * trick used by container managers to prohibit host users to get access to files of
2115                          * the same UID in containers: we place everything inside a directory that has an
2116                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2117                          * for unprivileged host code. We then use fs namespacing to make this directory
2118                          * permeable for the service itself.
2119                          *
2120                          * Specifically: for a service which wants a special directory "foo/" we first create
2121                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2122                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2123                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2124                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2125                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2126                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2127                          * for the service and making sure it only gets access to the dirs it needs but no
2128                          * others. Tricky? Yes, absolutely, but it works!
2129                          *
2130                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2131                          * to be owned by the service itself.
2132                          *
2133                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2134                          * for sharing files or sockets with other services. */
2135
2136                         private_root = path_join(params->prefix[type], "private");
2137                         if (!private_root) {
2138                                 r = -ENOMEM;
2139                                 goto fail;
2140                         }
2141
2142                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2143                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2144                         if (r < 0)
2145                                 goto fail;
2146
2147                         pp = path_join(private_root, *rt);
2148                         if (!pp) {
2149                                 r = -ENOMEM;
2150                                 goto fail;
2151                         }
2152
2153                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2154                         r = mkdir_parents_label(pp, 0755);
2155                         if (r < 0)
2156                                 goto fail;
2157
2158                         if (is_dir(p, false) > 0 &&
2159                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2160
2161                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2162                                  * it over. Most likely the service has been upgraded from one that didn't use
2163                                  * DynamicUser=1, to one that does. */
2164
2165                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2166                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2167                                          exec_directory_type_to_string(type), p, pp);
2168
2169                                 if (rename(p, pp) < 0) {
2170                                         r = -errno;
2171                                         goto fail;
2172                                 }
2173                         } else {
2174                                 /* Otherwise, create the actual directory for the service */
2175
2176                                 r = mkdir_label(pp, context->directories[type].mode);
2177                                 if (r < 0 && r != -EEXIST)
2178                                         goto fail;
2179                         }
2180
2181                         /* And link it up from the original place */
2182                         r = symlink_idempotent(pp, p, true);
2183                         if (r < 0)
2184                                 goto fail;
2185
2186                 } else {
2187                         _cleanup_free_ char *target = NULL;
2188
2189                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2190                             readlink_and_make_absolute(p, &target) >= 0) {
2191                                 _cleanup_free_ char *q = NULL;
2192
2193                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2194                                  * by DynamicUser=1 (see above)?
2195                                  *
2196                                  * We do this for all directory types except for ConfigurationDirectory=,
2197                                  * since they all support the private/ symlink logic at least in some
2198                                  * configurations, see above. */
2199
2200                                 q = path_join(params->prefix[type], "private", *rt);
2201                                 if (!q) {
2202                                         r = -ENOMEM;
2203                                         goto fail;
2204                                 }
2205
2206                                 if (path_equal(q, target)) {
2207
2208                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2209                                          * but is no longer. Let's move the directory back up. */
2210
2211                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2212                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2213                                                  exec_directory_type_to_string(type), q, p);
2214
2215                                         if (unlink(p) < 0) {
2216                                                 r = -errno;
2217                                                 goto fail;
2218                                         }
2219
2220                                         if (rename(q, p) < 0) {
2221                                                 r = -errno;
2222                                                 goto fail;
2223                                         }
2224                                 }
2225                         }
2226
2227                         r = mkdir_label(p, context->directories[type].mode);
2228                         if (r < 0) {
2229                                 if (r != -EEXIST)
2230                                         goto fail;
2231
2232                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2233                                         struct stat st;
2234
2235                                         /* Don't change the owner/access mode of the configuration directory,
2236                                          * as in the common case it is not written to by a service, and shall
2237                                          * not be writable. */
2238
2239                                         if (stat(p, &st) < 0) {
2240                                                 r = -errno;
2241                                                 goto fail;
2242                                         }
2243
2244                                         /* Still complain if the access mode doesn't match */
2245                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2246                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2247                                                             "(File system: %o %sMode: %o)",
2248                                                             exec_directory_type_to_string(type), *rt,
2249                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2250
2251                                         continue;
2252                                 }
2253                         }
2254                 }
2255
2256                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2257                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2258                  * current UID/GID ownership.) */
2259                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2260                 if (r < 0)
2261                         goto fail;
2262
2263                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2264                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2265                  * assignments to exist.*/
2266                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2267                 if (r < 0)
2268                         goto fail;
2269         }
2270
2271         return 0;
2272
2273 fail:
2274         *exit_status = exit_status_table[type];
2275         return r;
2276 }
2277
#if ENABLE_SMACK
/* Applies a SMACK label via mac_smack_apply_pid() with PID 0 (presumably denoting the calling process).
 *
 * If the context carries an explicit smack_process_label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label is read from the command's
 * binary and applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read.
 *
 * Returns 0 on success, a negative errno-style error code otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label_from_binary = NULL;

                /* -ENODATA and -EOPNOTSUPP are tolerated here; the default label is used in that case */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &label_from_binary);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, label_from_binary ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2310
2311 static int compile_bind_mounts(
2312                 const ExecContext *context,
2313                 const ExecParameters *params,
2314                 BindMount **ret_bind_mounts,
2315                 size_t *ret_n_bind_mounts,
2316                 char ***ret_empty_directories) {
2317
2318         _cleanup_strv_free_ char **empty_directories = NULL;
2319         BindMount *bind_mounts;
2320         size_t n, h = 0, i;
2321         ExecDirectoryType t;
2322         int r;
2323
2324         assert(context);
2325         assert(params);
2326         assert(ret_bind_mounts);
2327         assert(ret_n_bind_mounts);
2328         assert(ret_empty_directories);
2329
2330         n = context->n_bind_mounts;
2331         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2332                 if (!params->prefix[t])
2333                         continue;
2334
2335                 n += strv_length(context->directories[t].paths);
2336         }
2337
2338         if (n <= 0) {
2339                 *ret_bind_mounts = NULL;
2340                 *ret_n_bind_mounts = 0;
2341                 *ret_empty_directories = NULL;
2342                 return 0;
2343         }
2344
2345         bind_mounts = new(BindMount, n);
2346         if (!bind_mounts)
2347                 return -ENOMEM;
2348
2349         for (i = 0; i < context->n_bind_mounts; i++) {
2350                 BindMount *item = context->bind_mounts + i;
2351                 char *s, *d;
2352
2353                 s = strdup(item->source);
2354                 if (!s) {
2355                         r = -ENOMEM;
2356                         goto finish;
2357                 }
2358
2359                 d = strdup(item->destination);
2360                 if (!d) {
2361                         free(s);
2362                         r = -ENOMEM;
2363                         goto finish;
2364                 }
2365
2366                 bind_mounts[h++] = (BindMount) {
2367                         .source = s,
2368                         .destination = d,
2369                         .read_only = item->read_only,
2370                         .recursive = item->recursive,
2371                         .ignore_enoent = item->ignore_enoent,
2372                 };
2373         }
2374
2375         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2376                 char **suffix;
2377
2378                 if (!params->prefix[t])
2379                         continue;
2380
2381                 if (strv_isempty(context->directories[t].paths))
2382                         continue;
2383
2384                 if (exec_directory_is_private(context, t) &&
2385                     !(context->root_directory || context->root_image)) {
2386                         char *private_root;
2387
2388                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2389                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2390                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2391
2392                         private_root = path_join(params->prefix[t], "private");
2393                         if (!private_root) {
2394                                 r = -ENOMEM;
2395                                 goto finish;
2396                         }
2397
2398                         r = strv_consume(&empty_directories, private_root);
2399                         if (r < 0)
2400                                 goto finish;
2401                 }
2402
2403                 STRV_FOREACH(suffix, context->directories[t].paths) {
2404                         char *s, *d;
2405
2406                         if (exec_directory_is_private(context, t))
2407                                 s = path_join(params->prefix[t], "private", *suffix);
2408                         else
2409                                 s = path_join(params->prefix[t], *suffix);
2410                         if (!s) {
2411                                 r = -ENOMEM;
2412                                 goto finish;
2413                         }
2414
2415                         if (exec_directory_is_private(context, t) &&
2416                             (context->root_directory || context->root_image))
2417                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2418                                  * directory is not created on the root directory. So, let's bind-mount the directory
2419                                  * on the 'non-private' place. */
2420                                 d = path_join(params->prefix[t], *suffix);
2421                         else
2422                                 d = strdup(s);
2423                         if (!d) {
2424                                 free(s);
2425                                 r = -ENOMEM;
2426                                 goto finish;
2427                         }
2428
2429                         bind_mounts[h++] = (BindMount) {
2430                                 .source = s,
2431                                 .destination = d,
2432                                 .read_only = false,
2433                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2434                                 .recursive = true,
2435                                 .ignore_enoent = false,
2436                         };
2437                 }
2438         }
2439
2440         assert(h == n);
2441
2442         *ret_bind_mounts = bind_mounts;
2443         *ret_n_bind_mounts = n;
2444         *ret_empty_directories = TAKE_PTR(empty_directories);
2445
2446         return (int) n;
2447
2448 finish:
2449         bind_mount_free_many(bind_mounts, h);
2450         return r;
2451 }
2452
2453 static int apply_mount_namespace(
2454                 const Unit *u,
2455                 const ExecCommand *command,
2456                 const ExecContext *context,
2457                 const ExecParameters *params,
2458                 const ExecRuntime *runtime,
2459                 char **error_path) {
2460
2461         _cleanup_strv_free_ char **empty_directories = NULL;
2462         char *tmp = NULL, *var = NULL;
2463         const char *root_dir = NULL, *root_image = NULL;
2464         NamespaceInfo ns_info;
2465         bool needs_sandboxing;
2466         BindMount *bind_mounts = NULL;
2467         size_t n_bind_mounts = 0;
2468         int r;
2469
2470         assert(context);
2471
2472         /* The runtime struct only contains the parent of the private /tmp,
2473          * which is non-accessible to world users. Inside of it there's a /tmp
2474          * that is sticky, and that's the one we want to use here. */
2475
2476         if (context->private_tmp && runtime) {
2477                 if (runtime->tmp_dir)
2478                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2479                 if (runtime->var_tmp_dir)
2480                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2481         }
2482
2483         if (params->flags & EXEC_APPLY_CHROOT) {
2484                 root_image = context->root_image;
2485
2486                 if (!root_image)
2487                         root_dir = context->root_directory;
2488         }
2489
2490         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2491         if (r < 0)
2492                 return r;
2493
2494         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2495         if (needs_sandboxing)
2496                 ns_info = (NamespaceInfo) {
2497                         .ignore_protect_paths = false,
2498                         .private_dev = context->private_devices,
2499                         .protect_control_groups = context->protect_control_groups,
2500                         .protect_kernel_tunables = context->protect_kernel_tunables,
2501                         .protect_kernel_modules = context->protect_kernel_modules,
2502                         .protect_hostname = context->protect_hostname,
2503                         .mount_apivfs = context->mount_apivfs,
2504                         .private_mounts = context->private_mounts,
2505                 };
2506         else if (!context->dynamic_user && root_dir)
2507                 /*
2508                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2509                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2510                  * fail if we are enable to apply the sandbox inside the mount namespace.
2511                  */
2512                 ns_info = (NamespaceInfo) {
2513                         .ignore_protect_paths = true,
2514                 };
2515         else
2516                 ns_info = (NamespaceInfo) {};
2517
2518         if (context->mount_flags == MS_SHARED)
2519                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2520
2521         r = setup_namespace(root_dir, root_image,
2522                             &ns_info, context->read_write_paths,
2523                             needs_sandboxing ? context->read_only_paths : NULL,
2524                             needs_sandboxing ? context->inaccessible_paths : NULL,
2525                             empty_directories,
2526                             bind_mounts,
2527                             n_bind_mounts,
2528                             context->temporary_filesystems,
2529                             context->n_temporary_filesystems,
2530                             tmp,
2531                             var,
2532                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2533                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2534                             context->mount_flags,
2535                             DISSECT_IMAGE_DISCARD_ON_LOOP,
2536                             error_path);
2537
2538         bind_mount_free_many(bind_mounts, n_bind_mounts);
2539
2540         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2541          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2542          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2543          * completely different execution environment. */
2544         if (r == -ENOANO) {
2545                 if (n_bind_mounts == 0 &&
2546                     context->n_temporary_filesystems == 0 &&
2547                     !root_dir && !root_image &&
2548                     !context->dynamic_user) {
2549                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2550                         return 0;
2551                 }
2552
2553                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2554                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2555                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2556
2557                 return -EOPNOTSUPP;
2558         }
2559
2560         return r;
2561 }
2562
2563 static int apply_working_directory(
2564                 const ExecContext *context,
2565                 const ExecParameters *params,
2566                 const char *home,
2567                 int *exit_status) {
2568
2569         const char *d, *wd;
2570
2571         assert(context);
2572         assert(exit_status);
2573
2574         if (context->working_directory_home) {
2575
2576                 if (!home) {
2577                         *exit_status = EXIT_CHDIR;
2578                         return -ENXIO;
2579                 }
2580
2581                 wd = home;
2582
2583         } else if (context->working_directory)
2584                 wd = context->working_directory;
2585         else
2586                 wd = "/";
2587
2588         if (params->flags & EXEC_APPLY_CHROOT)
2589                 d = wd;
2590         else
2591                 d = prefix_roota(context->root_directory, wd);
2592
2593         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2594                 *exit_status = EXIT_CHDIR;
2595                 return -errno;
2596         }
2597
2598         return 0;
2599 }
2600
2601 static int apply_root_directory(
2602                 const ExecContext *context,
2603                 const ExecParameters *params,
2604                 const bool needs_mount_ns,
2605                 int *exit_status) {
2606
2607         assert(context);
2608         assert(exit_status);
2609
2610         if (params->flags & EXEC_APPLY_CHROOT) {
2611                 if (!needs_mount_ns && context->root_directory)
2612                         if (chroot(context->root_directory) < 0) {
2613                                 *exit_status = EXIT_CHROOT;
2614                                 return -errno;
2615                         }
2616         }
2617
2618         return 0;
2619 }
2620
2621 static int setup_keyring(
2622                 const Unit *u,
2623                 const ExecContext *context,
2624                 const ExecParameters *p,
2625                 uid_t uid, gid_t gid) {
2626
2627         key_serial_t keyring;
2628         int r = 0;
2629         uid_t saved_uid;
2630         gid_t saved_gid;
2631
2632         assert(u);
2633         assert(context);
2634         assert(p);
2635
2636         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2637          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2638          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2639          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2640          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2641          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2642
2643         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2644                 return 0;
2645
2646         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2647          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2648          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2649          * & group is just as nasty as acquiring a reference to the user keyring. */
2650
2651         saved_uid = getuid();
2652         saved_gid = getgid();
2653
2654         if (gid_is_valid(gid) && gid != saved_gid) {
2655                 if (setregid(gid, -1) < 0)
2656                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2657         }
2658
2659         if (uid_is_valid(uid) && uid != saved_uid) {
2660                 if (setreuid(uid, -1) < 0) {
2661                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2662                         goto out;
2663                 }
2664         }
2665
2666         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2667         if (keyring == -1) {
2668                 if (errno == ENOSYS)
2669                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2670                 else if (IN_SET(errno, EACCES, EPERM))
2671                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2672                 else if (errno == EDQUOT)
2673                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2674                 else
2675                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2676
2677                 goto out;
2678         }
2679
2680         /* When requested link the user keyring into the session keyring. */
2681         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2682
2683                 if (keyctl(KEYCTL_LINK,
2684                            KEY_SPEC_USER_KEYRING,
2685                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2686                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2687                         goto out;
2688                 }
2689         }
2690
2691         /* Restore uid/gid back */
2692         if (uid_is_valid(uid) && uid != saved_uid) {
2693                 if (setreuid(saved_uid, -1) < 0) {
2694                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2695                         goto out;
2696                 }
2697         }
2698
2699         if (gid_is_valid(gid) && gid != saved_gid) {
2700                 if (setregid(saved_gid, -1) < 0)
2701                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2702         }
2703
2704         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2705         if (!sd_id128_is_null(u->invocation_id)) {
2706                 key_serial_t key;
2707
2708                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2709                 if (key == -1)
2710                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2711                 else {
2712                         if (keyctl(KEYCTL_SETPERM, key,
2713                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2714                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2715                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2716                 }
2717         }
2718
2719 out:
2720         /* Revert back uid & gid for the the last time, and exit */
2721         /* no extra logging, as only the first already reported error matters */
2722         if (getuid() != saved_uid)
2723                 (void) setreuid(saved_uid, -1);
2724
2725         if (getgid() != saved_gid)
2726                 (void) setregid(saved_gid, -1);
2727
2728         return r;
2729 }
2730
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t i;

        assert(array);
        assert(n);
        assert(pair);

        /* Append each non-negative fd of the pair to array[], advancing the element counter *n for every
         * entry stored. Negative entries are skipped. */

        for (i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
2741
2742 static int close_remaining_fds(
2743                 const ExecParameters *params,
2744                 const ExecRuntime *runtime,
2745                 const DynamicCreds *dcreds,
2746                 int user_lookup_fd,
2747                 int socket_fd,
2748                 int exec_fd,
2749                 int *fds, size_t n_fds) {
2750
2751         size_t n_dont_close = 0;
2752         int dont_close[n_fds + 12];
2753
2754         assert(params);
2755
2756         if (params->stdin_fd >= 0)
2757                 dont_close[n_dont_close++] = params->stdin_fd;
2758         if (params->stdout_fd >= 0)
2759                 dont_close[n_dont_close++] = params->stdout_fd;
2760         if (params->stderr_fd >= 0)
2761                 dont_close[n_dont_close++] = params->stderr_fd;
2762
2763         if (socket_fd >= 0)
2764                 dont_close[n_dont_close++] = socket_fd;
2765         if (exec_fd >= 0)
2766                 dont_close[n_dont_close++] = exec_fd;
2767         if (n_fds > 0) {
2768                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2769                 n_dont_close += n_fds;
2770         }
2771
2772         if (runtime)
2773                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2774
2775         if (dcreds) {
2776                 if (dcreds->user)
2777                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2778                 if (dcreds->group)
2779                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2780         }
2781
2782         if (user_lookup_fd >= 0)
2783                 dont_close[n_dont_close++] = user_lookup_fd;
2784
2785         return close_all_fds(dont_close, n_dont_close);
2786 }
2787
2788 static int send_user_lookup(
2789                 Unit *unit,
2790                 int user_lookup_fd,
2791                 uid_t uid,
2792                 gid_t gid) {
2793
2794         assert(unit);
2795
2796         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2797          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2798          * specified. */
2799
2800         if (user_lookup_fd < 0)
2801                 return 0;
2802
2803         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2804                 return 0;
2805
2806         if (writev(user_lookup_fd,
2807                (struct iovec[]) {
2808                            IOVEC_INIT(&uid, sizeof(uid)),
2809                            IOVEC_INIT(&gid, sizeof(gid)),
2810                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2811                 return -errno;
2812
2813         return 0;
2814 }
2815
2816 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2817         int r;
2818
2819         assert(c);
2820         assert(home);
2821         assert(buf);
2822
2823         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2824
2825         if (*home)
2826                 return 0;
2827
2828         if (!c->working_directory_home)
2829                 return 0;
2830
2831         r = get_home_dir(buf);
2832         if (r < 0)
2833                 return r;
2834
2835         *home = *buf;
2836         return 1;
2837 }
2838
2839 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2840         _cleanup_strv_free_ char ** list = NULL;
2841         ExecDirectoryType t;
2842         int r;
2843
2844         assert(c);
2845         assert(p);
2846         assert(ret);
2847
2848         assert(c->dynamic_user);
2849
2850         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2851          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2852          * directories. */
2853
2854         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2855                 char **i;
2856
2857                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2858                         continue;
2859
2860                 if (!p->prefix[t])
2861                         continue;
2862
2863                 STRV_FOREACH(i, c->directories[t].paths) {
2864                         char *e;
2865
2866                         if (exec_directory_is_private(c, t))
2867                                 e = path_join(p->prefix[t], "private", *i);
2868                         else
2869                                 e = path_join(p->prefix[t], *i);
2870                         if (!e)
2871                                 return -ENOMEM;
2872
2873                         r = strv_consume(&list, e);
2874                         if (r < 0)
2875                                 return r;
2876                 }
2877         }
2878
2879         *ret = TAKE_PTR(list);
2880
2881         return 0;
2882 }
2883
2884 static char *exec_command_line(char **argv);
2885
2886 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2887         bool using_subcgroup;
2888         char *p;
2889
2890         assert(params);
2891         assert(ret);
2892
2893         if (!params->cgroup_path)
2894                 return -EINVAL;
2895
2896         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2897          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2898          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2899          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2900          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2901          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2902          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2903          * flag, which is only passed for the former statements, not for the latter. */
2904
2905         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2906         if (using_subcgroup)
2907                 p = path_join(params->cgroup_path, ".control");
2908         else
2909                 p = strdup(params->cgroup_path);
2910         if (!p)
2911                 return -ENOMEM;
2912
2913         *ret = p;
2914         return using_subcgroup;
2915 }
2916
2917 static int exec_child(
2918                 Unit *unit,
2919                 const ExecCommand *command,
2920                 const ExecContext *context,
2921                 const ExecParameters *params,
2922                 ExecRuntime *runtime,
2923                 DynamicCreds *dcreds,
2924                 int socket_fd,
2925                 const int named_iofds[static 3],
2926                 int *fds,
2927                 size_t n_socket_fds,
2928                 size_t n_storage_fds,
2929                 char **files_env,
2930                 int user_lookup_fd,
2931                 int *exit_status) {
2932
2933         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2934         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2935         _cleanup_free_ gid_t *supplementary_gids = NULL;
2936         const char *username = NULL, *groupname = NULL;
2937         _cleanup_free_ char *home_buffer = NULL;
2938         const char *home = NULL, *shell = NULL;
2939         char **final_argv = NULL;
2940         dev_t journal_stream_dev = 0;
2941         ino_t journal_stream_ino = 0;
2942         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2943                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2944                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2945                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2946 #if HAVE_SELINUX
2947         _cleanup_free_ char *mac_selinux_context_net = NULL;
2948         bool use_selinux = false;
2949 #endif
2950 #if ENABLE_SMACK
2951         bool use_smack = false;
2952 #endif
2953 #if HAVE_APPARMOR
2954         bool use_apparmor = false;
2955 #endif
2956         uid_t uid = UID_INVALID;
2957         gid_t gid = GID_INVALID;
2958         size_t n_fds;
2959         ExecDirectoryType dt;
2960         int secure_bits;
2961
2962         assert(unit);
2963         assert(command);
2964         assert(context);
2965         assert(params);
2966         assert(exit_status);
2967
2968         rename_process_from_path(command->path);
2969
2970         /* We reset exactly these signals, since they are the
2971          * only ones we set to SIG_IGN in the main daemon. All
2972          * others we leave untouched because we set them to
2973          * SIG_DFL or a valid handler initially, both of which
2974          * will be demoted to SIG_DFL. */
2975         (void) default_signals(SIGNALS_CRASH_HANDLER,
2976                                SIGNALS_IGNORE, -1);
2977
2978         if (context->ignore_sigpipe)
2979                 (void) ignore_signals(SIGPIPE, -1);
2980
2981         r = reset_signal_mask();
2982         if (r < 0) {
2983                 *exit_status = EXIT_SIGNAL_MASK;
2984                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2985         }
2986
2987         if (params->idle_pipe)
2988                 do_idle_pipe_dance(params->idle_pipe);
2989
2990         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2991          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2992          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2993          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2994
2995         log_forget_fds();
2996         log_set_open_when_needed(true);
2997
2998         /* In case anything used libc syslog(), close this here, too */
2999         closelog();
3000
3001         n_fds = n_socket_fds + n_storage_fds;
3002         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3003         if (r < 0) {
3004                 *exit_status = EXIT_FDS;
3005                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3006         }
3007
3008         if (!context->same_pgrp)
3009                 if (setsid() < 0) {
3010                         *exit_status = EXIT_SETSID;
3011                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3012                 }
3013
3014         exec_context_tty_reset(context, params);
3015
3016         if (unit_shall_confirm_spawn(unit)) {
3017                 const char *vc = params->confirm_spawn;
3018                 _cleanup_free_ char *cmdline = NULL;
3019
3020                 cmdline = exec_command_line(command->argv);
3021                 if (!cmdline) {
3022                         *exit_status = EXIT_MEMORY;
3023                         return log_oom();
3024                 }
3025
3026                 r = ask_for_confirmation(vc, unit, cmdline);
3027                 if (r != CONFIRM_EXECUTE) {
3028                         if (r == CONFIRM_PRETEND_SUCCESS) {
3029                                 *exit_status = EXIT_SUCCESS;
3030                                 return 0;
3031                         }
3032                         *exit_status = EXIT_CONFIRM;
3033                         log_unit_error(unit, "Execution cancelled by the user");
3034                         return -ECANCELED;
3035                 }
3036         }
3037
3038         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3039          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3040          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3041          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3042          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3043         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3044             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3045                 *exit_status = EXIT_MEMORY;
3046                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3047         }
3048
3049         if (context->dynamic_user && dcreds) {
3050                 _cleanup_strv_free_ char **suggested_paths = NULL;
3051
3052                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3053                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3054                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3055                         *exit_status = EXIT_USER;
3056                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3057                 }
3058
3059                 r = compile_suggested_paths(context, params, &suggested_paths);
3060                 if (r < 0) {
3061                         *exit_status = EXIT_MEMORY;
3062                         return log_oom();
3063                 }
3064
3065                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3066                 if (r < 0) {
3067                         *exit_status = EXIT_USER;
3068                         if (r == -EILSEQ) {
3069                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3070                                 return -EOPNOTSUPP;
3071                         }
3072                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3073                 }
3074
3075                 if (!uid_is_valid(uid)) {
3076                         *exit_status = EXIT_USER;
3077                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3078                         return -ESRCH;
3079                 }
3080
3081                 if (!gid_is_valid(gid)) {
3082                         *exit_status = EXIT_USER;
3083                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3084                         return -ESRCH;
3085                 }
3086
3087                 if (dcreds->user)
3088                         username = dcreds->user->name;
3089
3090         } else {
3091                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3092                 if (r < 0) {
3093                         *exit_status = EXIT_USER;
3094                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3095                 }
3096
3097                 r = get_fixed_group(context, &groupname, &gid);
3098                 if (r < 0) {
3099                         *exit_status = EXIT_GROUP;
3100                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3101                 }
3102         }
3103
3104         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3105         r = get_supplementary_groups(context, username, groupname, gid,
3106                                      &supplementary_gids, &ngids);
3107         if (r < 0) {
3108                 *exit_status = EXIT_GROUP;
3109                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3110         }
3111
3112         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3113         if (r < 0) {
3114                 *exit_status = EXIT_USER;
3115                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3116         }
3117
3118         user_lookup_fd = safe_close(user_lookup_fd);
3119
3120         r = acquire_home(context, uid, &home, &home_buffer);
3121         if (r < 0) {
3122                 *exit_status = EXIT_CHDIR;
3123                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3124         }
3125
3126         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3127          * must sure to drop O_NONBLOCK */
3128         if (socket_fd >= 0)
3129                 (void) fd_nonblock(socket_fd, false);
3130
3131         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3132          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3133         if (params->cgroup_path) {
3134                 _cleanup_free_ char *p = NULL;
3135
3136                 r = exec_parameters_get_cgroup_path(params, &p);
3137                 if (r < 0) {
3138                         *exit_status = EXIT_CGROUP;
3139                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3140                 }
3141
3142                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3143                 if (r < 0) {
3144                         *exit_status = EXIT_CGROUP;
3145                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3146                 }
3147         }
3148
3149         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3150                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3151                 if (r < 0) {
3152                         *exit_status = EXIT_NETWORK;
3153                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3154                 }
3155         }
3156
3157         r = setup_input(context, params, socket_fd, named_iofds);
3158         if (r < 0) {
3159                 *exit_status = EXIT_STDIN;
3160                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3161         }
3162
3163         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3164         if (r < 0) {
3165                 *exit_status = EXIT_STDOUT;
3166                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3167         }
3168
3169         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3170         if (r < 0) {
3171                 *exit_status = EXIT_STDERR;
3172                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3173         }
3174
3175         if (context->oom_score_adjust_set) {
3176                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3177                  * prohibit write access to this file, and we shouldn't trip up over that. */
3178                 r = set_oom_score_adjust(context->oom_score_adjust);
3179                 if (IN_SET(r, -EPERM, -EACCES))
3180                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3181                 else if (r < 0) {
3182                         *exit_status = EXIT_OOM_ADJUST;
3183                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3184                 }
3185         }
3186
3187         if (context->nice_set)
3188                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3189                         *exit_status = EXIT_NICE;
3190                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3191                 }
3192
3193         if (context->cpu_sched_set) {
3194                 struct sched_param param = {
3195                         .sched_priority = context->cpu_sched_priority,
3196                 };
3197
3198                 r = sched_setscheduler(0,
3199                                        context->cpu_sched_policy |
3200                                        (context->cpu_sched_reset_on_fork ?
3201                                         SCHED_RESET_ON_FORK : 0),
3202                                        &param);
3203                 if (r < 0) {
3204                         *exit_status = EXIT_SETSCHEDULER;
3205                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3206                 }
3207         }
3208
3209         if (context->cpu_set.set)
3210                 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3211                         *exit_status = EXIT_CPUAFFINITY;
3212                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3213                 }
3214
3215         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3216                 r = apply_numa_policy(&context->numa_policy);
3217                 if (r == -EOPNOTSUPP)
3218                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3219                 else if (r < 0) {
3220                         *exit_status = EXIT_NUMA_POLICY;
3221                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3222                 }
3223         }
3224
3225         if (context->ioprio_set)
3226                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3227                         *exit_status = EXIT_IOPRIO;
3228                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3229                 }
3230
3231         if (context->timer_slack_nsec != NSEC_INFINITY)
3232                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3233                         *exit_status = EXIT_TIMERSLACK;
3234                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3235                 }
3236
3237         if (context->personality != PERSONALITY_INVALID) {
3238                 r = safe_personality(context->personality);
3239                 if (r < 0) {
3240                         *exit_status = EXIT_PERSONALITY;
3241                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3242                 }
3243         }
3244
3245         if (context->utmp_id)
3246                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3247                                       context->tty_path,
3248                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3249                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3250                                       USER_PROCESS,
3251                                       username);
3252
3253         if (uid_is_valid(uid)) {
3254                 r = chown_terminal(STDIN_FILENO, uid);
3255                 if (r < 0) {
3256                         *exit_status = EXIT_STDIN;
3257                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3258                 }
3259         }
3260
3261         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3262          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3263          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3264          * touch a single hierarchy too. */
3265         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3266                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3267                 if (r < 0) {
3268                         *exit_status = EXIT_CGROUP;
3269                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3270                 }
3271         }
3272
3273         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3274                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3275                 if (r < 0)
3276                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3277         }
3278
3279         r = build_environment(
3280                         unit,
3281                         context,
3282                         params,
3283                         n_fds,
3284                         home,
3285                         username,
3286                         shell,
3287                         journal_stream_dev,
3288                         journal_stream_ino,
3289                         &our_env);
3290         if (r < 0) {
3291                 *exit_status = EXIT_MEMORY;
3292                 return log_oom();
3293         }
3294
3295         r = build_pass_environment(context, &pass_env);
3296         if (r < 0) {
3297                 *exit_status = EXIT_MEMORY;
3298                 return log_oom();
3299         }
3300
3301         accum_env = strv_env_merge(5,
3302                                    params->environment,
3303                                    our_env,
3304                                    pass_env,
3305                                    context->environment,
3306                                    files_env,
3307                                    NULL);
3308         if (!accum_env) {
3309                 *exit_status = EXIT_MEMORY;
3310                 return log_oom();
3311         }
3312         accum_env = strv_env_clean(accum_env);
3313
3314         (void) umask(context->umask);
3315
3316         r = setup_keyring(unit, context, params, uid, gid);
3317         if (r < 0) {
3318                 *exit_status = EXIT_KEYRING;
3319                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3320         }
3321
3322         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3323         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3324
3325         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3326         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3327
3328         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3329         if (needs_ambient_hack)
3330                 needs_setuid = false;
3331         else
3332                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3333
3334         if (needs_sandboxing) {
3335                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3336                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3337                  * impacting our own code paths. */
3338
3339 #if HAVE_SELINUX
3340                 use_selinux = mac_selinux_use();
3341 #endif
3342 #if ENABLE_SMACK
3343                 use_smack = mac_smack_use();
3344 #endif
3345 #if HAVE_APPARMOR
3346                 use_apparmor = mac_apparmor_use();
3347 #endif
3348         }
3349
3350         if (needs_sandboxing) {
3351                 int which_failed;
3352
3353                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3354                  * is set here. (See below.) */
3355
3356                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3357                 if (r < 0) {
3358                         *exit_status = EXIT_LIMITS;
3359                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3360                 }
3361         }
3362
3363         if (needs_setuid) {
3364
3365                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3366                  * wins here. (See above.) */
3367
3368                 if (context->pam_name && username) {
3369                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3370                         if (r < 0) {
3371                                 *exit_status = EXIT_PAM;
3372                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3373                         }
3374                 }
3375         }
3376
3377         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3378
3379                 if (ns_type_supported(NAMESPACE_NET)) {
3380                         r = setup_netns(runtime->netns_storage_socket);
3381                         if (r < 0) {
3382                                 *exit_status = EXIT_NETWORK;
3383                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3384                         }
3385                 } else if (context->network_namespace_path) {
3386                         *exit_status = EXIT_NETWORK;
3387                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3388                 } else
3389                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3390         }
3391
3392         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3393         if (needs_mount_namespace) {
3394                 _cleanup_free_ char *error_path = NULL;
3395
3396                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3397                 if (r < 0) {
3398                         *exit_status = EXIT_NAMESPACE;
3399                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3400                                                     error_path ? ": " : "", strempty(error_path));
3401                 }
3402         }
3403
3404         if (context->protect_hostname) {
3405                 if (ns_type_supported(NAMESPACE_UTS)) {
3406                         if (unshare(CLONE_NEWUTS) < 0) {
3407                                 *exit_status = EXIT_NAMESPACE;
3408                                 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3409                         }
3410                 } else
3411                         log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3412 #if HAVE_SECCOMP
3413                 r = seccomp_protect_hostname();
3414                 if (r < 0) {
3415                         *exit_status = EXIT_SECCOMP;
3416                         return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3417                 }
3418 #endif
3419         }
3420
3421         /* Drop groups as early as possible */
3422         if (needs_setuid) {
3423                 r = enforce_groups(gid, supplementary_gids, ngids);
3424                 if (r < 0) {
3425                         *exit_status = EXIT_GROUP;
3426                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3427                 }
3428         }
3429
3430         if (needs_sandboxing) {
3431 #if HAVE_SELINUX
3432                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3433                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3434                         if (r < 0) {
3435                                 *exit_status = EXIT_SELINUX_CONTEXT;
3436                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3437                         }
3438                 }
3439 #endif
3440
3441                 if (context->private_users) {
3442                         r = setup_private_users(uid, gid);
3443                         if (r < 0) {
3444                                 *exit_status = EXIT_USER;
3445                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3446                         }
3447                 }
3448         }
3449
3450         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3451          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3452          * however if we have it as we want to keep it open until the final execve(). */
3453
3454         if (params->exec_fd >= 0) {
3455                 exec_fd = params->exec_fd;
3456
3457                 if (exec_fd < 3 + (int) n_fds) {
3458                         int moved_fd;
3459
3460                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3461                          * process we are about to execute. */
3462
3463                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3464                         if (moved_fd < 0) {
3465                                 *exit_status = EXIT_FDS;
3466                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3467                         }
3468
3469                         safe_close(exec_fd);
3470                         exec_fd = moved_fd;
3471                 } else {
3472                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3473                         r = fd_cloexec(exec_fd, true);
3474                         if (r < 0) {
3475                                 *exit_status = EXIT_FDS;
3476                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3477                         }
3478                 }
3479
3480                 fds_with_exec_fd = newa(int, n_fds + 1);
3481                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3482                 fds_with_exec_fd[n_fds] = exec_fd;
3483                 n_fds_with_exec_fd = n_fds + 1;
3484         } else {
3485                 fds_with_exec_fd = fds;
3486                 n_fds_with_exec_fd = n_fds;
3487         }
3488
3489         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3490         if (r >= 0)
3491                 r = shift_fds(fds, n_fds);
3492         if (r >= 0)
3493                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3494         if (r < 0) {
3495                 *exit_status = EXIT_FDS;
3496                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3497         }
3498
3499         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3500          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3501          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3502          * came this far. */
3503
3504         secure_bits = context->secure_bits;
3505
3506         if (needs_sandboxing) {
3507                 uint64_t bset;
3508
3509                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3510                  * requested. (Note this is placed after the general resource limit initialization, see
3511                  * above, in order to take precedence.) */
3512                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3513                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3514                                 *exit_status = EXIT_LIMITS;
3515                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3516                         }
3517                 }
3518
3519 #if ENABLE_SMACK
3520                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3521                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3522                 if (use_smack) {
3523                         r = setup_smack(context, command);
3524                         if (r < 0) {
3525                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3526                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3527                         }
3528                 }
3529 #endif
3530
3531                 bset = context->capability_bounding_set;
3532                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3533                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3534                  * instead of us doing that */
3535                 if (needs_ambient_hack)
3536                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3537                                 (UINT64_C(1) << CAP_SETUID) |
3538                                 (UINT64_C(1) << CAP_SETGID);
3539
3540                 if (!cap_test_all(bset)) {
3541                         r = capability_bounding_set_drop(bset, false);
3542                         if (r < 0) {
3543                                 *exit_status = EXIT_CAPABILITIES;
3544                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3545                         }
3546                 }
3547
3548                 /* This is done before enforce_user, but ambient set
3549                  * does not survive over setresuid() if keep_caps is not set. */
3550                 if (!needs_ambient_hack &&
3551                     context->capability_ambient_set != 0) {
3552                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3553                         if (r < 0) {
3554                                 *exit_status = EXIT_CAPABILITIES;
3555                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3556                         }
3557                 }
3558         }
3559
3560         /* chroot to root directory first, before we lose the ability to chroot */
3561         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3562         if (r < 0)
3563                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3564
3565         if (needs_setuid) {
3566                 if (uid_is_valid(uid)) {
3567                         r = enforce_user(context, uid);
3568                         if (r < 0) {
3569                                 *exit_status = EXIT_USER;
3570                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3571                         }
3572
3573                         if (!needs_ambient_hack &&
3574                             context->capability_ambient_set != 0) {
3575
3576                                 /* Fix the ambient capabilities after user change. */
3577                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3578                                 if (r < 0) {
3579                                         *exit_status = EXIT_CAPABILITIES;
3580                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3581                                 }
3582
3583                                 /* If we were asked to change user and ambient capabilities
3584                                  * were requested, we had to add keep-caps to the securebits
3585                                  * so that we would maintain the inherited capability set
3586                                  * through the setresuid(). Make sure that the bit is added
3587                                  * also to the context secure_bits so that we don't try to
3588                                  * drop the bit away next. */
3589
3590                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3591                         }
3592                 }
3593         }
3594
3595         /* Apply working directory here, because the working directory might be on NFS and only the user running
3596          * this service might have the correct privilege to change to the working directory */
3597         r = apply_working_directory(context, params, home, exit_status);
3598         if (r < 0)
3599                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3600
3601         if (needs_sandboxing) {
3602                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3603                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3604                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3605                  * are restricted. */
3606
3607 #if HAVE_SELINUX
3608                 if (use_selinux) {
3609                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3610
3611                         if (exec_context) {
3612                                 r = setexeccon(exec_context);
3613                                 if (r < 0) {
3614                                         *exit_status = EXIT_SELINUX_CONTEXT;
3615                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3616                                 }
3617                         }
3618                 }
3619 #endif
3620
3621 #if HAVE_APPARMOR
3622                 if (use_apparmor && context->apparmor_profile) {
3623                         r = aa_change_onexec(context->apparmor_profile);
3624                         if (r < 0 && !context->apparmor_profile_ignore) {
3625                                 *exit_status = EXIT_APPARMOR_PROFILE;
3626                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3627                         }
3628                 }
3629 #endif
3630
3631                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3632                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3633                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3634                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3635                                 *exit_status = EXIT_SECUREBITS;
3636                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3637                         }
3638
3639                 if (context_has_no_new_privileges(context))
3640                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3641                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3642                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3643                         }
3644
3645 #if HAVE_SECCOMP
3646                 r = apply_address_families(unit, context);
3647                 if (r < 0) {
3648                         *exit_status = EXIT_ADDRESS_FAMILIES;
3649                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3650                 }
3651
3652                 r = apply_memory_deny_write_execute(unit, context);
3653                 if (r < 0) {
3654                         *exit_status = EXIT_SECCOMP;
3655                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3656                 }
3657
3658                 r = apply_restrict_realtime(unit, context);
3659                 if (r < 0) {
3660                         *exit_status = EXIT_SECCOMP;
3661                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3662                 }
3663
3664                 r = apply_restrict_suid_sgid(unit, context);
3665                 if (r < 0) {
3666                         *exit_status = EXIT_SECCOMP;
3667                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3668                 }
3669
3670                 r = apply_restrict_namespaces(unit, context);
3671                 if (r < 0) {
3672                         *exit_status = EXIT_SECCOMP;
3673                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3674                 }
3675
3676                 r = apply_protect_sysctl(unit, context);
3677                 if (r < 0) {
3678                         *exit_status = EXIT_SECCOMP;
3679                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3680                 }
3681
3682                 r = apply_protect_kernel_modules(unit, context);
3683                 if (r < 0) {
3684                         *exit_status = EXIT_SECCOMP;
3685                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3686                 }
3687
3688                 r = apply_private_devices(unit, context);
3689                 if (r < 0) {
3690                         *exit_status = EXIT_SECCOMP;
3691                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3692                 }
3693
3694                 r = apply_syscall_archs(unit, context);
3695                 if (r < 0) {
3696                         *exit_status = EXIT_SECCOMP;
3697                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3698                 }
3699
3700                 r = apply_lock_personality(unit, context);
3701                 if (r < 0) {
3702                         *exit_status = EXIT_SECCOMP;
3703                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3704                 }
3705
3706                 /* This really should remain the last step before the execve(), to make sure our own code is affected
3707                  * by the filter as little as possible. */
3708                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3709                 if (r < 0) {
3710                         *exit_status = EXIT_SECCOMP;
3711                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3712                 }
3713 #endif
3714         }
3715
3716         if (!strv_isempty(context->unset_environment)) {
3717                 char **ee = NULL;
3718
3719                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3720                 if (!ee) {
3721                         *exit_status = EXIT_MEMORY;
3722                         return log_oom();
3723                 }
3724
3725                 strv_free_and_replace(accum_env, ee);
3726         }
3727
3728         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3729                 replaced_argv = replace_env_argv(command->argv, accum_env);
3730                 if (!replaced_argv) {
3731                         *exit_status = EXIT_MEMORY;
3732                         return log_oom();
3733                 }
3734                 final_argv = replaced_argv;
3735         } else
3736                 final_argv = command->argv;
3737
3738         if (DEBUG_LOGGING) {
3739                 _cleanup_free_ char *line;
3740
3741                 line = exec_command_line(final_argv);
3742                 if (line)
3743                         log_struct(LOG_DEBUG,
3744                                    "EXECUTABLE=%s", command->path,
3745                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3746                                    LOG_UNIT_ID(unit),
3747                                    LOG_UNIT_INVOCATION_ID(unit));
3748         }
3749
3750         if (exec_fd >= 0) {
3751                 uint8_t hot = 1;
3752
3753                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3754                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3755
3756                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3757                         *exit_status = EXIT_EXEC;
3758                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3759                 }
3760         }
3761
3762         execve(command->path, final_argv, accum_env);
3763         r = -errno;
3764
3765         if (exec_fd >= 0) {
3766                 uint8_t hot = 0;
3767
3768                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3769                  * that POLLHUP on it no longer means execve() succeeded. */
3770
3771                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3772                         *exit_status = EXIT_EXEC;
3773                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3774                 }
3775         }
3776
3777         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3778                 log_struct_errno(LOG_INFO, r,
3779                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3780                                  LOG_UNIT_ID(unit),
3781                                  LOG_UNIT_INVOCATION_ID(unit),
3782                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3783                                                   command->path),
3784                                  "EXECUTABLE=%s", command->path);
3785                 return 0;
3786         }
3787
3788         *exit_status = EXIT_EXEC;
3789         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3790 }
3791
3792 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); /* Forward declaration: exec_spawn() below calls this with &files_env before the definition appears. */
3793 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); /* Forward declaration: exec_spawn() below calls this with a 3-element array pre-filled with -1. */
3794
3795 int exec_spawn(Unit *unit,
3796                ExecCommand *command,
3797                const ExecContext *context,
3798                const ExecParameters *params,
3799                ExecRuntime *runtime,
3800                DynamicCreds *dcreds,
3801                pid_t *ret) {
3802
3803         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3804         _cleanup_free_ char *subcgroup_path = NULL;
3805         _cleanup_strv_free_ char **files_env = NULL;
3806         size_t n_storage_fds = 0, n_socket_fds = 0;
3807         _cleanup_free_ char *line = NULL;
3808         pid_t pid;
3809
3810         assert(unit);
3811         assert(command);
3812         assert(context);
3813         assert(ret);
3814         assert(params);
3815         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3816
3817         if (context->std_input == EXEC_INPUT_SOCKET ||
3818             context->std_output == EXEC_OUTPUT_SOCKET ||
3819             context->std_error == EXEC_OUTPUT_SOCKET) {
3820
3821                 if (params->n_socket_fds > 1) {
3822                         log_unit_error(unit, "Got more than one socket.");
3823                         return -EINVAL;
3824                 }
3825
3826                 if (params->n_socket_fds == 0) {
3827                         log_unit_error(unit, "Got no socket.");
3828                         return -EINVAL;
3829                 }
3830
3831                 socket_fd = params->fds[0];
3832         } else {
3833                 socket_fd = -1;
3834                 fds = params->fds;
3835                 n_socket_fds = params->n_socket_fds;
3836                 n_storage_fds = params->n_storage_fds;
3837         }
3838
3839         r = exec_context_named_iofds(context, params, named_iofds);
3840         if (r < 0)
3841                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3842
3843         r = exec_context_load_environment(unit, context, &files_env);
3844         if (r < 0)
3845                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3846
3847         line = exec_command_line(command->argv);
3848         if (!line)
3849                 return log_oom();
3850
3851         log_struct(LOG_DEBUG,
3852                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3853                    "EXECUTABLE=%s", command->path,
3854                    LOG_UNIT_ID(unit),
3855                    LOG_UNIT_INVOCATION_ID(unit));
3856
3857         if (params->cgroup_path) {
3858                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3859                 if (r < 0)
3860                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3861                 if (r > 0) { /* We are using a child cgroup */
3862                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3863                         if (r < 0)
3864                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3865                 }
3866         }
3867
3868         pid = fork();
3869         if (pid < 0)
3870                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3871
3872         if (pid == 0) {
3873                 int exit_status = EXIT_SUCCESS;
3874
3875                 r = exec_child(unit,
3876                                command,
3877                                context,
3878                                params,
3879                                runtime,
3880                                dcreds,
3881                                socket_fd,
3882                                named_iofds,
3883                                fds,
3884                                n_socket_fds,
3885                                n_storage_fds,
3886                                files_env,
3887                                unit->manager->user_lookup_fds[1],
3888                                &exit_status);
3889
3890                 if (r < 0) {
3891                         const char *status =
3892                                 exit_status_to_string(exit_status,
3893                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
3894
3895                         log_struct_errno(LOG_ERR, r,
3896                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3897                                          LOG_UNIT_ID(unit),
3898                                          LOG_UNIT_INVOCATION_ID(unit),
3899                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3900                                                           status, command->path),
3901                                          "EXECUTABLE=%s", command->path);
3902                 }
3903
3904                 _exit(exit_status);
3905         }
3906
3907         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3908
3909         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3910          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3911          * process will be killed too). */
3912         if (subcgroup_path)
3913                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3914
3915         exec_status_start(&command->exec_status, pid);
3916
3917         *ret = pid;
3918         return 0;
3919 }
3920
3921 void exec_context_init(ExecContext *c) {
3922         ExecDirectoryType i;
3923
3924         assert(c);
3925
3926         c->umask = 0022;
3927         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3928         c->cpu_sched_policy = SCHED_OTHER;
3929         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3930         c->syslog_level_prefix = true;
3931         c->ignore_sigpipe = true;
3932         c->timer_slack_nsec = NSEC_INFINITY;
3933         c->personality = PERSONALITY_INVALID;
3934         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3935                 c->directories[i].mode = 0755;
3936         c->timeout_clean_usec = USEC_INFINITY;
3937         c->capability_bounding_set = CAP_ALL;
3938         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3939         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3940         c->log_level_max = -1;
3941         numa_policy_reset(&c->numa_policy);
3942 }
3943
3944 void exec_context_done(ExecContext *c) {
3945         ExecDirectoryType i;
3946         size_t l;
3947
3948         assert(c);
3949
3950         c->environment = strv_free(c->environment);
3951         c->environment_files = strv_free(c->environment_files);
3952         c->pass_environment = strv_free(c->pass_environment);
3953         c->unset_environment = strv_free(c->unset_environment);
3954
3955         rlimit_free_all(c->rlimit);
3956
3957         for (l = 0; l < 3; l++) {
3958                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3959                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3960         }
3961
3962         c->working_directory = mfree(c->working_directory);
3963         c->root_directory = mfree(c->root_directory);
3964         c->root_image = mfree(c->root_image);
3965         c->tty_path = mfree(c->tty_path);
3966         c->syslog_identifier = mfree(c->syslog_identifier);
3967         c->user = mfree(c->user);
3968         c->group = mfree(c->group);
3969
3970         c->supplementary_groups = strv_free(c->supplementary_groups);
3971
3972         c->pam_name = mfree(c->pam_name);
3973
3974         c->read_only_paths = strv_free(c->read_only_paths);
3975         c->read_write_paths = strv_free(c->read_write_paths);
3976         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3977
3978         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3979         c->bind_mounts = NULL;
3980         c->n_bind_mounts = 0;
3981         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3982         c->temporary_filesystems = NULL;
3983         c->n_temporary_filesystems = 0;
3984
3985         cpu_set_reset(&c->cpu_set);
3986         numa_policy_reset(&c->numa_policy);
3987
3988         c->utmp_id = mfree(c->utmp_id);
3989         c->selinux_context = mfree(c->selinux_context);
3990         c->apparmor_profile = mfree(c->apparmor_profile);
3991         c->smack_process_label = mfree(c->smack_process_label);
3992
3993         c->syscall_filter = hashmap_free(c->syscall_filter);
3994         c->syscall_archs = set_free(c->syscall_archs);
3995         c->address_families = set_free(c->address_families);
3996
3997         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3998                 c->directories[i].paths = strv_free(c->directories[i].paths);
3999
4000         c->log_level_max = -1;
4001
4002         exec_context_free_log_extra_fields(c);
4003
4004         c->log_ratelimit_interval_usec = 0;
4005         c->log_ratelimit_burst = 0;
4006
4007         c->stdin_data = mfree(c->stdin_data);
4008         c->stdin_data_size = 0;
4009
4010         c->network_namespace_path = mfree(c->network_namespace_path);
4011 }
4012
4013 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4014         char **i;
4015
4016         assert(c);
4017
4018         if (!runtime_prefix)
4019                 return 0;
4020
4021         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4022                 _cleanup_free_ char *p;
4023
4024                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4025                         p = path_join(runtime_prefix, "private", *i);
4026                 else
4027                         p = path_join(runtime_prefix, *i);
4028                 if (!p)
4029                         return -ENOMEM;
4030
4031                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4032                  * service next. */
4033                 (void) rm_rf(p, REMOVE_ROOT);
4034         }
4035
4036         return 0;
4037 }
4038
4039 static void exec_command_done(ExecCommand *c) {
4040         assert(c);
4041
4042         c->path = mfree(c->path);
4043         c->argv = strv_free(c->argv);
4044 }
4045
4046 void exec_command_done_array(ExecCommand *c, size_t n) {
4047         size_t i;
4048
4049         for (i = 0; i < n; i++)
4050                 exec_command_done(c+i);
4051 }
4052
4053 ExecCommand* exec_command_free_list(ExecCommand *c) {
4054         ExecCommand *i;
4055
4056         while ((i = c)) {
4057                 LIST_REMOVE(command, c, i);
4058                 exec_command_done(i);
4059                 free(i);
4060         }
4061
4062         return NULL;
4063 }
4064
4065 void exec_command_free_array(ExecCommand **c, size_t n) {
4066         size_t i;
4067
4068         for (i = 0; i < n; i++)
4069                 c[i] = exec_command_free_list(c[i]);
4070 }
4071
4072 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4073         size_t i;
4074
4075         for (i = 0; i < n; i++)
4076                 exec_status_reset(&c[i].exec_status);
4077 }
4078
4079 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4080         size_t i;
4081
4082         for (i = 0; i < n; i++) {
4083                 ExecCommand *z;
4084
4085                 LIST_FOREACH(command, z, c[i])
4086                         exec_status_reset(&z->exec_status);
4087         }
4088 }
4089
4090 typedef struct InvalidEnvInfo {
4091         const Unit *unit;
4092         const char *path;
4093 } InvalidEnvInfo;
4094
4095 static void invalid_env(const char *p, void *userdata) {
4096         InvalidEnvInfo *info = userdata;
4097
4098         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4099 }
4100
4101 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4102         assert(c);
4103
4104         switch (fd_index) {
4105
4106         case STDIN_FILENO:
4107                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4108                         return NULL;
4109
4110                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4111
4112         case STDOUT_FILENO:
4113                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4114                         return NULL;
4115
4116                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4117
4118         case STDERR_FILENO:
4119                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4120                         return NULL;
4121
4122                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4123
4124         default:
4125                 return NULL;
4126         }
4127 }
4128
4129 static int exec_context_named_iofds(
4130                 const ExecContext *c,
4131                 const ExecParameters *p,
4132                 int named_iofds[static 3]) {
4133
4134         size_t i, targets;
4135         const char* stdio_fdname[3];
4136         size_t n_fds;
4137
4138         assert(c);
4139         assert(p);
4140         assert(named_iofds);
4141
4142         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4143                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4144                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4145
4146         for (i = 0; i < 3; i++)
4147                 stdio_fdname[i] = exec_context_fdname(c, i);
4148
4149         n_fds = p->n_storage_fds + p->n_socket_fds;
4150
4151         for (i = 0; i < n_fds  && targets > 0; i++)
4152                 if (named_iofds[STDIN_FILENO] < 0 &&
4153                     c->std_input == EXEC_INPUT_NAMED_FD &&
4154                     stdio_fdname[STDIN_FILENO] &&
4155                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4156
4157                         named_iofds[STDIN_FILENO] = p->fds[i];
4158                         targets--;
4159
4160                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4161                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4162                            stdio_fdname[STDOUT_FILENO] &&
4163                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4164
4165                         named_iofds[STDOUT_FILENO] = p->fds[i];
4166                         targets--;
4167
4168                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4169                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4170                            stdio_fdname[STDERR_FILENO] &&
4171                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4172
4173                         named_iofds[STDERR_FILENO] = p->fds[i];
4174                         targets--;
4175                 }
4176
4177         return targets == 0 ? 0 : -ENOENT;
4178 }
4179
4180 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4181         char **i, **r = NULL;
4182
4183         assert(c);
4184         assert(l);
4185
4186         STRV_FOREACH(i, c->environment_files) {
4187                 char *fn;
4188                 int k;
4189                 unsigned n;
4190                 bool ignore = false;
4191                 char **p;
4192                 _cleanup_globfree_ glob_t pglob = {};
4193
4194                 fn = *i;
4195
4196                 if (fn[0] == '-') {
4197                         ignore = true;
4198                         fn++;
4199                 }
4200
4201                 if (!path_is_absolute(fn)) {
4202                         if (ignore)
4203                                 continue;
4204
4205                         strv_free(r);
4206                         return -EINVAL;
4207                 }
4208
4209                 /* Filename supports globbing, take all matching files */
4210                 k = safe_glob(fn, 0, &pglob);
4211                 if (k < 0) {
4212                         if (ignore)
4213                                 continue;
4214
4215                         strv_free(r);
4216                         return k;
4217                 }
4218
4219                 /* When we don't match anything, -ENOENT should be returned */
4220                 assert(pglob.gl_pathc > 0);
4221
4222                 for (n = 0; n < pglob.gl_pathc; n++) {
4223                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4224                         if (k < 0) {
4225                                 if (ignore)
4226                                         continue;
4227
4228                                 strv_free(r);
4229                                 return k;
4230                         }
4231                         /* Log invalid environment variables with filename */
4232                         if (p) {
4233                                 InvalidEnvInfo info = {
4234                                         .unit = unit,
4235                                         .path = pglob.gl_pathv[n]
4236                                 };
4237
4238                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4239                         }
4240
4241                         if (!r)
4242                                 r = p;
4243                         else {
4244                                 char **m;
4245
4246                                 m = strv_env_merge(2, r, p);
4247                                 strv_free(r);
4248                                 strv_free(p);
4249                                 if (!m)
4250                                         return -ENOMEM;
4251
4252                                 r = m;
4253                         }
4254                 }
4255         }
4256
4257         *l = r;
4258
4259         return 0;
4260 }
4261
4262 static bool tty_may_match_dev_console(const char *tty) {
4263         _cleanup_free_ char *resolved = NULL;
4264
4265         if (!tty)
4266                 return true;
4267
4268         tty = skip_dev_prefix(tty);
4269
4270         /* trivial identity? */
4271         if (streq(tty, "console"))
4272                 return true;
4273
4274         if (resolve_dev_console(&resolved) < 0)
4275                 return true; /* if we could not resolve, assume it may */
4276
4277         /* "tty0" means the active VC, so it may be the same sometimes */
4278         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4279 }
4280
4281 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4282         assert(ec);
4283
4284         return ec->tty_reset ||
4285                 ec->tty_vhangup ||
4286                 ec->tty_vt_disallocate ||
4287                 is_terminal_input(ec->std_input) ||
4288                 is_terminal_output(ec->std_output) ||
4289                 is_terminal_output(ec->std_error);
4290 }
4291
4292 bool exec_context_may_touch_console(const ExecContext *ec) {
4293
4294         return exec_context_may_touch_tty(ec) &&
4295                tty_may_match_dev_console(exec_context_tty_path(ec));
4296 }
4297
/* Writes every string of the NULL-terminated list l to f, each one preceded by a single space. A NULL
 * list writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        for (item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
4306
4307 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4308         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4309         ExecDirectoryType dt;
4310         unsigned i;
4311         int r;
4312
4313         assert(c);
4314         assert(f);
4315
4316         prefix = strempty(prefix);
4317
4318         fprintf(f,
4319                 "%sUMask: %04o\n"
4320                 "%sWorkingDirectory: %s\n"
4321                 "%sRootDirectory: %s\n"
4322                 "%sNonBlocking: %s\n"
4323                 "%sPrivateTmp: %s\n"
4324                 "%sPrivateDevices: %s\n"
4325                 "%sProtectKernelTunables: %s\n"
4326                 "%sProtectKernelModules: %s\n"
4327                 "%sProtectControlGroups: %s\n"
4328                 "%sPrivateNetwork: %s\n"
4329                 "%sPrivateUsers: %s\n"
4330                 "%sProtectHome: %s\n"
4331                 "%sProtectSystem: %s\n"
4332                 "%sMountAPIVFS: %s\n"
4333                 "%sIgnoreSIGPIPE: %s\n"
4334                 "%sMemoryDenyWriteExecute: %s\n"
4335                 "%sRestrictRealtime: %s\n"
4336                 "%sRestrictSUIDSGID: %s\n"
4337                 "%sKeyringMode: %s\n"
4338                 "%sProtectHostname: %s\n",
4339                 prefix, c->umask,
4340                 prefix, c->working_directory ? c->working_directory : "/",
4341                 prefix, c->root_directory ? c->root_directory : "/",
4342                 prefix, yes_no(c->non_blocking),
4343                 prefix, yes_no(c->private_tmp),
4344                 prefix, yes_no(c->private_devices),
4345                 prefix, yes_no(c->protect_kernel_tunables),
4346                 prefix, yes_no(c->protect_kernel_modules),
4347                 prefix, yes_no(c->protect_control_groups),
4348                 prefix, yes_no(c->private_network),
4349                 prefix, yes_no(c->private_users),
4350                 prefix, protect_home_to_string(c->protect_home),
4351                 prefix, protect_system_to_string(c->protect_system),
4352                 prefix, yes_no(c->mount_apivfs),
4353                 prefix, yes_no(c->ignore_sigpipe),
4354                 prefix, yes_no(c->memory_deny_write_execute),
4355                 prefix, yes_no(c->restrict_realtime),
4356                 prefix, yes_no(c->restrict_suid_sgid),
4357                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4358                 prefix, yes_no(c->protect_hostname));
4359
4360         if (c->root_image)
4361                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4362
4363         STRV_FOREACH(e, c->environment)
4364                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4365
4366         STRV_FOREACH(e, c->environment_files)
4367                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4368
4369         STRV_FOREACH(e, c->pass_environment)
4370                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4371
4372         STRV_FOREACH(e, c->unset_environment)
4373                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4374
4375         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4376
4377         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4378                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4379
4380                 STRV_FOREACH(d, c->directories[dt].paths)
4381                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4382         }
4383
4384         fprintf(f,
4385                 "%sTimeoutCleanSec: %s\n",
4386                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4387
4388         if (c->nice_set)
4389                 fprintf(f,
4390                         "%sNice: %i\n",
4391                         prefix, c->nice);
4392
4393         if (c->oom_score_adjust_set)
4394                 fprintf(f,
4395                         "%sOOMScoreAdjust: %i\n",
4396                         prefix, c->oom_score_adjust);
4397
4398         for (i = 0; i < RLIM_NLIMITS; i++)
4399                 if (c->rlimit[i]) {
4400                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4401                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4402                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4403                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4404                 }
4405
4406         if (c->ioprio_set) {
4407                 _cleanup_free_ char *class_str = NULL;
4408
4409                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4410                 if (r >= 0)
4411                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4412
4413                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4414         }
4415
4416         if (c->cpu_sched_set) {
4417                 _cleanup_free_ char *policy_str = NULL;
4418
4419                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4420                 if (r >= 0)
4421                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4422
4423                 fprintf(f,
4424                         "%sCPUSchedulingPriority: %i\n"
4425                         "%sCPUSchedulingResetOnFork: %s\n",
4426                         prefix, c->cpu_sched_priority,
4427                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4428         }
4429
4430         if (c->cpu_set.set) {
4431                 _cleanup_free_ char *affinity = NULL;
4432
4433                 affinity = cpu_set_to_range_string(&c->cpu_set);
4434                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4435         }
4436
4437         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4438                 _cleanup_free_ char *nodes = NULL;
4439
4440                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4441                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4442                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4443         }
4444
4445         if (c->timer_slack_nsec != NSEC_INFINITY)
4446                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4447
4448         fprintf(f,
4449                 "%sStandardInput: %s\n"
4450                 "%sStandardOutput: %s\n"
4451                 "%sStandardError: %s\n",
4452                 prefix, exec_input_to_string(c->std_input),
4453                 prefix, exec_output_to_string(c->std_output),
4454                 prefix, exec_output_to_string(c->std_error));
4455
4456         if (c->std_input == EXEC_INPUT_NAMED_FD)
4457                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4458         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4459                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4460         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4461                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4462
4463         if (c->std_input == EXEC_INPUT_FILE)
4464                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4465         if (c->std_output == EXEC_OUTPUT_FILE)
4466                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4467         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4468                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4469         if (c->std_error == EXEC_OUTPUT_FILE)
4470                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4471         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4472                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4473
4474         if (c->tty_path)
4475                 fprintf(f,
4476                         "%sTTYPath: %s\n"
4477                         "%sTTYReset: %s\n"
4478                         "%sTTYVHangup: %s\n"
4479                         "%sTTYVTDisallocate: %s\n",
4480                         prefix, c->tty_path,
4481                         prefix, yes_no(c->tty_reset),
4482                         prefix, yes_no(c->tty_vhangup),
4483                         prefix, yes_no(c->tty_vt_disallocate));
4484
4485         if (IN_SET(c->std_output,
4486                    EXEC_OUTPUT_SYSLOG,
4487                    EXEC_OUTPUT_KMSG,
4488                    EXEC_OUTPUT_JOURNAL,
4489                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4490                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4491                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4492             IN_SET(c->std_error,
4493                    EXEC_OUTPUT_SYSLOG,
4494                    EXEC_OUTPUT_KMSG,
4495                    EXEC_OUTPUT_JOURNAL,
4496                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4497                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4498                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4499
4500                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4501
4502                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4503                 if (r >= 0)
4504                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4505
4506                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4507                 if (r >= 0)
4508                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4509         }
4510
4511         if (c->log_level_max >= 0) {
4512                 _cleanup_free_ char *t = NULL;
4513
4514                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4515
4516                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4517         }
4518
4519         if (c->log_ratelimit_interval_usec > 0) {
4520                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4521
4522                 fprintf(f,
4523                         "%sLogRateLimitIntervalSec: %s\n",
4524                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4525         }
4526
4527         if (c->log_ratelimit_burst > 0)
4528                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4529
4530         if (c->n_log_extra_fields > 0) {
4531                 size_t j;
4532
4533                 for (j = 0; j < c->n_log_extra_fields; j++) {
4534                         fprintf(f, "%sLogExtraFields: ", prefix);
4535                         fwrite(c->log_extra_fields[j].iov_base,
4536                                1, c->log_extra_fields[j].iov_len,
4537                                f);
4538                         fputc('\n', f);
4539                 }
4540         }
4541
4542         if (c->secure_bits) {
4543                 _cleanup_free_ char *str = NULL;
4544
4545                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4546                 if (r >= 0)
4547                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4548         }
4549
4550         if (c->capability_bounding_set != CAP_ALL) {
4551                 _cleanup_free_ char *str = NULL;
4552
4553                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4554                 if (r >= 0)
4555                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4556         }
4557
4558         if (c->capability_ambient_set != 0) {
4559                 _cleanup_free_ char *str = NULL;
4560
4561                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4562                 if (r >= 0)
4563                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4564         }
4565
4566         if (c->user)
4567                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4568         if (c->group)
4569                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4570
4571         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4572
4573         if (!strv_isempty(c->supplementary_groups)) {
4574                 fprintf(f, "%sSupplementaryGroups:", prefix);
4575                 strv_fprintf(f, c->supplementary_groups);
4576                 fputs("\n", f);
4577         }
4578
4579         if (c->pam_name)
4580                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4581
4582         if (!strv_isempty(c->read_write_paths)) {
4583                 fprintf(f, "%sReadWritePaths:", prefix);
4584                 strv_fprintf(f, c->read_write_paths);
4585                 fputs("\n", f);
4586         }
4587
4588         if (!strv_isempty(c->read_only_paths)) {
4589                 fprintf(f, "%sReadOnlyPaths:", prefix);
4590                 strv_fprintf(f, c->read_only_paths);
4591                 fputs("\n", f);
4592         }
4593
4594         if (!strv_isempty(c->inaccessible_paths)) {
4595                 fprintf(f, "%sInaccessiblePaths:", prefix);
4596                 strv_fprintf(f, c->inaccessible_paths);
4597                 fputs("\n", f);
4598         }
4599
4600         if (c->n_bind_mounts > 0)
4601                 for (i = 0; i < c->n_bind_mounts; i++)
4602                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4603                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4604                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4605                                 c->bind_mounts[i].source,
4606                                 c->bind_mounts[i].destination,
4607                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4608
4609         if (c->n_temporary_filesystems > 0)
4610                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4611                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4612
4613                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4614                                 t->path,
4615                                 isempty(t->options) ? "" : ":",
4616                                 strempty(t->options));
4617                 }
4618
4619         if (c->utmp_id)
4620                 fprintf(f,
4621                         "%sUtmpIdentifier: %s\n",
4622                         prefix, c->utmp_id);
4623
4624         if (c->selinux_context)
4625                 fprintf(f,
4626                         "%sSELinuxContext: %s%s\n",
4627                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4628
4629         if (c->apparmor_profile)
4630                 fprintf(f,
4631                         "%sAppArmorProfile: %s%s\n",
4632                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4633
4634         if (c->smack_process_label)
4635                 fprintf(f,
4636                         "%sSmackProcessLabel: %s%s\n",
4637                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4638
4639         if (c->personality != PERSONALITY_INVALID)
4640                 fprintf(f,
4641                         "%sPersonality: %s\n",
4642                         prefix, strna(personality_to_string(c->personality)));
4643
4644         fprintf(f,
4645                 "%sLockPersonality: %s\n",
4646                 prefix, yes_no(c->lock_personality));
4647
4648         if (c->syscall_filter) {
4649 #if HAVE_SECCOMP
4650                 Iterator j;
4651                 void *id, *val;
4652                 bool first = true;
4653 #endif
4654
4655                 fprintf(f,
4656                         "%sSystemCallFilter: ",
4657                         prefix);
4658
4659                 if (!c->syscall_whitelist)
4660                         fputc('~', f);
4661
4662 #if HAVE_SECCOMP
4663                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4664                         _cleanup_free_ char *name = NULL;
4665                         const char *errno_name = NULL;
4666                         int num = PTR_TO_INT(val);
4667
4668                         if (first)
4669                                 first = false;
4670                         else
4671                                 fputc(' ', f);
4672
4673                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4674                         fputs(strna(name), f);
4675
4676                         if (num >= 0) {
4677                                 errno_name = errno_to_name(num);
4678                                 if (errno_name)
4679                                         fprintf(f, ":%s", errno_name);
4680                                 else
4681                                         fprintf(f, ":%d", num);
4682                         }
4683                 }
4684 #endif
4685
4686                 fputc('\n', f);
4687         }
4688
4689         if (c->syscall_archs) {
4690 #if HAVE_SECCOMP
4691                 Iterator j;
4692                 void *id;
4693 #endif
4694
4695                 fprintf(f,
4696                         "%sSystemCallArchitectures:",
4697                         prefix);
4698
4699 #if HAVE_SECCOMP
4700                 SET_FOREACH(id, c->syscall_archs, j)
4701                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4702 #endif
4703                 fputc('\n', f);
4704         }
4705
4706         if (exec_context_restrict_namespaces_set(c)) {
4707                 _cleanup_free_ char *s = NULL;
4708
4709                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4710                 if (r >= 0)
4711                         fprintf(f, "%sRestrictNamespaces: %s\n",
4712                                 prefix, s);
4713         }
4714
4715         if (c->network_namespace_path)
4716                 fprintf(f,
4717                         "%sNetworkNamespacePath: %s\n",
4718                         prefix, c->network_namespace_path);
4719
4720         if (c->syscall_errno > 0) {
4721                 const char *errno_name;
4722
4723                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4724
4725                 errno_name = errno_to_name(c->syscall_errno);
4726                 if (errno_name)
4727                         fprintf(f, "%s\n", errno_name);
4728                 else
4729                         fprintf(f, "%d\n", c->syscall_errno);
4730         }
4731 }
4732
4733 bool exec_context_maintains_privileges(const ExecContext *c) {
4734         assert(c);
4735
4736         /* Returns true if the process forked off would run under
4737          * an unchanged UID or as root. */
4738
4739         if (!c->user)
4740                 return true;
4741
4742         if (streq(c->user, "root") || streq(c->user, "0"))
4743                 return true;
4744
4745         return false;
4746 }
4747
4748 int exec_context_get_effective_ioprio(const ExecContext *c) {
4749         int p;
4750
4751         assert(c);
4752
4753         if (c->ioprio_set)
4754                 return c->ioprio;
4755
4756         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4757         if (p < 0)
4758                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4759
4760         return p;
4761 }
4762
4763 void exec_context_free_log_extra_fields(ExecContext *c) {
4764         size_t l;
4765
4766         assert(c);
4767
4768         for (l = 0; l < c->n_log_extra_fields; l++)
4769                 free(c->log_extra_fields[l].iov_base);
4770         c->log_extra_fields = mfree(c->log_extra_fields);
4771         c->n_log_extra_fields = 0;
4772 }
4773
4774 void exec_context_revert_tty(ExecContext *c) {
4775         int r;
4776
4777         assert(c);
4778
4779         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4780         exec_context_tty_reset(c, NULL);
4781
4782         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4783          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4784          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4785
4786         if (exec_context_may_touch_tty(c)) {
4787                 const char *path;
4788
4789                 path = exec_context_tty_path(c);
4790                 if (path) {
4791                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4792                         if (r < 0 && r != -ENOENT)
4793                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4794                 }
4795         }
4796 }
4797
4798 int exec_context_get_clean_directories(
4799                 ExecContext *c,
4800                 char **prefix,
4801                 ExecCleanMask mask,
4802                 char ***ret) {
4803
4804         _cleanup_strv_free_ char **l = NULL;
4805         ExecDirectoryType t;
4806         int r;
4807
4808         assert(c);
4809         assert(prefix);
4810         assert(ret);
4811
4812         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4813                 char **i;
4814
4815                 if (!FLAGS_SET(mask, 1U << t))
4816                         continue;
4817
4818                 if (!prefix[t])
4819                         continue;
4820
4821                 STRV_FOREACH(i, c->directories[t].paths) {
4822                         char *j;
4823
4824                         j = path_join(prefix[t], *i);
4825                         if (!j)
4826                                 return -ENOMEM;
4827
4828                         r = strv_consume(&l, j);
4829                         if (r < 0)
4830                                 return r;
4831
4832                         /* Also remove private directories unconditionally. */
4833                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
4834                                 j = path_join(prefix[t], "private", *i);
4835                                 if (!j)
4836                                         return -ENOMEM;
4837
4838                                 r = strv_consume(&l, j);
4839                                 if (r < 0)
4840                                         return r;
4841                         }
4842                 }
4843         }
4844
4845         *ret = TAKE_PTR(l);
4846         return 0;
4847 }
4848
4849 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4850         ExecCleanMask mask = 0;
4851
4852         assert(c);
4853         assert(ret);
4854
4855         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4856                 if (!strv_isempty(c->directories[t].paths))
4857                         mask |= 1U << t;
4858
4859         *ret = mask;
4860         return 0;
4861 }
4862
4863 void exec_status_start(ExecStatus *s, pid_t pid) {
4864         assert(s);
4865
4866         *s = (ExecStatus) {
4867                 .pid = pid,
4868         };
4869
4870         dual_timestamp_get(&s->start_timestamp);
4871 }
4872
4873 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4874         assert(s);
4875
4876         if (s->pid != pid) {
4877                 *s = (ExecStatus) {
4878                         .pid = pid,
4879                 };
4880         }
4881
4882         dual_timestamp_get(&s->exit_timestamp);
4883
4884         s->code = code;
4885         s->status = status;
4886
4887         if (context && context->utmp_id)
4888                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4889 }
4890
4891 void exec_status_reset(ExecStatus *s) {
4892         assert(s);
4893
4894         *s = (ExecStatus) {};
4895 }
4896
4897 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4898         char buf[FORMAT_TIMESTAMP_MAX];
4899
4900         assert(s);
4901         assert(f);
4902
4903         if (s->pid <= 0)
4904                 return;
4905
4906         prefix = strempty(prefix);
4907
4908         fprintf(f,
4909                 "%sPID: "PID_FMT"\n",
4910                 prefix, s->pid);
4911
4912         if (dual_timestamp_is_set(&s->start_timestamp))
4913                 fprintf(f,
4914                         "%sStart Timestamp: %s\n",
4915                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4916
4917         if (dual_timestamp_is_set(&s->exit_timestamp))
4918                 fprintf(f,
4919                         "%sExit Timestamp: %s\n"
4920                         "%sExit Code: %s\n"
4921                         "%sExit Status: %i\n",
4922                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4923                         prefix, sigchld_code_to_string(s->code),
4924                         prefix, s->status);
4925 }
4926
4927 static char *exec_command_line(char **argv) {
4928         size_t k;
4929         char *n, *p, **a;
4930         bool first = true;
4931
4932         assert(argv);
4933
4934         k = 1;
4935         STRV_FOREACH(a, argv)
4936                 k += strlen(*a)+3;
4937
4938         n = new(char, k);
4939         if (!n)
4940                 return NULL;
4941
4942         p = n;
4943         STRV_FOREACH(a, argv) {
4944
4945                 if (!first)
4946                         *(p++) = ' ';
4947                 else
4948                         first = false;
4949
4950                 if (strpbrk(*a, WHITESPACE)) {
4951                         *(p++) = '\'';
4952                         p = stpcpy(p, *a);
4953                         *(p++) = '\'';
4954                 } else
4955                         p = stpcpy(p, *a);
4956
4957         }
4958
4959         *p = 0;
4960
4961         /* FIXME: this doesn't really handle arguments that have
4962          * spaces and ticks in them */
4963
4964         return n;
4965 }
4966
4967 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4968         _cleanup_free_ char *cmd = NULL;
4969         const char *prefix2;
4970
4971         assert(c);
4972         assert(f);
4973
4974         prefix = strempty(prefix);
4975         prefix2 = strjoina(prefix, "\t");
4976
4977         cmd = exec_command_line(c->argv);
4978         fprintf(f,
4979                 "%sCommand Line: %s\n",
4980                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4981
4982         exec_status_dump(&c->exec_status, f, prefix2);
4983 }
4984
4985 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4986         assert(f);
4987
4988         prefix = strempty(prefix);
4989
4990         LIST_FOREACH(command, c, c)
4991                 exec_command_dump(c, f, prefix);
4992 }
4993
4994 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4995         ExecCommand *end;
4996
4997         assert(l);
4998         assert(e);
4999
5000         if (*l) {
5001                 /* It's kind of important, that we keep the order here */
5002                 LIST_FIND_TAIL(command, *l, end);
5003                 LIST_INSERT_AFTER(command, *l, end, e);
5004         } else
5005               *l = e;
5006 }
5007
5008 int exec_command_set(ExecCommand *c, const char *path, ...) {
5009         va_list ap;
5010         char **l, *p;
5011
5012         assert(c);
5013         assert(path);
5014
5015         va_start(ap, path);
5016         l = strv_new_ap(path, ap);
5017         va_end(ap);
5018
5019         if (!l)
5020                 return -ENOMEM;
5021
5022         p = strdup(path);
5023         if (!p) {
5024                 strv_free(l);
5025                 return -ENOMEM;
5026         }
5027
5028         free_and_replace(c->path, p);
5029
5030         return strv_free_and_replace(c->argv, l);
5031 }
5032
5033 int exec_command_append(ExecCommand *c, const char *path, ...) {
5034         _cleanup_strv_free_ char **l = NULL;
5035         va_list ap;
5036         int r;
5037
5038         assert(c);
5039         assert(path);
5040
5041         va_start(ap, path);
5042         l = strv_new_ap(path, ap);
5043         va_end(ap);
5044
5045         if (!l)
5046                 return -ENOMEM;
5047
5048         r = strv_extend_strv(&c->argv, l, false);
5049         if (r < 0)
5050                 return r;
5051
5052         return 0;
5053 }
5054
5055 static void *remove_tmpdir_thread(void *p) {
5056         _cleanup_free_ char *path = p;
5057
5058         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5059         return NULL;
5060 }
5061
5062 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5063         int r;
5064
5065         if (!rt)
5066                 return NULL;
5067
5068         if (rt->manager)
5069                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5070
5071         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5072         if (destroy && rt->tmp_dir) {
5073                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5074
5075                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5076                 if (r < 0) {
5077                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5078                         free(rt->tmp_dir);
5079                 }
5080
5081                 rt->tmp_dir = NULL;
5082         }
5083
5084         if (destroy && rt->var_tmp_dir) {
5085                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5086
5087                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5088                 if (r < 0) {
5089                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5090                         free(rt->var_tmp_dir);
5091                 }
5092
5093                 rt->var_tmp_dir = NULL;
5094         }
5095
5096         rt->id = mfree(rt->id);
5097         rt->tmp_dir = mfree(rt->tmp_dir);
5098         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5099         safe_close_pair(rt->netns_storage_socket);
5100         return mfree(rt);
5101 }
5102
5103 static void exec_runtime_freep(ExecRuntime **rt) {
5104         (void) exec_runtime_free(*rt, false);
5105 }
5106
5107 static int exec_runtime_allocate(ExecRuntime **ret) {
5108         ExecRuntime *n;
5109
5110         assert(ret);
5111
5112         n = new(ExecRuntime, 1);
5113         if (!n)
5114                 return -ENOMEM;
5115
5116         *n = (ExecRuntime) {
5117                 .netns_storage_socket = { -1, -1 },
5118         };
5119
5120         *ret = n;
5121         return 0;
5122 }
5123
5124 static int exec_runtime_add(
5125                 Manager *m,
5126                 const char *id,
5127                 const char *tmp_dir,
5128                 const char *var_tmp_dir,
5129                 const int netns_storage_socket[2],
5130                 ExecRuntime **ret) {
5131
5132         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5133         int r;
5134
5135         assert(m);
5136         assert(id);
5137
5138         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5139         if (r < 0)
5140                 return r;
5141
5142         r = exec_runtime_allocate(&rt);
5143         if (r < 0)
5144                 return r;
5145
5146         rt->id = strdup(id);
5147         if (!rt->id)
5148                 return -ENOMEM;
5149
5150         if (tmp_dir) {
5151                 rt->tmp_dir = strdup(tmp_dir);
5152                 if (!rt->tmp_dir)
5153                         return -ENOMEM;
5154
5155                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5156                 assert(var_tmp_dir);
5157                 rt->var_tmp_dir = strdup(var_tmp_dir);
5158                 if (!rt->var_tmp_dir)
5159                         return -ENOMEM;
5160         }
5161
5162         if (netns_storage_socket) {
5163                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5164                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5165         }
5166
5167         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5168         if (r < 0)
5169                 return r;
5170
5171         rt->manager = m;
5172
5173         if (ret)
5174                 *ret = rt;
5175
5176         /* do not remove created ExecRuntime object when the operation succeeds. */
5177         rt = NULL;
5178         return 0;
5179 }
5180
5181 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5182         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5183         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5184         int r;
5185
5186         assert(m);
5187         assert(c);
5188         assert(id);
5189
5190         /* It is not necessary to create ExecRuntime object. */
5191         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5192                 return 0;
5193
5194         if (c->private_tmp) {
5195                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5196                 if (r < 0)
5197                         return r;
5198         }
5199
5200         if (c->private_network || c->network_namespace_path) {
5201                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5202                         return -errno;
5203         }
5204
5205         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5206         if (r < 0)
5207                 return r;
5208
5209         /* Avoid cleanup */
5210         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5211         return 1;
5212 }
5213
5214 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5215         ExecRuntime *rt;
5216         int r;
5217
5218         assert(m);
5219         assert(id);
5220         assert(ret);
5221
5222         rt = hashmap_get(m->exec_runtime_by_id, id);
5223         if (rt)
5224                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5225                 goto ref;
5226
5227         if (!create)
5228                 return 0;
5229
5230         /* If not found, then create a new object. */
5231         r = exec_runtime_make(m, c, id, &rt);
5232         if (r <= 0)
5233                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5234                 return r;
5235
5236 ref:
5237         /* increment reference counter. */
5238         rt->n_ref++;
5239         *ret = rt;
5240         return 1;
5241 }
5242
5243 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5244         if (!rt)
5245                 return NULL;
5246
5247         assert(rt->n_ref > 0);
5248
5249         rt->n_ref--;
5250         if (rt->n_ref > 0)
5251                 return NULL;
5252
5253         return exec_runtime_free(rt, destroy);
5254 }
5255
5256 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5257         ExecRuntime *rt;
5258         Iterator i;
5259
5260         assert(m);
5261         assert(f);
5262         assert(fds);
5263
5264         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5265                 fprintf(f, "exec-runtime=%s", rt->id);
5266
5267                 if (rt->tmp_dir)
5268                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5269
5270                 if (rt->var_tmp_dir)
5271                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5272
5273                 if (rt->netns_storage_socket[0] >= 0) {
5274                         int copy;
5275
5276                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5277                         if (copy < 0)
5278                                 return copy;
5279
5280                         fprintf(f, " netns-socket-0=%i", copy);
5281                 }
5282
5283                 if (rt->netns_storage_socket[1] >= 0) {
5284                         int copy;
5285
5286                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5287                         if (copy < 0)
5288                                 return copy;
5289
5290                         fprintf(f, " netns-socket-1=%i", copy);
5291                 }
5292
5293                 fputc('\n', f);
5294         }
5295
5296         return 0;
5297 }
5298
5299 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5300         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5301         ExecRuntime *rt;
5302         int r;
5303
5304         /* This is for the migration from old (v237 or earlier) deserialization text.
5305          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5306          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5307          * so or not from the serialized text, then we always creates a new object owned by this. */
5308
5309         assert(u);
5310         assert(key);
5311         assert(value);
5312
5313         /* Manager manages ExecRuntime objects by the unit id.
5314          * So, we omit the serialized text when the unit does not have id (yet?)... */
5315         if (isempty(u->id)) {
5316                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5317                 return 0;
5318         }
5319
5320         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5321         if (r < 0) {
5322                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5323                 return 0;
5324         }
5325
5326         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5327         if (!rt) {
5328                 r = exec_runtime_allocate(&rt_create);
5329                 if (r < 0)
5330                         return log_oom();
5331
5332                 rt_create->id = strdup(u->id);
5333                 if (!rt_create->id)
5334                         return log_oom();
5335
5336                 rt = rt_create;
5337         }
5338
5339         if (streq(key, "tmp-dir")) {
5340                 char *copy;
5341
5342                 copy = strdup(value);
5343                 if (!copy)
5344                         return log_oom();
5345
5346                 free_and_replace(rt->tmp_dir, copy);
5347
5348         } else if (streq(key, "var-tmp-dir")) {
5349                 char *copy;
5350
5351                 copy = strdup(value);
5352                 if (!copy)
5353                         return log_oom();
5354
5355                 free_and_replace(rt->var_tmp_dir, copy);
5356
5357         } else if (streq(key, "netns-socket-0")) {
5358                 int fd;
5359
5360                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5361                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5362                         return 0;
5363                 }
5364
5365                 safe_close(rt->netns_storage_socket[0]);
5366                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5367
5368         } else if (streq(key, "netns-socket-1")) {
5369                 int fd;
5370
5371                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5372                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5373                         return 0;
5374                 }
5375
5376                 safe_close(rt->netns_storage_socket[1]);
5377                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5378         } else
5379                 return 0;
5380
5381         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5382         if (rt_create) {
5383                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5384                 if (r < 0) {
5385                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5386                         return 0;
5387                 }
5388
5389                 rt_create->manager = u->manager;
5390
5391                 /* Avoid cleanup */
5392                 rt_create = NULL;
5393         }
5394
5395         return 1;
5396 }
5397
5398 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5399         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5400         int r, fd0 = -1, fd1 = -1;
5401         const char *p, *v = value;
5402         size_t n;
5403
5404         assert(m);
5405         assert(value);
5406         assert(fds);
5407
5408         n = strcspn(v, " ");
5409         id = strndupa(v, n);
5410         if (v[n] != ' ')
5411                 goto finalize;
5412         p = v + n + 1;
5413
5414         v = startswith(p, "tmp-dir=");
5415         if (v) {
5416                 n = strcspn(v, " ");
5417                 tmp_dir = strndupa(v, n);
5418                 if (v[n] != ' ')
5419                         goto finalize;
5420                 p = v + n + 1;
5421         }
5422
5423         v = startswith(p, "var-tmp-dir=");
5424         if (v) {
5425                 n = strcspn(v, " ");
5426                 var_tmp_dir = strndupa(v, n);
5427                 if (v[n] != ' ')
5428                         goto finalize;
5429                 p = v + n + 1;
5430         }
5431
5432         v = startswith(p, "netns-socket-0=");
5433         if (v) {
5434                 char *buf;
5435
5436                 n = strcspn(v, " ");
5437                 buf = strndupa(v, n);
5438                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5439                         log_debug("Unable to process exec-runtime netns fd specification.");
5440                         return;
5441                 }
5442                 fd0 = fdset_remove(fds, fd0);
5443                 if (v[n] != ' ')
5444                         goto finalize;
5445                 p = v + n + 1;
5446         }
5447
5448         v = startswith(p, "netns-socket-1=");
5449         if (v) {
5450                 char *buf;
5451
5452                 n = strcspn(v, " ");
5453                 buf = strndupa(v, n);
5454                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5455                         log_debug("Unable to process exec-runtime netns fd specification.");
5456                         return;
5457                 }
5458                 fd1 = fdset_remove(fds, fd1);
5459         }
5460
5461 finalize:
5462
5463         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5464         if (r < 0)
5465                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5466 }
5467
5468 void exec_runtime_vacuum(Manager *m) {
5469         ExecRuntime *rt;
5470         Iterator i;
5471
5472         assert(m);
5473
5474         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5475
5476         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5477                 if (rt->n_ref > 0)
5478                         continue;
5479
5480                 (void) exec_runtime_free(rt, false);
5481         }
5482 }
5483
5484 void exec_params_clear(ExecParameters *p) {
5485         if (!p)
5486                 return;
5487
5488         strv_free(p->environment);
5489 }
5490
5491 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5492         [EXEC_INPUT_NULL] = "null",
5493         [EXEC_INPUT_TTY] = "tty",
5494         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5495         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5496         [EXEC_INPUT_SOCKET] = "socket",
5497         [EXEC_INPUT_NAMED_FD] = "fd",
5498         [EXEC_INPUT_DATA] = "data",
5499         [EXEC_INPUT_FILE] = "file",
5500 };
5501
5502 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5503
5504 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5505         [EXEC_OUTPUT_INHERIT] = "inherit",
5506         [EXEC_OUTPUT_NULL] = "null",
5507         [EXEC_OUTPUT_TTY] = "tty",
5508         [EXEC_OUTPUT_SYSLOG] = "syslog",
5509         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5510         [EXEC_OUTPUT_KMSG] = "kmsg",
5511         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5512         [EXEC_OUTPUT_JOURNAL] = "journal",
5513         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5514         [EXEC_OUTPUT_SOCKET] = "socket",
5515         [EXEC_OUTPUT_NAMED_FD] = "fd",
5516         [EXEC_OUTPUT_FILE] = "file",
5517         [EXEC_OUTPUT_FILE_APPEND] = "append",
5518 };
5519
5520 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5521
5522 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5523         [EXEC_UTMP_INIT] = "init",
5524         [EXEC_UTMP_LOGIN] = "login",
5525         [EXEC_UTMP_USER] = "user",
5526 };
5527
5528 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5529
5530 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5531         [EXEC_PRESERVE_NO] = "no",
5532         [EXEC_PRESERVE_YES] = "yes",
5533         [EXEC_PRESERVE_RESTART] = "restart",
5534 };
5535
5536 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5537
5538 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5539 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5540         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5541         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5542         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5543         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5544         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5545 };
5546
5547 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5548
5549 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5550  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5551  * directories, specifically .timer units with their timestamp touch file. */
5552 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5553         [EXEC_DIRECTORY_RUNTIME] = "runtime",
5554         [EXEC_DIRECTORY_STATE] = "state",
5555         [EXEC_DIRECTORY_CACHE] = "cache",
5556         [EXEC_DIRECTORY_LOGS] = "logs",
5557         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5558 };
5559
5560 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5561
5562 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5563  * the service payload in. */
5564 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5565         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5566         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5567         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5568         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5569         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5570 };
5571
5572 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5573
5574 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5575         [EXEC_KEYRING_INHERIT] = "inherit",
5576         [EXEC_KEYRING_PRIVATE] = "private",
5577         [EXEC_KEYRING_SHARED] = "shared",
5578 };
5579
5580 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);