src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "execute.h"
  57 #include "exit-status.h"
  58 #include "fd-util.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "memory-util.h"
  69 #include "missing.h"
  70 #include "mkdir.h"
  71 #include "namespace.h"
  72 #include "parse-util.h"
  73 #include "path-util.h"
  74 #include "process-util.h"
  75 #include "rlimit-util.h"
  76 #include "rm-rf.h"
  77 #if HAVE_SECCOMP
  78 #include "seccomp-util.h"
  79 #endif
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "utmp-wtmp.h"
  96
/* Idle-wait timeouts, expressed as multiples of USEC_PER_SEC (i.e. 5s and 1s). Their users are outside
 * of this part of the file. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket, see
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 101
/* Renumbers the n_fds file descriptors listed in fds[] so that entry i ends up as descriptor number i+3,
 * i.e. the descriptors form a gapless range starting right after stdin/stdout/stderr.
 *
 * For every entry that is not yet in place the descriptor is duplicated with F_DUPFD to the lowest free
 * number >= i+3, the old descriptor is closed and the array slot is updated. If the duplicate could not be
 * placed exactly at i+3 (because that number was still occupied) the scan is repeated, starting at the first
 * index where that happened.
 *
 * Returns 0 on success (also when n_fds is 0), or a negative errno-style error if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int start, restart_from;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        start = 0;
        for (;;) {
                int i;

                /* No slot needs another pass yet */
                restart_from = -1;

                for (i = start; i < (int) n_fds; i++) {
                        int nfd;

                        /* Already at right index? */
                        if (fds[i] == i+3)
                                continue;

                        /* Ask for the lowest free descriptor that is >= the target number */
                        nfd = fcntl(fds[i], F_DUPFD, i + 3);
                        if (nfd < 0)
                                return -errno;

                        safe_close(fds[i]);
                        fds[i] = nfd;

                        /* Hmm, the fd we wanted isn't free? Then
                         * let's remember that and try again from here */
                        if (nfd != i+3 && restart_from < 0)
                                restart_from = i;
                }

                /* Everything landed on its target number, we are done */
                if (restart_from < 0)
                        break;

                start = restart_from;
        }

        return 0;
}
 146
 147 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 148         size_t i, n_fds;
 149         int r;
 150
 151         n_fds = n_socket_fds + n_storage_fds;
 152         if (n_fds <= 0)
 153                 return 0;
 154
 155         assert(fds);
 156
 157         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 158          * O_NONBLOCK only applies to socket activation though. */
 159
 160         for (i = 0; i < n_fds; i++) {
 161
 162                 if (i < n_socket_fds) {
 163                         r = fd_nonblock(fds[i], nonblock);
 164                         if (r < 0)
 165                                 return r;
 166                 }
 167
 168                 /* We unconditionally drop FD_CLOEXEC from the fds,
 169                  * since after all we want to pass these fds to our
 170                  * children */
 171
 172                 r = fd_cloexec(fds[i], false);
 173                 if (r < 0)
 174                         return r;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static const char *exec_context_tty_path(const ExecContext *context) {
 181         assert(context);
 182
 183         if (context->stdio_as_fds)
 184                 return NULL;
 185
 186         if (context->tty_path)
 187                 return context->tty_path;
 188
 189         return "/dev/console";
 190 }
 191
 192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 193         const char *path;
 194
 195         assert(context);
 196
 197         path = exec_context_tty_path(context);
 198
 199         if (context->tty_vhangup) {
 200                 if (p && p->stdin_fd >= 0)
 201                         (void) terminal_vhangup_fd(p->stdin_fd);
 202                 else if (path)
 203                         (void) terminal_vhangup(path);
 204         }
 205
 206         if (context->tty_reset) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) reset_terminal_fd(p->stdin_fd, true);
 209                 else if (path)
 210                         (void) reset_terminal(path);
 211         }
 212
 213         if (context->tty_vt_disallocate && path)
 214                 (void) vt_disallocate(path);
 215 }
 216
 217 static bool is_terminal_input(ExecInput i) {
 218         return IN_SET(i,
 219                       EXEC_INPUT_TTY,
 220                       EXEC_INPUT_TTY_FORCE,
 221                       EXEC_INPUT_TTY_FAIL);
 222 }
 223
 224 static bool is_terminal_output(ExecOutput o) {
 225         return IN_SET(o,
 226                       EXEC_OUTPUT_TTY,
 227                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 228                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 229                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 230 }
 231
 232 static bool is_syslog_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_SYSLOG,
 235                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 236 }
 237
 238 static bool is_kmsg_output(ExecOutput o) {
 239         return IN_SET(o,
 240                       EXEC_OUTPUT_KMSG,
 241                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 242 }
 243
 244 static bool exec_context_needs_term(const ExecContext *c) {
 245         assert(c);
 246
 247         /* Return true if the execution context suggests we should set $TERM to something useful. */
 248
 249         if (is_terminal_input(c->std_input))
 250                 return true;
 251
 252         if (is_terminal_output(c->std_output))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_error))
 256                 return true;
 257
 258         return !!c->tty_path;
 259 }
 260
 261 static int open_null_as(int flags, int nfd) {
 262         int fd;
 263
 264         assert(nfd >= 0);
 265
 266         fd = open("/dev/null", flags|O_NOCTTY);
 267         if (fd < 0)
 268                 return -errno;
 269
 270         return move_fd(fd, nfd, false);
 271 }
 272
 273 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 274         static const union sockaddr_union sa = {
 275                 .un.sun_family = AF_UNIX,
 276                 .un.sun_path = "/run/systemd/journal/stdout",
 277         };
 278         uid_t olduid = UID_INVALID;
 279         gid_t oldgid = GID_INVALID;
 280         int r;
 281
 282         if (gid_is_valid(gid)) {
 283                 oldgid = getgid();
 284
 285                 if (setegid(gid) < 0)
 286                         return -errno;
 287         }
 288
 289         if (uid_is_valid(uid)) {
 290                 olduid = getuid();
 291
 292                 if (seteuid(uid) < 0) {
 293                         r = -errno;
 294                         goto restore_gid;
 295                 }
 296         }
 297
 298         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 299
 300         /* If we fail to restore the uid or gid, things will likely
 301            fail later on. This should only happen if an LSM interferes. */
 302
 303         if (uid_is_valid(uid))
 304                 (void) seteuid(olduid);
 305
 306  restore_gid:
 307         if (gid_is_valid(gid))
 308                 (void) setegid(oldgid);
 309
 310         return r;
 311 }
 312
 313 static int connect_logger_as(
 314                 const Unit *unit,
 315                 const ExecContext *context,
 316                 const ExecParameters *params,
 317                 ExecOutput output,
 318                 const char *ident,
 319                 int nfd,
 320                 uid_t uid,
 321                 gid_t gid) {
 322
 323         _cleanup_close_ int fd = -1;
 324         int r;
 325
 326         assert(context);
 327         assert(params);
 328         assert(output < _EXEC_OUTPUT_MAX);
 329         assert(ident);
 330         assert(nfd >= 0);
 331
 332         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 333         if (fd < 0)
 334                 return -errno;
 335
 336         r = connect_journal_socket(fd, uid, gid);
 337         if (r < 0)
 338                 return r;
 339
 340         if (shutdown(fd, SHUT_RD) < 0)
 341                 return -errno;
 342
 343         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 344
 345         if (dprintf(fd,
 346                 "%s\n"
 347                 "%s\n"
 348                 "%i\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n",
 353                 context->syslog_identifier ?: ident,
 354                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 355                 context->syslog_priority,
 356                 !!context->syslog_level_prefix,
 357                 is_syslog_output(output),
 358                 is_kmsg_output(output),
 359                 is_terminal_output(output)) < 0)
 360                 return -errno;
 361
 362         return move_fd(TAKE_FD(fd), nfd, false);
 363 }
 364
 365 static int open_terminal_as(const char *path, int flags, int nfd) {
 366         int fd;
 367
 368         assert(path);
 369         assert(nfd >= 0);
 370
 371         fd = open_terminal(path, flags | O_NOCTTY);
 372         if (fd < 0)
 373                 return fd;
 374
 375         return move_fd(fd, nfd, false);
 376 }
 377
 378 static int acquire_path(const char *path, int flags, mode_t mode) {
 379         union sockaddr_union sa = {};
 380         _cleanup_close_ int fd = -1;
 381         int r, salen;
 382
 383         assert(path);
 384
 385         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 386                 flags |= O_CREAT;
 387
 388         fd = open(path, flags|O_NOCTTY, mode);
 389         if (fd >= 0)
 390                 return TAKE_FD(fd);
 391
 392         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 393                 return -errno;
 394         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 395                 return -ENXIO;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 400         if (fd < 0)
 401                 return -errno;
 402
 403         salen = sockaddr_un_set_path(&sa.un, path);
 404         if (salen < 0)
 405                 return salen;
 406
 407         if (connect(fd, &sa.sa, salen) < 0)
 408                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 409                                                            * indication that his wasn't an AF_UNIX socket after all */
 410
 411         if ((flags & O_ACCMODE) == O_RDONLY)
 412                 r = shutdown(fd, SHUT_WR);
 413         else if ((flags & O_ACCMODE) == O_WRONLY)
 414                 r = shutdown(fd, SHUT_RD);
 415         else
 416                 return TAKE_FD(fd);
 417         if (r < 0)
 418                 return -errno;
 419
 420         return TAKE_FD(fd);
 421 }
 422
 423 static int fixup_input(
 424                 const ExecContext *context,
 425                 int socket_fd,
 426                 bool apply_tty_stdin) {
 427
 428         ExecInput std_input;
 429
 430         assert(context);
 431
 432         std_input = context->std_input;
 433
 434         if (is_terminal_input(std_input) && !apply_tty_stdin)
 435                 return EXEC_INPUT_NULL;
 436
 437         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         return std_input;
 444 }
 445
 446 static int fixup_output(ExecOutput std_output, int socket_fd) {
 447
 448         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 449                 return EXEC_OUTPUT_INHERIT;
 450
 451         return std_output;
 452 }
 453
/* Sets up STDIN_FILENO for the process according to the execution context and parameters.
 *
 * If the parameters carry an explicit stdin_fd it is duplicated onto stdin and takes precedence over the
 * configured input type. Otherwise the input type, after being passed through fixup_input(), selects
 * /dev/null, a terminal, the socket fd, a named fd, an in-memory data fd or a file/socket path.
 *
 * named_iofds[] is indexed by STDIN_FILENO/STDOUT_FILENO/STDERR_FILENO.
 *
 * Returns STDIN_FILENO or the result of move_fd()/open_null_as() on success, or a negative errno-style
 * error on failure. */
static int setup_input(
                const ExecContext *context,
                const ExecParameters *params,
                int socket_fd,
                const int named_iofds[static 3]) {

        ExecInput i;

        assert(context);
        assert(params);
        assert(named_iofds);

        /* An explicitly passed stdin fd overrides whatever is configured in the context */
        if (params->stdin_fd >= 0) {
                if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
                        return -errno;

                /* Try to make this the controlling tty, if it is a tty, and reset it */
                if (isatty(STDIN_FILENO)) {
                        (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
                        (void) reset_terminal_fd(STDIN_FILENO, true);
                }

                return STDIN_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);

        switch (i) {

        case EXEC_INPUT_NULL:
                return open_null_as(O_RDONLY, STDIN_FILENO);

        case EXEC_INPUT_TTY:
        case EXEC_INPUT_TTY_FORCE:
        case EXEC_INPUT_TTY_FAIL: {
                int fd;

                /* The input type selects the acquisition mode: try, force or wait (without timeout) */
                fd = acquire_terminal(exec_context_tty_path(context),
                                      i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
                                      i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
                                                                  ACQUIRE_TERMINAL_WAIT,
                                      USEC_INFINITY);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        case EXEC_INPUT_SOCKET:
                /* fixup_input() downgrades to EXEC_INPUT_NULL if there's no socket fd */
                assert(socket_fd >= 0);

                return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;

        case EXEC_INPUT_NAMED_FD:
                assert(named_iofds[STDIN_FILENO] >= 0);

                /* Turn off O_NONBLOCK on the named fd (best effort) before duplicating it onto stdin */
                (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
                return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;

        case EXEC_INPUT_DATA: {
                int fd;

                fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        case EXEC_INPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[STDIN_FILENO]);

                /* If stdout or stderr is configured to write to the very same file, open it read/write, so
                 * that the descriptor can be shared with the output side (see setup_output()) */
                rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
                        (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));

                fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, STDIN_FILENO, false);
        }

        default:
                assert_not_reached("Unknown input type");
        }
}
 543
 544 static bool can_inherit_stderr_from_stdout(
 545                 const ExecContext *context,
 546                 ExecOutput o,
 547                 ExecOutput e) {
 548
 549         assert(context);
 550
 551         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 552          * stderr fd */
 553
 554         if (e == EXEC_OUTPUT_INHERIT)
 555                 return true;
 556         if (e != o)
 557                 return false;
 558
 559         if (e == EXEC_OUTPUT_NAMED_FD)
 560                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 561
 562         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 563                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 564
 565         return true;
 566 }
 567
/* Sets up the stdout or stderr descriptor of the process, as selected by 'fileno' (STDOUT_FILENO or
 * STDERR_FILENO), according to the execution context and parameters.
 *
 * An explicit stdout_fd/stderr_fd in the parameters takes precedence over the configured output type.
 * Otherwise the output type (after fixup_output()) and the effective input type (after fixup_input())
 * decide whether the descriptor is left alone, duplicated from stdin or stdout, or newly connected to
 * /dev/null, a terminal, the journal stream, the socket fd, a named fd or a file.
 *
 * When the descriptor gets connected to the journal stream, its device and inode numbers are stored in
 * *journal_stream_dev and *journal_stream_ino (stderr's values win over stdout's).
 *
 * named_iofds[] is indexed by STDIN_FILENO/STDOUT_FILENO/STDERR_FILENO. 'ident', 'uid' and 'gid' are
 * passed on to connect_logger_as().
 *
 * Returns a non-negative value on success (usually 'fileno'), or a negative errno-style error. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                const int named_iofds[static 3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed stdout/stderr fds override whatever is configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on, stderr is set up on its own, according to its own output type */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share its descriptor instead of opening the terminal again */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to reach the journal is not fatal, we fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                /* fixup_output() downgrades to EXEC_OUTPUT_INHERIT if there's no socket fd */
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Turn off O_NONBLOCK on the named fd (best effort) before duplicating it */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to read from the very same file, reuse its descriptor rather than
                 * opening the file a second time */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 727
 728 static int chown_terminal(int fd, uid_t uid) {
 729         int r;
 730
 731         assert(fd >= 0);
 732
 733         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 734         if (isatty(fd) < 1) {
 735                 if (IN_SET(errno, EINVAL, ENOTTY))
 736                         return 0; /* not a tty */
 737
 738                 return -errno;
 739         }
 740
 741         /* This might fail. What matters are the results. */
 742         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 743         if (r < 0)
 744                 return r;
 745
 746         return 1;
 747 }
 748
 749 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 750         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 751         int r;
 752
 753         assert(_saved_stdin);
 754         assert(_saved_stdout);
 755
 756         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 757         if (saved_stdin < 0)
 758                 return -errno;
 759
 760         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 761         if (saved_stdout < 0)
 762                 return -errno;
 763
 764         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 765         if (fd < 0)
 766                 return fd;
 767
 768         r = chown_terminal(fd, getuid());
 769         if (r < 0)
 770                 return r;
 771
 772         r = reset_terminal_fd(fd, true);
 773         if (r < 0)
 774                 return r;
 775
 776         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 777         fd = -1;
 778         if (r < 0)
 779                 return r;
 780
 781         *_saved_stdin = saved_stdin;
 782         *_saved_stdout = saved_stdout;
 783
 784         saved_stdin = saved_stdout = -1;
 785
 786         return 0;
 787 }
 788
 789 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 790         assert(err < 0);
 791
 792         if (err == -ETIMEDOUT)
 793                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 794         else {
 795                 errno = -err;
 796                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 797         }
 798 }
 799
 800 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 801         _cleanup_close_ int fd = -1;
 802
 803         assert(vc);
 804
 805         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 806         if (fd < 0)
 807                 return;
 808
 809         write_confirm_error_fd(err, fd, u);
 810 }
 811
 812 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 813         int r = 0;
 814
 815         assert(saved_stdin);
 816         assert(saved_stdout);
 817
 818         release_terminal();
 819
 820         if (*saved_stdin >= 0)
 821                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 822                         r = -errno;
 823
 824         if (*saved_stdout >= 0)
 825                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 826                         r = -errno;
 827
 828         *saved_stdin = safe_close(*saved_stdin);
 829         *saved_stdout = safe_close(*saved_stdout);
 830
 831         return r;
 832 }
 833
 834 enum {
 835         CONFIRM_PRETEND_FAILURE = -1,
 836         CONFIRM_PRETEND_SUCCESS =  0,
 837         CONFIRM_EXECUTE = 1,
 838 };
 839
 840 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 841         int saved_stdout = -1, saved_stdin = -1, r;
 842         _cleanup_free_ char *e = NULL;
 843         char c;
 844
 845         /* For any internal errors, assume a positive response. */
 846         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 847         if (r < 0) {
 848                 write_confirm_error(r, vc, u);
 849                 return CONFIRM_EXECUTE;
 850         }
 851
 852         /* confirm_spawn might have been disabled while we were sleeping. */
 853         if (manager_is_confirm_spawn_disabled(u->manager)) {
 854                 r = 1;
 855                 goto restore_stdio;
 856         }
 857
 858         e = ellipsize(cmdline, 60, 100);
 859         if (!e) {
 860                 log_oom();
 861                 r = CONFIRM_EXECUTE;
 862                 goto restore_stdio;
 863         }
 864
 865         for (;;) {
 866                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 867                 if (r < 0) {
 868                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 869                         r = CONFIRM_EXECUTE;
 870                         goto restore_stdio;
 871                 }
 872
 873                 switch (c) {
 874                 case 'c':
 875                         printf("Resuming normal execution.\n");
 876                         manager_disable_confirm_spawn();
 877                         r = 1;
 878                         break;
 879                 case 'D':
 880                         unit_dump(u, stdout, "  ");
 881                         continue; /* ask again */
 882                 case 'f':
 883                         printf("Failing execution.\n");
 884                         r = CONFIRM_PRETEND_FAILURE;
 885                         break;
 886                 case 'h':
 887                         printf("  c - continue, proceed without asking anymore\n"
 888                                "  D - dump, show the state of the unit\n"
 889                                "  f - fail, don't execute the command and pretend it failed\n"
 890                                "  h - help\n"
 891                                "  i - info, show a short summary of the unit\n"
 892                                "  j - jobs, show jobs that are in progress\n"
 893                                "  s - skip, don't execute the command and pretend it succeeded\n"
 894                                "  y - yes, execute the command\n");
 895                         continue; /* ask again */
 896                 case 'i':
 897                         printf("  Description: %s\n"
 898                                "  Unit:        %s\n"
 899                                "  Command:     %s\n",
 900                                u->id, u->description, cmdline);
 901                         continue; /* ask again */
 902                 case 'j':
 903                         manager_dump_jobs(u->manager, stdout, "  ");
 904                         continue; /* ask again */
 905                 case 'n':
 906                         /* 'n' was removed in favor of 'f'. */
 907                         printf("Didn't understand 'n', did you mean 'f'?\n");
 908                         continue; /* ask again */
 909                 case 's':
 910                         printf("Skipping execution.\n");
 911                         r = CONFIRM_PRETEND_SUCCESS;
 912                         break;
 913                 case 'y':
 914                         r = CONFIRM_EXECUTE;
 915                         break;
 916                 default:
 917                         assert_not_reached("Unhandled choice");
 918                 }
 919                 break;
 920         }
 921
 922 restore_stdio:
 923         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 924         return r;
 925 }
 926
 927 static int get_fixed_user(const ExecContext *c, const char **user,
 928                           uid_t *uid, gid_t *gid,
 929                           const char **home, const char **shell) {
 930         int r;
 931         const char *name;
 932
 933         assert(c);
 934
 935         if (!c->user)
 936                 return 0;
 937
 938         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 939          * (i.e. are "/" or "/bin/nologin"). */
 940
 941         name = c->user;
 942         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 943         if (r < 0)
 944                 return r;
 945
 946         *user = name;
 947         return 0;
 948 }
 949
 950 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 951         int r;
 952         const char *name;
 953
 954         assert(c);
 955
 956         if (!c->group)
 957                 return 0;
 958
 959         name = c->group;
 960         r = get_group_creds(&name, gid, 0);
 961         if (r < 0)
 962                 return r;
 963
 964         *group = name;
 965         return 0;
 966 }
 967
 968 static int get_supplementary_groups(const ExecContext *c, const char *user,
 969                                     const char *group, gid_t gid,
 970                                     gid_t **supplementary_gids, int *ngids) {
 971         char **i;
 972         int r, k = 0;
 973         int ngroups_max;
 974         bool keep_groups = false;
 975         gid_t *groups = NULL;
 976         _cleanup_free_ gid_t *l_gids = NULL;
 977
 978         assert(c);
 979
 980         /*
 981          * If user is given, then lookup GID and supplementary groups list.
 982          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 983          * here and as early as possible so we keep the list of supplementary
 984          * groups of the caller.
 985          */
 986         if (user && gid_is_valid(gid) && gid != 0) {
 987                 /* First step, initialize groups from /etc/groups */
 988                 if (initgroups(user, gid) < 0)
 989                         return -errno;
 990
 991                 keep_groups = true;
 992         }
 993
 994         if (strv_isempty(c->supplementary_groups))
 995                 return 0;
 996
 997         /*
 998          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 999          * be positive, otherwise fail.
1000          */
1001         errno = 0;
1002         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1003         if (ngroups_max <= 0)
1004                 return errno_or_else(EOPNOTSUPP);
1005
1006         l_gids = new(gid_t, ngroups_max);
1007         if (!l_gids)
1008                 return -ENOMEM;
1009
1010         if (keep_groups) {
1011                 /*
1012                  * Lookup the list of groups that the user belongs to, we
1013                  * avoid NSS lookups here too for gid=0.
1014                  */
1015                 k = ngroups_max;
1016                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017                         return -EINVAL;
1018         } else
1019                 k = 0;
1020
1021         STRV_FOREACH(i, c->supplementary_groups) {
1022                 const char *g;
1023
1024                 if (k >= ngroups_max)
1025                         return -E2BIG;
1026
1027                 g = *i;
1028                 r = get_group_creds(&g, l_gids+k, 0);
1029                 if (r < 0)
1030                         return r;
1031
1032                 k++;
1033         }
1034
1035         /*
1036          * Sets ngids to zero to drop all supplementary groups, happens
1037          * when we are under root and SupplementaryGroups= is empty.
1038          */
1039         if (k == 0) {
1040                 *ngids = 0;
1041                 return 0;
1042         }
1043
1044         /* Otherwise get the final list of supplementary groups */
1045         groups = memdup(l_gids, sizeof(gid_t) * k);
1046         if (!groups)
1047                 return -ENOMEM;
1048
1049         *supplementary_gids = groups;
1050         *ngids = k;
1051
1052         groups = NULL;
1053
1054         return 0;
1055 }
1056
/* Applies the group credentials to the calling process: first the supplementary group list (only if
 * ngids is positive), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1075
1076 static int enforce_user(const ExecContext *context, uid_t uid) {
1077         assert(context);
1078
1079         if (!uid_is_valid(uid))
1080                 return 0;
1081
1082         /* Sets (but doesn't look up) the uid and make sure we keep the
1083          * capabilities while doing so. */
1084
1085         if (context->capability_ambient_set != 0) {
1086
1087                 /* First step: If we need to keep capabilities but
1088                  * drop privileges we need to make sure we keep our
1089                  * caps, while we drop privileges. */
1090                 if (uid != 0) {
1091                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1092
1093                         if (prctl(PR_GET_SECUREBITS) != sb)
1094                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095                                         return -errno;
1096                 }
1097         }
1098
1099         /* Second step: actually set the uids */
1100         if (setresuid(uid, uid, uid) < 0)
1101                 return -errno;
1102
1103         /* At this point we should have all necessary capabilities but
1104            are otherwise a normal user. However, the caps might got
1105            corrupted due to the setresuid() so we need clean them up
1106            later. This is done outside of this call. */
1107
1108         return 0;
1109 }
1110
1111 #if HAVE_PAM
1112
1113 static int null_conv(
1114                 int num_msg,
1115                 const struct pam_message **msg,
1116                 struct pam_response **resp,
1117                 void *appdata_ptr) {
1118
1119         /* We don't support conversations */
1120
1121         return PAM_CONV_ERR;
1122 }
1123
1124 #endif
1125
/* Opens a PAM session for the command about to be executed and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process is gone.
 *
 *   name       PAM service name, passed to pam_start()
 *   user       user name, passed to pam_start()
 *   uid, gid   credentials the helper process drops to
 *   tty        TTY path to set as PAM_TTY; if NULL, the TTY of stdin is used if there is one
 *   env        in: variables pushed into PAM via pam_putenv(); out: replaced by the list returned by
 *              pam_getenvlist()
 *   fds, n_fds file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success, -EPERM if a PAM call failed (PAM errors do
 * not map to errno), or another negative errno-style value if barrier_create() or safe_fork() failed.
 * When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence check the
                                 * return value itself. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment as seen by PAM back to the caller. */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1341
/* Renames the current process to "(<name>)", where <name> is the last (at most) 8 characters of the
 * file name component of path. If the file name component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps: 8 chars of the name plus the two parentheses. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* Keep the tail: the end of the process name is usually more interesting, since the
                 * first bit might just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1372
1373 static bool context_has_address_families(const ExecContext *c) {
1374         assert(c);
1375
1376         return c->address_families_whitelist ||
1377                 !set_isempty(c->address_families);
1378 }
1379
1380 static bool context_has_syscall_filters(const ExecContext *c) {
1381         assert(c);
1382
1383         return c->syscall_whitelist ||
1384                 !hashmap_isempty(c->syscall_filter);
1385 }
1386
/* Determines whether "no new privileges" (NNP) has to be enabled for the context: either because it
 * was requested explicitly, or because we lack CAP_SYS_ADMIN and one of the seccomp-based settings
 * checked below is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c))
                return true;
        if (c->memory_deny_write_execute)
                return true;
        if (c->restrict_realtime)
                return true;
        if (c->restrict_suid_sgid)
                return true;
        if (exec_context_restrict_namespaces_set(c))
                return true;
        if (c->protect_kernel_tunables)
                return true;
        if (c->protect_kernel_modules)
                return true;
        if (c->private_devices)
                return true;
        if (context_has_syscall_filters(c))
                return true;
        if (!set_isempty(c->syscall_archs))
                return true;
        if (c->lock_personality)
                return true;
        if (c->protect_hostname)
                return true;

        return false;
}
1410
1411 #if HAVE_SECCOMP
1412
1413 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1414
1415         if (is_seccomp_available())
1416                 return false;
1417
1418         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1419         return true;
1420 }
1421
1422 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1423         uint32_t negative_action, default_action, action;
1424         int r;
1425
1426         assert(u);
1427         assert(c);
1428
1429         if (!context_has_syscall_filters(c))
1430                 return 0;
1431
1432         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1433                 return 0;
1434
1435         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1436
1437         if (c->syscall_whitelist) {
1438                 default_action = negative_action;
1439                 action = SCMP_ACT_ALLOW;
1440         } else {
1441                 default_action = SCMP_ACT_ALLOW;
1442                 action = negative_action;
1443         }
1444
1445         if (needs_ambient_hack) {
1446                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1447                 if (r < 0)
1448                         return r;
1449         }
1450
1451         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1452 }
1453
1454 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1455         assert(u);
1456         assert(c);
1457
1458         if (set_isempty(c->syscall_archs))
1459                 return 0;
1460
1461         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1462                 return 0;
1463
1464         return seccomp_restrict_archs(c->syscall_archs);
1465 }
1466
1467 static int apply_address_families(const Unit* u, const ExecContext *c) {
1468         assert(u);
1469         assert(c);
1470
1471         if (!context_has_address_families(c))
1472                 return 0;
1473
1474         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1475                 return 0;
1476
1477         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1478 }
1479
1480 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1481         assert(u);
1482         assert(c);
1483
1484         if (!c->memory_deny_write_execute)
1485                 return 0;
1486
1487         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1488                 return 0;
1489
1490         return seccomp_memory_deny_write_execute();
1491 }
1492
1493 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1494         assert(u);
1495         assert(c);
1496
1497         if (!c->restrict_realtime)
1498                 return 0;
1499
1500         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1501                 return 0;
1502
1503         return seccomp_restrict_realtime();
1504 }
1505
1506 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1507         assert(u);
1508         assert(c);
1509
1510         if (!c->restrict_suid_sgid)
1511                 return 0;
1512
1513         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1514                 return 0;
1515
1516         return seccomp_restrict_suid_sgid();
1517 }
1518
1519 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1520         assert(u);
1521         assert(c);
1522
1523         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524          * let's protect even those systems where this is left on in the kernel. */
1525
1526         if (!c->protect_kernel_tunables)
1527                 return 0;
1528
1529         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1530                 return 0;
1531
1532         return seccomp_protect_sysctl();
1533 }
1534
1535 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1536         assert(u);
1537         assert(c);
1538
1539         /* Turn off module syscalls on ProtectKernelModules=yes */
1540
1541         if (!c->protect_kernel_modules)
1542                 return 0;
1543
1544         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1545                 return 0;
1546
1547         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1548 }
1549
1550 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1551         assert(u);
1552         assert(c);
1553
1554         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1555
1556         if (!c->private_devices)
1557                 return 0;
1558
1559         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1560                 return 0;
1561
1562         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1563 }
1564
1565 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1566         assert(u);
1567         assert(c);
1568
1569         if (!exec_context_restrict_namespaces_set(c))
1570                 return 0;
1571
1572         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1573                 return 0;
1574
1575         return seccomp_restrict_namespaces(c->restrict_namespaces);
1576 }
1577
1578 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1579         unsigned long personality;
1580         int r;
1581
1582         assert(u);
1583         assert(c);
1584
1585         if (!c->lock_personality)
1586                 return 0;
1587
1588         if (skip_seccomp_unavailable(u, "LockPersonality="))
1589                 return 0;
1590
1591         personality = c->personality;
1592
1593         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594         if (personality == PERSONALITY_INVALID) {
1595
1596                 r = opinionated_personality(&personality);
1597                 if (r < 0)
1598                         return r;
1599         }
1600
1601         return seccomp_lock_personality(personality);
1602 }
1603
1604 #endif
1605
1606 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1607         assert(idle_pipe);
1608
1609         idle_pipe[1] = safe_close(idle_pipe[1]);
1610         idle_pipe[2] = safe_close(idle_pipe[2]);
1611
1612         if (idle_pipe[0] >= 0) {
1613                 int r;
1614
1615                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1616
1617                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1618                         ssize_t n;
1619
1620                         /* Signal systemd that we are bored and want to continue. */
1621                         n = write(idle_pipe[3], "x", 1);
1622                         if (n > 0)
1623                                 /* Wait for systemd to react to the signal above. */
1624                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1625                 }
1626
1627                 idle_pipe[0] = safe_close(idle_pipe[0]);
1628
1629         }
1630
1631         idle_pipe[3] = safe_close(idle_pipe[3]);
1632 }
1633
1634 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1635
1636 static int build_environment(
1637                 const Unit *u,
1638                 const ExecContext *c,
1639                 const ExecParameters *p,
1640                 size_t n_fds,
1641                 const char *home,
1642                 const char *username,
1643                 const char *shell,
1644                 dev_t journal_stream_dev,
1645                 ino_t journal_stream_ino,
1646                 char ***ret) {
1647
1648         _cleanup_strv_free_ char **our_env = NULL;
1649         ExecDirectoryType t;
1650         size_t n_env = 0;
1651         char *x;
1652
1653         assert(u);
1654         assert(c);
1655         assert(p);
1656         assert(ret);
1657
1658         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1659         if (!our_env)
1660                 return -ENOMEM;
1661
1662         if (n_fds > 0) {
1663                 _cleanup_free_ char *joined = NULL;
1664
1665                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1666                         return -ENOMEM;
1667                 our_env[n_env++] = x;
1668
1669                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1670                         return -ENOMEM;
1671                 our_env[n_env++] = x;
1672
1673                 joined = strv_join(p->fd_names, ":");
1674                 if (!joined)
1675                         return -ENOMEM;
1676
1677                 x = strjoin("LISTEN_FDNAMES=", joined);
1678                 if (!x)
1679                         return -ENOMEM;
1680                 our_env[n_env++] = x;
1681         }
1682
1683         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1684                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1685                         return -ENOMEM;
1686                 our_env[n_env++] = x;
1687
1688                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1689                         return -ENOMEM;
1690                 our_env[n_env++] = x;
1691         }
1692
1693         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1694          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695          * check the database directly. */
1696         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1697                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1698                 if (!x)
1699                         return -ENOMEM;
1700                 our_env[n_env++] = x;
1701         }
1702
1703         if (home) {
1704                 x = strjoin("HOME=", home);
1705                 if (!x)
1706                         return -ENOMEM;
1707
1708                 path_simplify(x + 5, true);
1709                 our_env[n_env++] = x;
1710         }
1711
1712         if (username) {
1713                 x = strjoin("LOGNAME=", username);
1714                 if (!x)
1715                         return -ENOMEM;
1716                 our_env[n_env++] = x;
1717
1718                 x = strjoin("USER=", username);
1719                 if (!x)
1720                         return -ENOMEM;
1721                 our_env[n_env++] = x;
1722         }
1723
1724         if (shell) {
1725                 x = strjoin("SHELL=", shell);
1726                 if (!x)
1727                         return -ENOMEM;
1728
1729                 path_simplify(x + 6, true);
1730                 our_env[n_env++] = x;
1731         }
1732
1733         if (!sd_id128_is_null(u->invocation_id)) {
1734                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1735                         return -ENOMEM;
1736
1737                 our_env[n_env++] = x;
1738         }
1739
1740         if (exec_context_needs_term(c)) {
1741                 const char *tty_path, *term = NULL;
1742
1743                 tty_path = exec_context_tty_path(c);
1744
1745                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747                  * passes to PID 1 ends up all the way in the console login shown. */
1748
1749                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1750                         term = getenv("TERM");
1751                 if (!term)
1752                         term = default_term_for_tty(tty_path);
1753
1754                 x = strjoin("TERM=", term);
1755                 if (!x)
1756                         return -ENOMEM;
1757                 our_env[n_env++] = x;
1758         }
1759
1760         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1761                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1762                         return -ENOMEM;
1763
1764                 our_env[n_env++] = x;
1765         }
1766
1767         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1768                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1769                 const char *n;
1770
1771                 if (!p->prefix[t])
1772                         continue;
1773
1774                 if (strv_isempty(c->directories[t].paths))
1775                         continue;
1776
1777                 n = exec_directory_env_name_to_string(t);
1778                 if (!n)
1779                         continue;
1780
1781                 pre = strjoin(p->prefix[t], "/");
1782                 if (!pre)
1783                         return -ENOMEM;
1784
1785                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1786                 if (!joined)
1787                         return -ENOMEM;
1788
1789                 x = strjoin(n, "=", joined);
1790                 if (!x)
1791                         return -ENOMEM;
1792
1793                 our_env[n_env++] = x;
1794         }
1795
1796         our_env[n_env++] = NULL;
1797         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1798
1799         *ret = TAKE_PTR(our_env);
1800
1801         return 0;
1802 }
1803
1804 static int build_pass_environment(const ExecContext *c, char ***ret) {
1805         _cleanup_strv_free_ char **pass_env = NULL;
1806         size_t n_env = 0, n_bufsize = 0;
1807         char **i;
1808
1809         STRV_FOREACH(i, c->pass_environment) {
1810                 _cleanup_free_ char *x = NULL;
1811                 char *v;
1812
1813                 v = getenv(*i);
1814                 if (!v)
1815                         continue;
1816                 x = strjoin(*i, "=", v);
1817                 if (!x)
1818                         return -ENOMEM;
1819
1820                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1821                         return -ENOMEM;
1822
1823                 pass_env[n_env++] = TAKE_PTR(x);
1824                 pass_env[n_env] = NULL;
1825         }
1826
1827         *ret = TAKE_PTR(pass_env);
1828
1829         return 0;
1830 }
1831
1832 static bool exec_needs_mount_namespace(
1833                 const ExecContext *context,
1834                 const ExecParameters *params,
1835                 const ExecRuntime *runtime) {
1836
1837         assert(context);
1838         assert(params);
1839
1840         if (context->root_image)
1841                 return true;
1842
1843         if (!strv_isempty(context->read_write_paths) ||
1844             !strv_isempty(context->read_only_paths) ||
1845             !strv_isempty(context->inaccessible_paths))
1846                 return true;
1847
1848         if (context->n_bind_mounts > 0)
1849                 return true;
1850
1851         if (context->n_temporary_filesystems > 0)
1852                 return true;
1853
1854         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1855                 return true;
1856
1857         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1858                 return true;
1859
1860         if (context->private_devices ||
1861             context->private_mounts ||
1862             context->protect_system != PROTECT_SYSTEM_NO ||
1863             context->protect_home != PROTECT_HOME_NO ||
1864             context->protect_kernel_tunables ||
1865             context->protect_kernel_modules ||
1866             context->protect_control_groups)
1867                 return true;
1868
1869         if (context->root_directory) {
1870                 ExecDirectoryType t;
1871
1872                 if (context->mount_apivfs)
1873                         return true;
1874
1875                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1876                         if (!params->prefix[t])
1877                                 continue;
1878
1879                         if (!strv_isempty(context->directories[t].paths))
1880                                 return true;
1881                 }
1882         }
1883
1884         if (context->dynamic_user &&
1885             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1886              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1887              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1888                 return true;
1889
1890         return false;
1891 }
1892
1893 static int setup_private_users(uid_t uid, gid_t gid) {
1894         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1895         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1896         _cleanup_close_ int unshare_ready_fd = -1;
1897         _cleanup_(sigkill_waitp) pid_t pid = 0;
1898         uint64_t c = 1;
1899         ssize_t n;
1900         int r;
1901
1902         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1903          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1904          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1905          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1906          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1907          * continues execution normally. */
1908
1909         if (uid != 0 && uid_is_valid(uid)) {
1910                 r = asprintf(&uid_map,
1911                              "0 0 1\n"                      /* Map root → root */
1912                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1913                              uid, uid);
1914                 if (r < 0)
1915                         return -ENOMEM;
1916         } else {
1917                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1918                 if (!uid_map)
1919                         return -ENOMEM;
1920         }
1921
1922         if (gid != 0 && gid_is_valid(gid)) {
1923                 r = asprintf(&gid_map,
1924                              "0 0 1\n"                      /* Map root → root */
1925                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1926                              gid, gid);
1927                 if (r < 0)
1928                         return -ENOMEM;
1929         } else {
1930                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1931                 if (!gid_map)
1932                         return -ENOMEM;
1933         }
1934
1935         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1936          * namespace. */
1937         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1938         if (unshare_ready_fd < 0)
1939                 return -errno;
1940
1941         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1942          * failed. */
1943         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1944                 return -errno;
1945
1946         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1947         if (r < 0)
1948                 return r;
1949         if (r == 0) {
1950                 _cleanup_close_ int fd = -1;
1951                 const char *a;
1952                 pid_t ppid;
1953
1954                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1955                  * here, after the parent opened its own user namespace. */
1956
1957                 ppid = getppid();
1958                 errno_pipe[0] = safe_close(errno_pipe[0]);
1959
1960                 /* Wait until the parent unshared the user namespace */
1961                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1962                         r = -errno;
1963                         goto child_fail;
1964                 }
1965
1966                 /* Disable the setgroups() system call in the child user namespace, for good. */
1967                 a = procfs_file_alloca(ppid, "setgroups");
1968                 fd = open(a, O_WRONLY|O_CLOEXEC);
1969                 if (fd < 0) {
1970                         if (errno != ENOENT) {
1971                                 r = -errno;
1972                                 goto child_fail;
1973                         }
1974
1975                         /* If the file is missing the kernel is too old, let's continue anyway. */
1976                 } else {
1977                         if (write(fd, "deny\n", 5) < 0) {
1978                                 r = -errno;
1979                                 goto child_fail;
1980                         }
1981
1982                         fd = safe_close(fd);
1983                 }
1984
1985                 /* First write the GID map */
1986                 a = procfs_file_alloca(ppid, "gid_map");
1987                 fd = open(a, O_WRONLY|O_CLOEXEC);
1988                 if (fd < 0) {
1989                         r = -errno;
1990                         goto child_fail;
1991                 }
1992                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1993                         r = -errno;
1994                         goto child_fail;
1995                 }
1996                 fd = safe_close(fd);
1997
1998                 /* The write the UID map */
1999                 a = procfs_file_alloca(ppid, "uid_map");
2000                 fd = open(a, O_WRONLY|O_CLOEXEC);
2001                 if (fd < 0) {
2002                         r = -errno;
2003                         goto child_fail;
2004                 }
2005                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2006                         r = -errno;
2007                         goto child_fail;
2008                 }
2009
2010                 _exit(EXIT_SUCCESS);
2011
2012         child_fail:
2013                 (void) write(errno_pipe[1], &r, sizeof(r));
2014                 _exit(EXIT_FAILURE);
2015         }
2016
2017         errno_pipe[1] = safe_close(errno_pipe[1]);
2018
2019         if (unshare(CLONE_NEWUSER) < 0)
2020                 return -errno;
2021
2022         /* Let the child know that the namespace is ready now */
2023         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2024                 return -errno;
2025
2026         /* Try to read an error code from the child */
2027         n = read(errno_pipe[0], &r, sizeof(r));
2028         if (n < 0)
2029                 return -errno;
2030         if (n == sizeof(r)) { /* an error code was sent to us */
2031                 if (r < 0)
2032                         return r;
2033                 return -EIO;
2034         }
2035         if (n != 0) /* on success we should have read 0 bytes */
2036                 return -EIO;
2037
2038         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2039         pid = 0;
2040         if (r < 0)
2041                 return r;
2042         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2043                 return -EIO;
2044
2045         return 0;
2046 }
2047
2048 static int setup_exec_directory(
2049                 const ExecContext *context,
2050                 const ExecParameters *params,
2051                 uid_t uid,
2052                 gid_t gid,
2053                 ExecDirectoryType type,
2054                 int *exit_status) {
2055
2056         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2057                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2058                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2059                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2060                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2061                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2062         };
2063         char **rt;
2064         int r;
2065
2066         assert(context);
2067         assert(params);
2068         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2069         assert(exit_status);
2070
2071         if (!params->prefix[type])
2072                 return 0;
2073
2074         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2075                 if (!uid_is_valid(uid))
2076                         uid = 0;
2077                 if (!gid_is_valid(gid))
2078                         gid = 0;
2079         }
2080
2081         STRV_FOREACH(rt, context->directories[type].paths) {
2082                 _cleanup_free_ char *p = NULL, *pp = NULL;
2083
2084                 p = path_join(params->prefix[type], *rt);
2085                 if (!p) {
2086                         r = -ENOMEM;
2087                         goto fail;
2088                 }
2089
2090                 r = mkdir_parents_label(p, 0755);
2091                 if (r < 0)
2092                         goto fail;
2093
2094                 if (context->dynamic_user &&
2095                     (!IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) ||
2096                      (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode != EXEC_PRESERVE_NO))) {
2097                         _cleanup_free_ char *private_root = NULL;
2098
2099                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2100                          * case we want to avoid leaving a directory around fully accessible that is owned by
2101                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2102                          * trick used by container managers to prohibit host users to get access to files of
2103                          * the same UID in containers: we place everything inside a directory that has an
2104                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2105                          * for unprivileged host code. We then use fs namespacing to make this directory
2106                          * permeable for the service itself.
2107                          *
2108                          * Specifically: for a service which wants a special directory "foo/" we first create
2109                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2110                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2111                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2112                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2113                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2114                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2115                          * for the service and making sure it only gets access to the dirs it needs but no
2116                          * others. Tricky? Yes, absolutely, but it works!
2117                          *
2118                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2119                          * to be owned by the service itself.
2120                          *
2121                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2122                          * for sharing files or sockets with other services. */
2123
2124                         private_root = path_join(params->prefix[type], "private");
2125                         if (!private_root) {
2126                                 r = -ENOMEM;
2127                                 goto fail;
2128                         }
2129
2130                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2131                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2132                         if (r < 0)
2133                                 goto fail;
2134
2135                         pp = path_join(private_root, *rt);
2136                         if (!pp) {
2137                                 r = -ENOMEM;
2138                                 goto fail;
2139                         }
2140
2141                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2142                         r = mkdir_parents_label(pp, 0755);
2143                         if (r < 0)
2144                                 goto fail;
2145
2146                         if (is_dir(p, false) > 0 &&
2147                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2148
2149                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2150                                  * it over. Most likely the service has been upgraded from one that didn't use
2151                                  * DynamicUser=1, to one that does. */
2152
2153                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2154                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2155                                          exec_directory_type_to_string(type), p, pp);
2156
2157                                 if (rename(p, pp) < 0) {
2158                                         r = -errno;
2159                                         goto fail;
2160                                 }
2161                         } else {
2162                                 /* Otherwise, create the actual directory for the service */
2163
2164                                 r = mkdir_label(pp, context->directories[type].mode);
2165                                 if (r < 0 && r != -EEXIST)
2166                                         goto fail;
2167                         }
2168
2169                         /* And link it up from the original place */
2170                         r = symlink_idempotent(pp, p, true);
2171                         if (r < 0)
2172                                 goto fail;
2173
2174                 } else {
2175                         _cleanup_free_ char *target = NULL;
2176
2177                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2178                             readlink_and_make_absolute(p, &target) >= 0) {
2179                                 _cleanup_free_ char *q = NULL;
2180
2181                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2182                                  * by DynamicUser=1 (see above)?
2183                                  *
2184                                  * We do this for all directory types except for ConfigurationDirectory=,
2185                                  * since they all support the private/ symlink logic at least in some
2186                                  * configurations, see above. */
2187
2188                                 q = path_join(params->prefix[type], "private", *rt);
2189                                 if (!q) {
2190                                         r = -ENOMEM;
2191                                         goto fail;
2192                                 }
2193
2194                                 if (path_equal(q, target)) {
2195
2196                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2197                                          * but is no longer. Let's move the directory back up. */
2198
2199                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2200                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2201                                                  exec_directory_type_to_string(type), q, p);
2202
2203                                         if (unlink(p) < 0) {
2204                                                 r = -errno;
2205                                                 goto fail;
2206                                         }
2207
2208                                         if (rename(q, p) < 0) {
2209                                                 r = -errno;
2210                                                 goto fail;
2211                                         }
2212                                 }
2213                         }
2214
2215                         r = mkdir_label(p, context->directories[type].mode);
2216                         if (r < 0) {
2217                                 if (r != -EEXIST)
2218                                         goto fail;
2219
2220                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2221                                         struct stat st;
2222
2223                                         /* Don't change the owner/access mode of the configuration directory,
2224                                          * as in the common case it is not written to by a service, and shall
2225                                          * not be writable. */
2226
2227                                         if (stat(p, &st) < 0) {
2228                                                 r = -errno;
2229                                                 goto fail;
2230                                         }
2231
2232                                         /* Still complain if the access mode doesn't match */
2233                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2234                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2235                                                             "(File system: %o %sMode: %o)",
2236                                                             exec_directory_type_to_string(type), *rt,
2237                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2238
2239                                         continue;
2240                                 }
2241                         }
2242                 }
2243
2244                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2245                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2246                  * current UID/GID ownership.) */
2247                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2248                 if (r < 0)
2249                         goto fail;
2250
2251                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2252                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2253                  * assignments to exist.*/
2254                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2255                 if (r < 0)
2256                         goto fail;
2257         }
2258
2259         return 0;
2260
2261 fail:
2262         *exit_status = exit_status_table[type];
2263         return r;
2264 }
2265
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, and only if the build
 * defines SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label read from the command's binary is
 * applied, with SMACK_DEFAULT_PROCESS_LABEL as fallback if none was read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or a file system without support for it is not an error, we simply
                 * fall back to the default label then. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2298
2299 static int compile_bind_mounts(
2300                 const ExecContext *context,
2301                 const ExecParameters *params,
2302                 BindMount **ret_bind_mounts,
2303                 size_t *ret_n_bind_mounts,
2304                 char ***ret_empty_directories) {
2305
2306         _cleanup_strv_free_ char **empty_directories = NULL;
2307         BindMount *bind_mounts;
2308         size_t n, h = 0, i;
2309         ExecDirectoryType t;
2310         int r;
2311
2312         assert(context);
2313         assert(params);
2314         assert(ret_bind_mounts);
2315         assert(ret_n_bind_mounts);
2316         assert(ret_empty_directories);
2317
2318         n = context->n_bind_mounts;
2319         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2320                 if (!params->prefix[t])
2321                         continue;
2322
2323                 n += strv_length(context->directories[t].paths);
2324         }
2325
2326         if (n <= 0) {
2327                 *ret_bind_mounts = NULL;
2328                 *ret_n_bind_mounts = 0;
2329                 *ret_empty_directories = NULL;
2330                 return 0;
2331         }
2332
2333         bind_mounts = new(BindMount, n);
2334         if (!bind_mounts)
2335                 return -ENOMEM;
2336
2337         for (i = 0; i < context->n_bind_mounts; i++) {
2338                 BindMount *item = context->bind_mounts + i;
2339                 char *s, *d;
2340
2341                 s = strdup(item->source);
2342                 if (!s) {
2343                         r = -ENOMEM;
2344                         goto finish;
2345                 }
2346
2347                 d = strdup(item->destination);
2348                 if (!d) {
2349                         free(s);
2350                         r = -ENOMEM;
2351                         goto finish;
2352                 }
2353
2354                 bind_mounts[h++] = (BindMount) {
2355                         .source = s,
2356                         .destination = d,
2357                         .read_only = item->read_only,
2358                         .recursive = item->recursive,
2359                         .ignore_enoent = item->ignore_enoent,
2360                 };
2361         }
2362
2363         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2364                 char **suffix;
2365
2366                 if (!params->prefix[t])
2367                         continue;
2368
2369                 if (strv_isempty(context->directories[t].paths))
2370                         continue;
2371
2372                 if (context->dynamic_user &&
2373                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2374                     !(context->root_directory || context->root_image)) {
2375                         char *private_root;
2376
2377                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2378                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2379                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2380
2381                         private_root = path_join(params->prefix[t], "private");
2382                         if (!private_root) {
2383                                 r = -ENOMEM;
2384                                 goto finish;
2385                         }
2386
2387                         r = strv_consume(&empty_directories, private_root);
2388                         if (r < 0)
2389                                 goto finish;
2390                 }
2391
2392                 STRV_FOREACH(suffix, context->directories[t].paths) {
2393                         char *s, *d;
2394
2395                         if (context->dynamic_user &&
2396                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2397                                 s = path_join(params->prefix[t], "private", *suffix);
2398                         else
2399                                 s = path_join(params->prefix[t], *suffix);
2400                         if (!s) {
2401                                 r = -ENOMEM;
2402                                 goto finish;
2403                         }
2404
2405                         if (context->dynamic_user &&
2406                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2407                             (context->root_directory || context->root_image))
2408                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2409                                  * directory is not created on the root directory. So, let's bind-mount the directory
2410                                  * on the 'non-private' place. */
2411                                 d = path_join(params->prefix[t], *suffix);
2412                         else
2413                                 d = strdup(s);
2414                         if (!d) {
2415                                 free(s);
2416                                 r = -ENOMEM;
2417                                 goto finish;
2418                         }
2419
2420                         bind_mounts[h++] = (BindMount) {
2421                                 .source = s,
2422                                 .destination = d,
2423                                 .read_only = false,
2424                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2425                                 .recursive = true,
2426                                 .ignore_enoent = false,
2427                         };
2428                 }
2429         }
2430
2431         assert(h == n);
2432
2433         *ret_bind_mounts = bind_mounts;
2434         *ret_n_bind_mounts = n;
2435         *ret_empty_directories = TAKE_PTR(empty_directories);
2436
2437         return (int) n;
2438
2439 finish:
2440         bind_mount_free_many(bind_mounts, h);
2441         return r;
2442 }
2443
2444 static int apply_mount_namespace(
2445                 const Unit *u,
2446                 const ExecCommand *command,
2447                 const ExecContext *context,
2448                 const ExecParameters *params,
2449                 const ExecRuntime *runtime,
2450                 char **error_path) {
2451
2452         _cleanup_strv_free_ char **empty_directories = NULL;
2453         char *tmp = NULL, *var = NULL;
2454         const char *root_dir = NULL, *root_image = NULL;
2455         NamespaceInfo ns_info;
2456         bool needs_sandboxing;
2457         BindMount *bind_mounts = NULL;
2458         size_t n_bind_mounts = 0;
2459         int r;
2460
2461         assert(context);
2462
2463         /* The runtime struct only contains the parent of the private /tmp,
2464          * which is non-accessible to world users. Inside of it there's a /tmp
2465          * that is sticky, and that's the one we want to use here. */
2466
2467         if (context->private_tmp && runtime) {
2468                 if (runtime->tmp_dir)
2469                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2470                 if (runtime->var_tmp_dir)
2471                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2472         }
2473
2474         if (params->flags & EXEC_APPLY_CHROOT) {
2475                 root_image = context->root_image;
2476
2477                 if (!root_image)
2478                         root_dir = context->root_directory;
2479         }
2480
2481         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2482         if (r < 0)
2483                 return r;
2484
2485         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2486         if (needs_sandboxing)
2487                 ns_info = (NamespaceInfo) {
2488                         .ignore_protect_paths = false,
2489                         .private_dev = context->private_devices,
2490                         .protect_control_groups = context->protect_control_groups,
2491                         .protect_kernel_tunables = context->protect_kernel_tunables,
2492                         .protect_kernel_modules = context->protect_kernel_modules,
2493                         .protect_hostname = context->protect_hostname,
2494                         .mount_apivfs = context->mount_apivfs,
2495                         .private_mounts = context->private_mounts,
2496                 };
2497         else if (!context->dynamic_user && root_dir)
                /*
                 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
                 * sandbox info, otherwise enforce it, don't ignore protected paths and
                 * fail if we are unable to apply the sandbox inside the mount namespace.
                 */
2503                 ns_info = (NamespaceInfo) {
2504                         .ignore_protect_paths = true,
2505                 };
2506         else
2507                 ns_info = (NamespaceInfo) {};
2508
2509         if (context->mount_flags == MS_SHARED)
2510                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2511
2512         r = setup_namespace(root_dir, root_image,
2513                             &ns_info, context->read_write_paths,
2514                             needs_sandboxing ? context->read_only_paths : NULL,
2515                             needs_sandboxing ? context->inaccessible_paths : NULL,
2516                             empty_directories,
2517                             bind_mounts,
2518                             n_bind_mounts,
2519                             context->temporary_filesystems,
2520                             context->n_temporary_filesystems,
2521                             tmp,
2522                             var,
2523                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2524                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2525                             context->mount_flags,
2526                             DISSECT_IMAGE_DISCARD_ON_LOOP,
2527                             error_path);
2528
2529         bind_mount_free_many(bind_mounts, n_bind_mounts);
2530
2531         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2532          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2533          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2534          * completely different execution environment. */
2535         if (r == -ENOANO) {
2536                 if (n_bind_mounts == 0 &&
2537                     context->n_temporary_filesystems == 0 &&
2538                     !root_dir && !root_image &&
2539                     !context->dynamic_user) {
2540                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2541                         return 0;
2542                 }
2543
2544                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2545                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2546                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2547
2548                 return -EOPNOTSUPP;
2549         }
2550
2551         return r;
2552 }
2553
2554 static int apply_working_directory(
2555                 const ExecContext *context,
2556                 const ExecParameters *params,
2557                 const char *home,
2558                 int *exit_status) {
2559
2560         const char *d, *wd;
2561
2562         assert(context);
2563         assert(exit_status);
2564
2565         if (context->working_directory_home) {
2566
2567                 if (!home) {
2568                         *exit_status = EXIT_CHDIR;
2569                         return -ENXIO;
2570                 }
2571
2572                 wd = home;
2573
2574         } else if (context->working_directory)
2575                 wd = context->working_directory;
2576         else
2577                 wd = "/";
2578
2579         if (params->flags & EXEC_APPLY_CHROOT)
2580                 d = wd;
2581         else
2582                 d = prefix_roota(context->root_directory, wd);
2583
2584         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2585                 *exit_status = EXIT_CHDIR;
2586                 return -errno;
2587         }
2588
2589         return 0;
2590 }
2591
2592 static int apply_root_directory(
2593                 const ExecContext *context,
2594                 const ExecParameters *params,
2595                 const bool needs_mount_ns,
2596                 int *exit_status) {
2597
2598         assert(context);
2599         assert(exit_status);
2600
2601         if (params->flags & EXEC_APPLY_CHROOT) {
2602                 if (!needs_mount_ns && context->root_directory)
2603                         if (chroot(context->root_directory) < 0) {
2604                                 *exit_status = EXIT_CHROOT;
2605                                 return -errno;
2606                         }
2607         }
2608
2609         return 0;
2610 }
2611
2612 static int setup_keyring(
2613                 const Unit *u,
2614                 const ExecContext *context,
2615                 const ExecParameters *p,
2616                 uid_t uid, gid_t gid) {
2617
2618         key_serial_t keyring;
2619         int r = 0;
2620         uid_t saved_uid;
2621         gid_t saved_gid;
2622
2623         assert(u);
2624         assert(context);
2625         assert(p);
2626
2627         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2628          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2629          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2630          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2631          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2632          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2633
2634         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2635                 return 0;
2636
2637         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2638          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2639          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2640          * & group is just as nasty as acquiring a reference to the user keyring. */
2641
2642         saved_uid = getuid();
2643         saved_gid = getgid();
2644
2645         if (gid_is_valid(gid) && gid != saved_gid) {
2646                 if (setregid(gid, -1) < 0)
2647                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2648         }
2649
2650         if (uid_is_valid(uid) && uid != saved_uid) {
2651                 if (setreuid(uid, -1) < 0) {
2652                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2653                         goto out;
2654                 }
2655         }
2656
2657         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2658         if (keyring == -1) {
2659                 if (errno == ENOSYS)
2660                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2661                 else if (IN_SET(errno, EACCES, EPERM))
2662                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2663                 else if (errno == EDQUOT)
2664                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2665                 else
2666                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2667
2668                 goto out;
2669         }
2670
2671         /* When requested link the user keyring into the session keyring. */
2672         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2673
2674                 if (keyctl(KEYCTL_LINK,
2675                            KEY_SPEC_USER_KEYRING,
2676                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2677                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2678                         goto out;
2679                 }
2680         }
2681
2682         /* Restore uid/gid back */
2683         if (uid_is_valid(uid) && uid != saved_uid) {
2684                 if (setreuid(saved_uid, -1) < 0) {
2685                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2686                         goto out;
2687                 }
2688         }
2689
2690         if (gid_is_valid(gid) && gid != saved_gid) {
2691                 if (setregid(saved_gid, -1) < 0)
2692                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2693         }
2694
2695         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2696         if (!sd_id128_is_null(u->invocation_id)) {
2697                 key_serial_t key;
2698
2699                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2700                 if (key == -1)
2701                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2702                 else {
2703                         if (keyctl(KEYCTL_SETPERM, key,
2704                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2705                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2706                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2707                 }
2708         }
2709
2710 out:
2711         /* Revert back uid & gid for the the last time, and exit */
2712         /* no extra logging, as only the first already reported error matters */
2713         if (getuid() != saved_uid)
2714                 (void) setreuid(saved_uid, -1);
2715
2716         if (getgid() != saved_gid)
2717                 (void) setregid(saved_gid, -1);
2718
2719         return r;
2720 }
2721
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        /* Appends each non-negative fd of the pair to array[], in order, advancing the counter *n for
         * every entry written. The caller must provide enough room in array[]. */

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2732
2733 static int close_remaining_fds(
2734                 const ExecParameters *params,
2735                 const ExecRuntime *runtime,
2736                 const DynamicCreds *dcreds,
2737                 int user_lookup_fd,
2738                 int socket_fd,
2739                 int exec_fd,
2740                 int *fds, size_t n_fds) {
2741
2742         size_t n_dont_close = 0;
2743         int dont_close[n_fds + 12];
2744
2745         assert(params);
2746
2747         if (params->stdin_fd >= 0)
2748                 dont_close[n_dont_close++] = params->stdin_fd;
2749         if (params->stdout_fd >= 0)
2750                 dont_close[n_dont_close++] = params->stdout_fd;
2751         if (params->stderr_fd >= 0)
2752                 dont_close[n_dont_close++] = params->stderr_fd;
2753
2754         if (socket_fd >= 0)
2755                 dont_close[n_dont_close++] = socket_fd;
2756         if (exec_fd >= 0)
2757                 dont_close[n_dont_close++] = exec_fd;
2758         if (n_fds > 0) {
2759                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2760                 n_dont_close += n_fds;
2761         }
2762
2763         if (runtime)
2764                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2765
2766         if (dcreds) {
2767                 if (dcreds->user)
2768                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2769                 if (dcreds->group)
2770                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2771         }
2772
2773         if (user_lookup_fd >= 0)
2774                 dont_close[n_dont_close++] = user_lookup_fd;
2775
2776         return close_all_fds(dont_close, n_dont_close);
2777 }
2778
2779 static int send_user_lookup(
2780                 Unit *unit,
2781                 int user_lookup_fd,
2782                 uid_t uid,
2783                 gid_t gid) {
2784
2785         assert(unit);
2786
2787         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2788          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2789          * specified. */
2790
2791         if (user_lookup_fd < 0)
2792                 return 0;
2793
2794         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2795                 return 0;
2796
2797         if (writev(user_lookup_fd,
2798                (struct iovec[]) {
2799                            IOVEC_INIT(&uid, sizeof(uid)),
2800                            IOVEC_INIT(&gid, sizeof(gid)),
2801                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2802                 return -errno;
2803
2804         return 0;
2805 }
2806
2807 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2808         int r;
2809
2810         assert(c);
2811         assert(home);
2812         assert(buf);
2813
2814         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2815
2816         if (*home)
2817                 return 0;
2818
2819         if (!c->working_directory_home)
2820                 return 0;
2821
2822         r = get_home_dir(buf);
2823         if (r < 0)
2824                 return r;
2825
2826         *home = *buf;
2827         return 1;
2828 }
2829
2830 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2831         _cleanup_strv_free_ char ** list = NULL;
2832         ExecDirectoryType t;
2833         int r;
2834
2835         assert(c);
2836         assert(p);
2837         assert(ret);
2838
2839         assert(c->dynamic_user);
2840
2841         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2842          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2843          * directories. */
2844
2845         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2846                 char **i;
2847
2848                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2849                         continue;
2850
2851                 if (!p->prefix[t])
2852                         continue;
2853
2854                 STRV_FOREACH(i, c->directories[t].paths) {
2855                         char *e;
2856
2857                         if (t == EXEC_DIRECTORY_RUNTIME)
2858                                 e = path_join(p->prefix[t], *i);
2859                         else
2860                                 e = path_join(p->prefix[t], "private", *i);
2861                         if (!e)
2862                                 return -ENOMEM;
2863
2864                         r = strv_consume(&list, e);
2865                         if (r < 0)
2866                                 return r;
2867                 }
2868         }
2869
2870         *ret = TAKE_PTR(list);
2871
2872         return 0;
2873 }
2874
2875 static char *exec_command_line(char **argv);
2876
2877 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2878         bool using_subcgroup;
2879         char *p;
2880
2881         assert(params);
2882         assert(ret);
2883
2884         if (!params->cgroup_path)
2885                 return -EINVAL;
2886
2887         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2888          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2889          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2890          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2891          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2892          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2893          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2894          * flag, which is only passed for the former statements, not for the latter. */
2895
2896         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2897         if (using_subcgroup)
2898                 p = path_join(params->cgroup_path, ".control");
2899         else
2900                 p = strdup(params->cgroup_path);
2901         if (!p)
2902                 return -ENOMEM;
2903
2904         *ret = p;
2905         return using_subcgroup;
2906 }
2907
2908 static int exec_child(
2909                 Unit *unit,
2910                 const ExecCommand *command,
2911                 const ExecContext *context,
2912                 const ExecParameters *params,
2913                 ExecRuntime *runtime,
2914                 DynamicCreds *dcreds,
2915                 int socket_fd,
2916                 const int named_iofds[static 3],
2917                 int *fds,
2918                 size_t n_socket_fds,
2919                 size_t n_storage_fds,
2920                 char **files_env,
2921                 int user_lookup_fd,
2922                 int *exit_status) {
2923
2924         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2925         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2926         _cleanup_free_ gid_t *supplementary_gids = NULL;
2927         const char *username = NULL, *groupname = NULL;
2928         _cleanup_free_ char *home_buffer = NULL;
2929         const char *home = NULL, *shell = NULL;
2930         char **final_argv = NULL;
2931         dev_t journal_stream_dev = 0;
2932         ino_t journal_stream_ino = 0;
2933         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2934                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2935                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2936                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2937 #if HAVE_SELINUX
2938         _cleanup_free_ char *mac_selinux_context_net = NULL;
2939         bool use_selinux = false;
2940 #endif
2941 #if ENABLE_SMACK
2942         bool use_smack = false;
2943 #endif
2944 #if HAVE_APPARMOR
2945         bool use_apparmor = false;
2946 #endif
2947         uid_t uid = UID_INVALID;
2948         gid_t gid = GID_INVALID;
2949         size_t n_fds;
2950         ExecDirectoryType dt;
2951         int secure_bits;
2952
2953         assert(unit);
2954         assert(command);
2955         assert(context);
2956         assert(params);
2957         assert(exit_status);
2958
2959         rename_process_from_path(command->path);
2960
2961         /* We reset exactly these signals, since they are the
2962          * only ones we set to SIG_IGN in the main daemon. All
2963          * others we leave untouched because we set them to
2964          * SIG_DFL or a valid handler initially, both of which
2965          * will be demoted to SIG_DFL. */
2966         (void) default_signals(SIGNALS_CRASH_HANDLER,
2967                                SIGNALS_IGNORE, -1);
2968
2969         if (context->ignore_sigpipe)
2970                 (void) ignore_signals(SIGPIPE, -1);
2971
2972         r = reset_signal_mask();
2973         if (r < 0) {
2974                 *exit_status = EXIT_SIGNAL_MASK;
2975                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2976         }
2977
2978         if (params->idle_pipe)
2979                 do_idle_pipe_dance(params->idle_pipe);
2980
2981         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2982          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2983          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2984          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2985
2986         log_forget_fds();
2987         log_set_open_when_needed(true);
2988
2989         /* In case anything used libc syslog(), close this here, too */
2990         closelog();
2991
2992         n_fds = n_socket_fds + n_storage_fds;
2993         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2994         if (r < 0) {
2995                 *exit_status = EXIT_FDS;
2996                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2997         }
2998
2999         if (!context->same_pgrp)
3000                 if (setsid() < 0) {
3001                         *exit_status = EXIT_SETSID;
3002                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3003                 }
3004
3005         exec_context_tty_reset(context, params);
3006
3007         if (unit_shall_confirm_spawn(unit)) {
3008                 const char *vc = params->confirm_spawn;
3009                 _cleanup_free_ char *cmdline = NULL;
3010
3011                 cmdline = exec_command_line(command->argv);
3012                 if (!cmdline) {
3013                         *exit_status = EXIT_MEMORY;
3014                         return log_oom();
3015                 }
3016
3017                 r = ask_for_confirmation(vc, unit, cmdline);
3018                 if (r != CONFIRM_EXECUTE) {
3019                         if (r == CONFIRM_PRETEND_SUCCESS) {
3020                                 *exit_status = EXIT_SUCCESS;
3021                                 return 0;
3022                         }
3023                         *exit_status = EXIT_CONFIRM;
3024                         log_unit_error(unit, "Execution cancelled by the user");
3025                         return -ECANCELED;
3026                 }
3027         }
3028
3029         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3030          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3031          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3032          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3033          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3034         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3035             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3036                 *exit_status = EXIT_MEMORY;
3037                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3038         }
3039
3040         if (context->dynamic_user && dcreds) {
3041                 _cleanup_strv_free_ char **suggested_paths = NULL;
3042
3043                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3044                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3045                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3046                         *exit_status = EXIT_USER;
3047                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3048                 }
3049
3050                 r = compile_suggested_paths(context, params, &suggested_paths);
3051                 if (r < 0) {
3052                         *exit_status = EXIT_MEMORY;
3053                         return log_oom();
3054                 }
3055
3056                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3057                 if (r < 0) {
3058                         *exit_status = EXIT_USER;
3059                         if (r == -EILSEQ) {
3060                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3061                                 return -EOPNOTSUPP;
3062                         }
3063                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3064                 }
3065
3066                 if (!uid_is_valid(uid)) {
3067                         *exit_status = EXIT_USER;
3068                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3069                         return -ESRCH;
3070                 }
3071
3072                 if (!gid_is_valid(gid)) {
3073                         *exit_status = EXIT_USER;
3074                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3075                         return -ESRCH;
3076                 }
3077
3078                 if (dcreds->user)
3079                         username = dcreds->user->name;
3080
3081         } else {
3082                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3083                 if (r < 0) {
3084                         *exit_status = EXIT_USER;
3085                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3086                 }
3087
3088                 r = get_fixed_group(context, &groupname, &gid);
3089                 if (r < 0) {
3090                         *exit_status = EXIT_GROUP;
3091                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3092                 }
3093         }
3094
3095         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3096         r = get_supplementary_groups(context, username, groupname, gid,
3097                                      &supplementary_gids, &ngids);
3098         if (r < 0) {
3099                 *exit_status = EXIT_GROUP;
3100                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3101         }
3102
3103         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3104         if (r < 0) {
3105                 *exit_status = EXIT_USER;
3106                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3107         }
3108
3109         user_lookup_fd = safe_close(user_lookup_fd);
3110
3111         r = acquire_home(context, uid, &home, &home_buffer);
3112         if (r < 0) {
3113                 *exit_status = EXIT_CHDIR;
3114                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3115         }
3116
        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must be sure to drop O_NONBLOCK */
3119         if (socket_fd >= 0)
3120                 (void) fd_nonblock(socket_fd, false);
3121
3122         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3123          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3124         if (params->cgroup_path) {
3125                 _cleanup_free_ char *p = NULL;
3126
3127                 r = exec_parameters_get_cgroup_path(params, &p);
3128                 if (r < 0) {
3129                         *exit_status = EXIT_CGROUP;
3130                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3131                 }
3132
3133                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3134                 if (r < 0) {
3135                         *exit_status = EXIT_CGROUP;
3136                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3137                 }
3138         }
3139
3140         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3141                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3142                 if (r < 0) {
3143                         *exit_status = EXIT_NETWORK;
3144                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3145                 }
3146         }
3147
3148         r = setup_input(context, params, socket_fd, named_iofds);
3149         if (r < 0) {
3150                 *exit_status = EXIT_STDIN;
3151                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3152         }
3153
3154         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3155         if (r < 0) {
3156                 *exit_status = EXIT_STDOUT;
3157                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3158         }
3159
3160         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3161         if (r < 0) {
3162                 *exit_status = EXIT_STDERR;
3163                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3164         }
3165
3166         if (context->oom_score_adjust_set) {
3167                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3168                  * prohibit write access to this file, and we shouldn't trip up over that. */
3169                 r = set_oom_score_adjust(context->oom_score_adjust);
3170                 if (IN_SET(r, -EPERM, -EACCES))
3171                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3172                 else if (r < 0) {
3173                         *exit_status = EXIT_OOM_ADJUST;
3174                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3175                 }
3176         }
3177
3178         if (context->nice_set)
3179                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3180                         *exit_status = EXIT_NICE;
3181                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3182                 }
3183
3184         if (context->cpu_sched_set) {
3185                 struct sched_param param = {
3186                         .sched_priority = context->cpu_sched_priority,
3187                 };
3188
3189                 r = sched_setscheduler(0,
3190                                        context->cpu_sched_policy |
3191                                        (context->cpu_sched_reset_on_fork ?
3192                                         SCHED_RESET_ON_FORK : 0),
3193                                        &param);
3194                 if (r < 0) {
3195                         *exit_status = EXIT_SETSCHEDULER;
3196                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3197                 }
3198         }
3199
3200         if (context->cpu_set.set)
3201                 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3202                         *exit_status = EXIT_CPUAFFINITY;
3203                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3204                 }
3205
3206         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3207                 r = apply_numa_policy(&context->numa_policy);
3208                 if (r == -EOPNOTSUPP)
3209                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3210                 else if (r < 0) {
3211                         *exit_status = EXIT_NUMA_POLICY;
3212                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3213                 }
3214         }
3215
3216         if (context->ioprio_set)
3217                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3218                         *exit_status = EXIT_IOPRIO;
3219                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3220                 }
3221
3222         if (context->timer_slack_nsec != NSEC_INFINITY)
3223                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3224                         *exit_status = EXIT_TIMERSLACK;
3225                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3226                 }
3227
3228         if (context->personality != PERSONALITY_INVALID) {
3229                 r = safe_personality(context->personality);
3230                 if (r < 0) {
3231                         *exit_status = EXIT_PERSONALITY;
3232                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3233                 }
3234         }
3235
3236         if (context->utmp_id)
3237                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3238                                       context->tty_path,
3239                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3240                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3241                                       USER_PROCESS,
3242                                       username);
3243
3244         if (uid_is_valid(uid)) {
3245                 r = chown_terminal(STDIN_FILENO, uid);
3246                 if (r < 0) {
3247                         *exit_status = EXIT_STDIN;
3248                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3249                 }
3250         }
3251
3252         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3253          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3254          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3255          * touch a single hierarchy too. */
3256         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3257                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3258                 if (r < 0) {
3259                         *exit_status = EXIT_CGROUP;
3260                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3261                 }
3262         }
3263
3264         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3265                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3266                 if (r < 0)
3267                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3268         }
3269
3270         r = build_environment(
3271                         unit,
3272                         context,
3273                         params,
3274                         n_fds,
3275                         home,
3276                         username,
3277                         shell,
3278                         journal_stream_dev,
3279                         journal_stream_ino,
3280                         &our_env);
3281         if (r < 0) {
3282                 *exit_status = EXIT_MEMORY;
3283                 return log_oom();
3284         }
3285
3286         r = build_pass_environment(context, &pass_env);
3287         if (r < 0) {
3288                 *exit_status = EXIT_MEMORY;
3289                 return log_oom();
3290         }
3291
3292         accum_env = strv_env_merge(5,
3293                                    params->environment,
3294                                    our_env,
3295                                    pass_env,
3296                                    context->environment,
3297                                    files_env,
3298                                    NULL);
3299         if (!accum_env) {
3300                 *exit_status = EXIT_MEMORY;
3301                 return log_oom();
3302         }
3303         accum_env = strv_env_clean(accum_env);
3304
3305         (void) umask(context->umask);
3306
3307         r = setup_keyring(unit, context, params, uid, gid);
3308         if (r < 0) {
3309                 *exit_status = EXIT_KEYRING;
3310                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3311         }
3312
3313         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3314         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3315
3316         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3317         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3318
3319         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3320         if (needs_ambient_hack)
3321                 needs_setuid = false;
3322         else
3323                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3324
3325         if (needs_sandboxing) {
3326                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3327                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3328                  * impacting our own code paths. */
3329
3330 #if HAVE_SELINUX
3331                 use_selinux = mac_selinux_use();
3332 #endif
3333 #if ENABLE_SMACK
3334                 use_smack = mac_smack_use();
3335 #endif
3336 #if HAVE_APPARMOR
3337                 use_apparmor = mac_apparmor_use();
3338 #endif
3339         }
3340
3341         if (needs_sandboxing) {
3342                 int which_failed;
3343
3344                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3345                  * is set here. (See below.) */
3346
3347                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3348                 if (r < 0) {
3349                         *exit_status = EXIT_LIMITS;
3350                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3351                 }
3352         }
3353
3354         if (needs_setuid) {
3355
3356                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3357                  * wins here. (See above.) */
3358
3359                 if (context->pam_name && username) {
3360                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3361                         if (r < 0) {
3362                                 *exit_status = EXIT_PAM;
3363                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3364                         }
3365                 }
3366         }
3367
3368         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3369
3370                 if (ns_type_supported(NAMESPACE_NET)) {
3371                         r = setup_netns(runtime->netns_storage_socket);
3372                         if (r < 0) {
3373                                 *exit_status = EXIT_NETWORK;
3374                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3375                         }
3376                 } else if (context->network_namespace_path) {
3377                         *exit_status = EXIT_NETWORK;
3378                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3379                 } else
3380                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3381         }
3382
3383         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3384         if (needs_mount_namespace) {
3385                 _cleanup_free_ char *error_path = NULL;
3386
3387                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3388                 if (r < 0) {
3389                         *exit_status = EXIT_NAMESPACE;
3390                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3391                                                     error_path ? ": " : "", strempty(error_path));
3392                 }
3393         }
3394
3395         if (context->protect_hostname) {
3396                 if (ns_type_supported(NAMESPACE_UTS)) {
3397                         if (unshare(CLONE_NEWUTS) < 0) {
3398                                 *exit_status = EXIT_NAMESPACE;
3399                                 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3400                         }
3401                 } else
3402                         log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3403 #if HAVE_SECCOMP
3404                 r = seccomp_protect_hostname();
3405                 if (r < 0) {
3406                         *exit_status = EXIT_SECCOMP;
3407                         return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3408                 }
3409 #endif
3410         }
3411
3412         /* Drop groups as early as possible */
3413         if (needs_setuid) {
3414                 r = enforce_groups(gid, supplementary_gids, ngids);
3415                 if (r < 0) {
3416                         *exit_status = EXIT_GROUP;
3417                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3418                 }
3419         }
3420
3421         if (needs_sandboxing) {
3422 #if HAVE_SELINUX
3423                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3424                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3425                         if (r < 0) {
3426                                 *exit_status = EXIT_SELINUX_CONTEXT;
3427                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3428                         }
3429                 }
3430 #endif
3431
3432                 if (context->private_users) {
3433                         r = setup_private_users(uid, gid);
3434                         if (r < 0) {
3435                                 *exit_status = EXIT_USER;
3436                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3437                         }
3438                 }
3439         }
3440
3441         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3442          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3443          * however if we have it as we want to keep it open until the final execve(). */
3444
3445         if (params->exec_fd >= 0) {
3446                 exec_fd = params->exec_fd;
3447
3448                 if (exec_fd < 3 + (int) n_fds) {
3449                         int moved_fd;
3450
3451                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3452                          * process we are about to execute. */
3453
3454                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3455                         if (moved_fd < 0) {
3456                                 *exit_status = EXIT_FDS;
3457                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3458                         }
3459
3460                         safe_close(exec_fd);
3461                         exec_fd = moved_fd;
3462                 } else {
3463                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3464                         r = fd_cloexec(exec_fd, true);
3465                         if (r < 0) {
3466                                 *exit_status = EXIT_FDS;
3467                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3468                         }
3469                 }
3470
3471                 fds_with_exec_fd = newa(int, n_fds + 1);
3472                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3473                 fds_with_exec_fd[n_fds] = exec_fd;
3474                 n_fds_with_exec_fd = n_fds + 1;
3475         } else {
3476                 fds_with_exec_fd = fds;
3477                 n_fds_with_exec_fd = n_fds;
3478         }
3479
3480         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3481         if (r >= 0)
3482                 r = shift_fds(fds, n_fds);
3483         if (r >= 0)
3484                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3485         if (r < 0) {
3486                 *exit_status = EXIT_FDS;
3487                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3488         }
3489
3490         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3491          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3492          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3493          * came this far. */
3494
3495         secure_bits = context->secure_bits;
3496
3497         if (needs_sandboxing) {
3498                 uint64_t bset;
3499
3500                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3501                  * requested. (Note this is placed after the general resource limit initialization, see
3502                  * above, in order to take precedence.) */
3503                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3504                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3505                                 *exit_status = EXIT_LIMITS;
3506                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3507                         }
3508                 }
3509
3510 #if ENABLE_SMACK
3511                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3512                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3513                 if (use_smack) {
3514                         r = setup_smack(context, command);
3515                         if (r < 0) {
3516                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3517                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3518                         }
3519                 }
3520 #endif
3521
3522                 bset = context->capability_bounding_set;
3523                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3524                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3525                  * instead of us doing that */
3526                 if (needs_ambient_hack)
3527                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3528                                 (UINT64_C(1) << CAP_SETUID) |
3529                                 (UINT64_C(1) << CAP_SETGID);
3530
3531                 if (!cap_test_all(bset)) {
3532                         r = capability_bounding_set_drop(bset, false);
3533                         if (r < 0) {
3534                                 *exit_status = EXIT_CAPABILITIES;
3535                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3536                         }
3537                 }
3538
3539                 /* This is done before enforce_user, but ambient set
3540                  * does not survive over setresuid() if keep_caps is not set. */
3541                 if (!needs_ambient_hack &&
3542                     context->capability_ambient_set != 0) {
3543                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3544                         if (r < 0) {
3545                                 *exit_status = EXIT_CAPABILITIES;
3546                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3547                         }
3548                 }
3549         }
3550
3551         /* chroot to root directory first, before we lose the ability to chroot */
3552         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3553         if (r < 0)
3554                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3555
3556         if (needs_setuid) {
3557                 if (uid_is_valid(uid)) {
3558                         r = enforce_user(context, uid);
3559                         if (r < 0) {
3560                                 *exit_status = EXIT_USER;
3561                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3562                         }
3563
3564                         if (!needs_ambient_hack &&
3565                             context->capability_ambient_set != 0) {
3566
3567                                 /* Fix the ambient capabilities after user change. */
3568                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3569                                 if (r < 0) {
3570                                         *exit_status = EXIT_CAPABILITIES;
3571                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3572                                 }
3573
3574                                 /* If we were asked to change user and ambient capabilities
3575                                  * were requested, we had to add keep-caps to the securebits
3576                                  * so that we would maintain the inherited capability set
3577                                  * through the setresuid(). Make sure that the bit is added
3578                                  * also to the context secure_bits so that we don't try to
3579                                  * drop the bit away next. */
3580
3581                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3582                         }
3583                 }
3584         }
3585
3586         /* Apply working directory here, because the working directory might be on NFS and only the user running
3587          * this service might have the correct privilege to change to the working directory */
3588         r = apply_working_directory(context, params, home, exit_status);
3589         if (r < 0)
3590                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3591
3592         if (needs_sandboxing) {
3593                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3594                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3595                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3596                  * are restricted. */
3597
3598 #if HAVE_SELINUX
3599                 if (use_selinux) {
3600                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3601
3602                         if (exec_context) {
3603                                 r = setexeccon(exec_context);
3604                                 if (r < 0) {
3605                                         *exit_status = EXIT_SELINUX_CONTEXT;
3606                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3607                                 }
3608                         }
3609                 }
3610 #endif
3611
3612 #if HAVE_APPARMOR
3613                 if (use_apparmor && context->apparmor_profile) {
3614                         r = aa_change_onexec(context->apparmor_profile);
3615                         if (r < 0 && !context->apparmor_profile_ignore) {
3616                                 *exit_status = EXIT_APPARMOR_PROFILE;
3617                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3618                         }
3619                 }
3620 #endif
3621
3622                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3623                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3624                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3625                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3626                                 *exit_status = EXIT_SECUREBITS;
3627                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3628                         }
3629
3630                 if (context_has_no_new_privileges(context))
3631                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3632                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3633                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3634                         }
3635
3636 #if HAVE_SECCOMP
3637                 r = apply_address_families(unit, context);
3638                 if (r < 0) {
3639                         *exit_status = EXIT_ADDRESS_FAMILIES;
3640                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3641                 }
3642
3643                 r = apply_memory_deny_write_execute(unit, context);
3644                 if (r < 0) {
3645                         *exit_status = EXIT_SECCOMP;
3646                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3647                 }
3648
3649                 r = apply_restrict_realtime(unit, context);
3650                 if (r < 0) {
3651                         *exit_status = EXIT_SECCOMP;
3652                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3653                 }
3654
3655                 r = apply_restrict_suid_sgid(unit, context);
3656                 if (r < 0) {
3657                         *exit_status = EXIT_SECCOMP;
3658                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3659                 }
3660
3661                 r = apply_restrict_namespaces(unit, context);
3662                 if (r < 0) {
3663                         *exit_status = EXIT_SECCOMP;
3664                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3665                 }
3666
3667                 r = apply_protect_sysctl(unit, context);
3668                 if (r < 0) {
3669                         *exit_status = EXIT_SECCOMP;
3670                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3671                 }
3672
3673                 r = apply_protect_kernel_modules(unit, context);
3674                 if (r < 0) {
3675                         *exit_status = EXIT_SECCOMP;
3676                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3677                 }
3678
3679                 r = apply_private_devices(unit, context);
3680                 if (r < 0) {
3681                         *exit_status = EXIT_SECCOMP;
3682                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3683                 }
3684
3685                 r = apply_syscall_archs(unit, context);
3686                 if (r < 0) {
3687                         *exit_status = EXIT_SECCOMP;
3688                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3689                 }
3690
3691                 r = apply_lock_personality(unit, context);
3692                 if (r < 0) {
3693                         *exit_status = EXIT_SECCOMP;
3694                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3695                 }
3696
3697                 /* This really should remain the last step before the execve(), to make sure our own code is affected
3698                  * by the filter as little as possible. */
3699                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3700                 if (r < 0) {
3701                         *exit_status = EXIT_SECCOMP;
3702                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3703                 }
3704 #endif
3705         }
3706
3707         if (!strv_isempty(context->unset_environment)) {
3708                 char **ee = NULL;
3709
3710                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3711                 if (!ee) {
3712                         *exit_status = EXIT_MEMORY;
3713                         return log_oom();
3714                 }
3715
3716                 strv_free_and_replace(accum_env, ee);
3717         }
3718
3719         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3720                 replaced_argv = replace_env_argv(command->argv, accum_env);
3721                 if (!replaced_argv) {
3722                         *exit_status = EXIT_MEMORY;
3723                         return log_oom();
3724                 }
3725                 final_argv = replaced_argv;
3726         } else
3727                 final_argv = command->argv;
3728
3729         if (DEBUG_LOGGING) {
3730                 _cleanup_free_ char *line;
3731
3732                 line = exec_command_line(final_argv);
3733                 if (line)
3734                         log_struct(LOG_DEBUG,
3735                                    "EXECUTABLE=%s", command->path,
3736                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3737                                    LOG_UNIT_ID(unit),
3738                                    LOG_UNIT_INVOCATION_ID(unit));
3739         }
3740
3741         if (exec_fd >= 0) {
3742                 uint8_t hot = 1;
3743
3744                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3745                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3746
3747                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3748                         *exit_status = EXIT_EXEC;
3749                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3750                 }
3751         }
3752
3753         execve(command->path, final_argv, accum_env);
3754         r = -errno;
3755
3756         if (exec_fd >= 0) {
3757                 uint8_t hot = 0;
3758
3759                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3760                  * that POLLHUP on it no longer means execve() succeeded. */
3761
3762                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3763                         *exit_status = EXIT_EXEC;
3764                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3765                 }
3766         }
3767
3768         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3769                 log_struct_errno(LOG_INFO, r,
3770                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3771                                  LOG_UNIT_ID(unit),
3772                                  LOG_UNIT_INVOCATION_ID(unit),
3773                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3774                                                   command->path),
3775                                  "EXECUTABLE=%s", command->path);
3776                 return 0;
3777         }
3778
3779         *exit_status = EXIT_EXEC;
3780         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3781 }
3782
3783 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); /* Forward declaration: exec_spawn() below calls this with &files_env as 'l'. */
3784 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); /* Forward declaration: exec_spawn() below passes its 3-element named_iofds array and treats a negative return as failure. */
3785
3786 int exec_spawn(Unit *unit,
3787                ExecCommand *command,
3788                const ExecContext *context,
3789                const ExecParameters *params,
3790                ExecRuntime *runtime,
3791                DynamicCreds *dcreds,
3792                pid_t *ret) {
3793
3794         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3795         _cleanup_free_ char *subcgroup_path = NULL;
3796         _cleanup_strv_free_ char **files_env = NULL;
3797         size_t n_storage_fds = 0, n_socket_fds = 0;
3798         _cleanup_free_ char *line = NULL;
3799         pid_t pid;
3800
3801         assert(unit);
3802         assert(command);
3803         assert(context);
3804         assert(ret);
3805         assert(params);
3806         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3807
3808         if (context->std_input == EXEC_INPUT_SOCKET ||
3809             context->std_output == EXEC_OUTPUT_SOCKET ||
3810             context->std_error == EXEC_OUTPUT_SOCKET) {
3811
3812                 if (params->n_socket_fds > 1) {
3813                         log_unit_error(unit, "Got more than one socket.");
3814                         return -EINVAL;
3815                 }
3816
3817                 if (params->n_socket_fds == 0) {
3818                         log_unit_error(unit, "Got no socket.");
3819                         return -EINVAL;
3820                 }
3821
3822                 socket_fd = params->fds[0];
3823         } else {
3824                 socket_fd = -1;
3825                 fds = params->fds;
3826                 n_socket_fds = params->n_socket_fds;
3827                 n_storage_fds = params->n_storage_fds;
3828         }
3829
3830         r = exec_context_named_iofds(context, params, named_iofds);
3831         if (r < 0)
3832                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3833
3834         r = exec_context_load_environment(unit, context, &files_env);
3835         if (r < 0)
3836                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3837
3838         line = exec_command_line(command->argv);
3839         if (!line)
3840                 return log_oom();
3841
3842         log_struct(LOG_DEBUG,
3843                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3844                    "EXECUTABLE=%s", command->path,
3845                    LOG_UNIT_ID(unit),
3846                    LOG_UNIT_INVOCATION_ID(unit));
3847
3848         if (params->cgroup_path) {
3849                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3850                 if (r < 0)
3851                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3852                 if (r > 0) { /* We are using a child cgroup */
3853                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3854                         if (r < 0)
3855                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3856                 }
3857         }
3858
3859         pid = fork();
3860         if (pid < 0)
3861                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3862
3863         if (pid == 0) {
3864                 int exit_status = EXIT_SUCCESS;
3865
3866                 r = exec_child(unit,
3867                                command,
3868                                context,
3869                                params,
3870                                runtime,
3871                                dcreds,
3872                                socket_fd,
3873                                named_iofds,
3874                                fds,
3875                                n_socket_fds,
3876                                n_storage_fds,
3877                                files_env,
3878                                unit->manager->user_lookup_fds[1],
3879                                &exit_status);
3880
3881                 if (r < 0)
3882                         log_struct_errno(LOG_ERR, r,
3883                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3884                                          LOG_UNIT_ID(unit),
3885                                          LOG_UNIT_INVOCATION_ID(unit),
3886                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3887                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3888                                                           command->path),
3889                                          "EXECUTABLE=%s", command->path);
3890
3891                 _exit(exit_status);
3892         }
3893
3894         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3895
3896         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3897          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3898          * process will be killed too). */
3899         if (subcgroup_path)
3900                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3901
3902         exec_status_start(&command->exec_status, pid);
3903
3904         *ret = pid;
3905         return 0;
3906 }
3907
3908 void exec_context_init(ExecContext *c) {
3909         ExecDirectoryType i;
3910
3911         assert(c);
3912
3913         c->umask = 0022;
3914         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3915         c->cpu_sched_policy = SCHED_OTHER;
3916         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3917         c->syslog_level_prefix = true;
3918         c->ignore_sigpipe = true;
3919         c->timer_slack_nsec = NSEC_INFINITY;
3920         c->personality = PERSONALITY_INVALID;
3921         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3922                 c->directories[i].mode = 0755;
3923         c->capability_bounding_set = CAP_ALL;
3924         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3925         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3926         c->log_level_max = -1;
3927         numa_policy_reset(&c->numa_policy);
3928 }
3929
3930 void exec_context_done(ExecContext *c) {
3931         ExecDirectoryType i;
3932         size_t l;
3933
3934         assert(c);
3935
3936         c->environment = strv_free(c->environment);
3937         c->environment_files = strv_free(c->environment_files);
3938         c->pass_environment = strv_free(c->pass_environment);
3939         c->unset_environment = strv_free(c->unset_environment);
3940
3941         rlimit_free_all(c->rlimit);
3942
3943         for (l = 0; l < 3; l++) {
3944                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3945                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3946         }
3947
3948         c->working_directory = mfree(c->working_directory);
3949         c->root_directory = mfree(c->root_directory);
3950         c->root_image = mfree(c->root_image);
3951         c->tty_path = mfree(c->tty_path);
3952         c->syslog_identifier = mfree(c->syslog_identifier);
3953         c->user = mfree(c->user);
3954         c->group = mfree(c->group);
3955
3956         c->supplementary_groups = strv_free(c->supplementary_groups);
3957
3958         c->pam_name = mfree(c->pam_name);
3959
3960         c->read_only_paths = strv_free(c->read_only_paths);
3961         c->read_write_paths = strv_free(c->read_write_paths);
3962         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3963
3964         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3965         c->bind_mounts = NULL;
3966         c->n_bind_mounts = 0;
3967         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3968         c->temporary_filesystems = NULL;
3969         c->n_temporary_filesystems = 0;
3970
3971         cpu_set_reset(&c->cpu_set);
3972         numa_policy_reset(&c->numa_policy);
3973
3974         c->utmp_id = mfree(c->utmp_id);
3975         c->selinux_context = mfree(c->selinux_context);
3976         c->apparmor_profile = mfree(c->apparmor_profile);
3977         c->smack_process_label = mfree(c->smack_process_label);
3978
3979         c->syscall_filter = hashmap_free(c->syscall_filter);
3980         c->syscall_archs = set_free(c->syscall_archs);
3981         c->address_families = set_free(c->address_families);
3982
3983         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3984                 c->directories[i].paths = strv_free(c->directories[i].paths);
3985
3986         c->log_level_max = -1;
3987
3988         exec_context_free_log_extra_fields(c);
3989
3990         c->log_rate_limit_interval_usec = 0;
3991         c->log_rate_limit_burst = 0;
3992
3993         c->stdin_data = mfree(c->stdin_data);
3994         c->stdin_data_size = 0;
3995
3996         c->network_namespace_path = mfree(c->network_namespace_path);
3997 }
3998
3999 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4000         char **i;
4001
4002         assert(c);
4003
4004         if (!runtime_prefix)
4005                 return 0;
4006
4007         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4008                 _cleanup_free_ char *p;
4009
4010                 p = path_join(runtime_prefix, *i);
4011                 if (!p)
4012                         return -ENOMEM;
4013
4014                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4015                  * service next. */
4016                 (void) rm_rf(p, REMOVE_ROOT);
4017         }
4018
4019         return 0;
4020 }
4021
4022 static void exec_command_done(ExecCommand *c) {
4023         assert(c);
4024
4025         c->path = mfree(c->path);
4026         c->argv = strv_free(c->argv);
4027 }
4028
4029 void exec_command_done_array(ExecCommand *c, size_t n) {
4030         size_t i;
4031
4032         for (i = 0; i < n; i++)
4033                 exec_command_done(c+i);
4034 }
4035
4036 ExecCommand* exec_command_free_list(ExecCommand *c) {
4037         ExecCommand *i;
4038
4039         while ((i = c)) {
4040                 LIST_REMOVE(command, c, i);
4041                 exec_command_done(i);
4042                 free(i);
4043         }
4044
4045         return NULL;
4046 }
4047
4048 void exec_command_free_array(ExecCommand **c, size_t n) {
4049         size_t i;
4050
4051         for (i = 0; i < n; i++)
4052                 c[i] = exec_command_free_list(c[i]);
4053 }
4054
4055 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4056         size_t i;
4057
4058         for (i = 0; i < n; i++)
4059                 exec_status_reset(&c[i].exec_status);
4060 }
4061
4062 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4063         size_t i;
4064
4065         for (i = 0; i < n; i++) {
4066                 ExecCommand *z;
4067
4068                 LIST_FOREACH(command, z, c[i])
4069                         exec_status_reset(&z->exec_status);
4070         }
4071 }
4072
/* Context passed as 'userdata' to invalid_env(), so that its log message can be attributed properly. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the message is logged for */
        const char *path; /* printed after the rejected assignment; the environment file loader stores the file's path here */
} InvalidEnvInfo;
4077
4078 static void invalid_env(const char *p, void *userdata) {
4079         InvalidEnvInfo *info = userdata;
4080
4081         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4082 }
4083
4084 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4085         assert(c);
4086
4087         switch (fd_index) {
4088
4089         case STDIN_FILENO:
4090                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4091                         return NULL;
4092
4093                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4094
4095         case STDOUT_FILENO:
4096                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4097                         return NULL;
4098
4099                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4100
4101         case STDERR_FILENO:
4102                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4103                         return NULL;
4104
4105                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4106
4107         default:
4108                 return NULL;
4109         }
4110 }
4111
4112 static int exec_context_named_iofds(
4113                 const ExecContext *c,
4114                 const ExecParameters *p,
4115                 int named_iofds[static 3]) {
4116
4117         size_t i, targets;
4118         const char* stdio_fdname[3];
4119         size_t n_fds;
4120
4121         assert(c);
4122         assert(p);
4123         assert(named_iofds);
4124
4125         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4126                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4127                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4128
4129         for (i = 0; i < 3; i++)
4130                 stdio_fdname[i] = exec_context_fdname(c, i);
4131
4132         n_fds = p->n_storage_fds + p->n_socket_fds;
4133
4134         for (i = 0; i < n_fds  && targets > 0; i++)
4135                 if (named_iofds[STDIN_FILENO] < 0 &&
4136                     c->std_input == EXEC_INPUT_NAMED_FD &&
4137                     stdio_fdname[STDIN_FILENO] &&
4138                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4139
4140                         named_iofds[STDIN_FILENO] = p->fds[i];
4141                         targets--;
4142
4143                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4144                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4145                            stdio_fdname[STDOUT_FILENO] &&
4146                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4147
4148                         named_iofds[STDOUT_FILENO] = p->fds[i];
4149                         targets--;
4150
4151                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4152                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4153                            stdio_fdname[STDERR_FILENO] &&
4154                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4155
4156                         named_iofds[STDERR_FILENO] = p->fds[i];
4157                         targets--;
4158                 }
4159
4160         return targets == 0 ? 0 : -ENOENT;
4161 }
4162
4163 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4164         char **i, **r = NULL;
4165
4166         assert(c);
4167         assert(l);
4168
4169         STRV_FOREACH(i, c->environment_files) {
4170                 char *fn;
4171                 int k;
4172                 unsigned n;
4173                 bool ignore = false;
4174                 char **p;
4175                 _cleanup_globfree_ glob_t pglob = {};
4176
4177                 fn = *i;
4178
4179                 if (fn[0] == '-') {
4180                         ignore = true;
4181                         fn++;
4182                 }
4183
4184                 if (!path_is_absolute(fn)) {
4185                         if (ignore)
4186                                 continue;
4187
4188                         strv_free(r);
4189                         return -EINVAL;
4190                 }
4191
4192                 /* Filename supports globbing, take all matching files */
4193                 k = safe_glob(fn, 0, &pglob);
4194                 if (k < 0) {
4195                         if (ignore)
4196                                 continue;
4197
4198                         strv_free(r);
4199                         return k;
4200                 }
4201
4202                 /* When we don't match anything, -ENOENT should be returned */
4203                 assert(pglob.gl_pathc > 0);
4204
4205                 for (n = 0; n < pglob.gl_pathc; n++) {
4206                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4207                         if (k < 0) {
4208                                 if (ignore)
4209                                         continue;
4210
4211                                 strv_free(r);
4212                                 return k;
4213                         }
4214                         /* Log invalid environment variables with filename */
4215                         if (p) {
4216                                 InvalidEnvInfo info = {
4217                                         .unit = unit,
4218                                         .path = pglob.gl_pathv[n]
4219                                 };
4220
4221                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4222                         }
4223
4224                         if (!r)
4225                                 r = p;
4226                         else {
4227                                 char **m;
4228
4229                                 m = strv_env_merge(2, r, p);
4230                                 strv_free(r);
4231                                 strv_free(p);
4232                                 if (!m)
4233                                         return -ENOMEM;
4234
4235                                 r = m;
4236                         }
4237                 }
4238         }
4239
4240         *l = r;
4241
4242         return 0;
4243 }
4244
4245 static bool tty_may_match_dev_console(const char *tty) {
4246         _cleanup_free_ char *resolved = NULL;
4247
4248         if (!tty)
4249                 return true;
4250
4251         tty = skip_dev_prefix(tty);
4252
4253         /* trivial identity? */
4254         if (streq(tty, "console"))
4255                 return true;
4256
4257         if (resolve_dev_console(&resolved) < 0)
4258                 return true; /* if we could not resolve, assume it may */
4259
4260         /* "tty0" means the active VC, so it may be the same sometimes */
4261         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4262 }
4263
4264 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4265         assert(ec);
4266
4267         return ec->tty_reset ||
4268                 ec->tty_vhangup ||
4269                 ec->tty_vt_disallocate ||
4270                 is_terminal_input(ec->std_input) ||
4271                 is_terminal_output(ec->std_output) ||
4272                 is_terminal_output(ec->std_error);
4273 }
4274
4275 bool exec_context_may_touch_console(const ExecContext *ec) {
4276
4277         return exec_context_may_touch_tty(ec) &&
4278                tty_may_match_dev_console(exec_context_tty_path(ec));
4279 }
4280
/* Writes every string of the NULL-terminated string array 'l' to 'f', each one prefixed with a single space.
 * A NULL array is treated like an empty one. */
static void strv_fprintf(FILE *f, char **l) {
        char **s;

        assert(f);

        for (s = l; s && *s; s++)
                fprintf(f, " %s", *s);
}
4289
4290 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4291         ExecDirectoryType dt;
4292         char **e, **d;
4293         unsigned i;
4294         int r;
4295
4296         assert(c);
4297         assert(f);
4298
4299         prefix = strempty(prefix);
4300
4301         fprintf(f,
4302                 "%sUMask: %04o\n"
4303                 "%sWorkingDirectory: %s\n"
4304                 "%sRootDirectory: %s\n"
4305                 "%sNonBlocking: %s\n"
4306                 "%sPrivateTmp: %s\n"
4307                 "%sPrivateDevices: %s\n"
4308                 "%sProtectKernelTunables: %s\n"
4309                 "%sProtectKernelModules: %s\n"
4310                 "%sProtectControlGroups: %s\n"
4311                 "%sPrivateNetwork: %s\n"
4312                 "%sPrivateUsers: %s\n"
4313                 "%sProtectHome: %s\n"
4314                 "%sProtectSystem: %s\n"
4315                 "%sMountAPIVFS: %s\n"
4316                 "%sIgnoreSIGPIPE: %s\n"
4317                 "%sMemoryDenyWriteExecute: %s\n"
4318                 "%sRestrictRealtime: %s\n"
4319                 "%sRestrictSUIDSGID: %s\n"
4320                 "%sKeyringMode: %s\n"
4321                 "%sProtectHostname: %s\n",
4322                 prefix, c->umask,
4323                 prefix, c->working_directory ? c->working_directory : "/",
4324                 prefix, c->root_directory ? c->root_directory : "/",
4325                 prefix, yes_no(c->non_blocking),
4326                 prefix, yes_no(c->private_tmp),
4327                 prefix, yes_no(c->private_devices),
4328                 prefix, yes_no(c->protect_kernel_tunables),
4329                 prefix, yes_no(c->protect_kernel_modules),
4330                 prefix, yes_no(c->protect_control_groups),
4331                 prefix, yes_no(c->private_network),
4332                 prefix, yes_no(c->private_users),
4333                 prefix, protect_home_to_string(c->protect_home),
4334                 prefix, protect_system_to_string(c->protect_system),
4335                 prefix, yes_no(c->mount_apivfs),
4336                 prefix, yes_no(c->ignore_sigpipe),
4337                 prefix, yes_no(c->memory_deny_write_execute),
4338                 prefix, yes_no(c->restrict_realtime),
4339                 prefix, yes_no(c->restrict_suid_sgid),
4340                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4341                 prefix, yes_no(c->protect_hostname));
4342
4343         if (c->root_image)
4344                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4345
4346         STRV_FOREACH(e, c->environment)
4347                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4348
4349         STRV_FOREACH(e, c->environment_files)
4350                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4351
4352         STRV_FOREACH(e, c->pass_environment)
4353                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4354
4355         STRV_FOREACH(e, c->unset_environment)
4356                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4357
4358         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4359
4360         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4361                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4362
4363                 STRV_FOREACH(d, c->directories[dt].paths)
4364                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4365         }
4366
4367         if (c->nice_set)
4368                 fprintf(f,
4369                         "%sNice: %i\n",
4370                         prefix, c->nice);
4371
4372         if (c->oom_score_adjust_set)
4373                 fprintf(f,
4374                         "%sOOMScoreAdjust: %i\n",
4375                         prefix, c->oom_score_adjust);
4376
4377         for (i = 0; i < RLIM_NLIMITS; i++)
4378                 if (c->rlimit[i]) {
4379                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4380                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4381                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4382                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4383                 }
4384
4385         if (c->ioprio_set) {
4386                 _cleanup_free_ char *class_str = NULL;
4387
4388                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4389                 if (r >= 0)
4390                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4391
4392                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4393         }
4394
4395         if (c->cpu_sched_set) {
4396                 _cleanup_free_ char *policy_str = NULL;
4397
4398                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4399                 if (r >= 0)
4400                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4401
4402                 fprintf(f,
4403                         "%sCPUSchedulingPriority: %i\n"
4404                         "%sCPUSchedulingResetOnFork: %s\n",
4405                         prefix, c->cpu_sched_priority,
4406                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4407         }
4408
4409         if (c->cpu_set.set) {
4410                 _cleanup_free_ char *affinity = NULL;
4411
4412                 affinity = cpu_set_to_range_string(&c->cpu_set);
4413                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4414         }
4415
4416         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4417                 _cleanup_free_ char *nodes = NULL;
4418
4419                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4420                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4421                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4422         }
4423
4424         if (c->timer_slack_nsec != NSEC_INFINITY)
4425                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4426
4427         fprintf(f,
4428                 "%sStandardInput: %s\n"
4429                 "%sStandardOutput: %s\n"
4430                 "%sStandardError: %s\n",
4431                 prefix, exec_input_to_string(c->std_input),
4432                 prefix, exec_output_to_string(c->std_output),
4433                 prefix, exec_output_to_string(c->std_error));
4434
4435         if (c->std_input == EXEC_INPUT_NAMED_FD)
4436                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4437         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4438                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4439         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4440                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4441
4442         if (c->std_input == EXEC_INPUT_FILE)
4443                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4444         if (c->std_output == EXEC_OUTPUT_FILE)
4445                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4446         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4447                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4448         if (c->std_error == EXEC_OUTPUT_FILE)
4449                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4450         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4451                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4452
4453         if (c->tty_path)
4454                 fprintf(f,
4455                         "%sTTYPath: %s\n"
4456                         "%sTTYReset: %s\n"
4457                         "%sTTYVHangup: %s\n"
4458                         "%sTTYVTDisallocate: %s\n",
4459                         prefix, c->tty_path,
4460                         prefix, yes_no(c->tty_reset),
4461                         prefix, yes_no(c->tty_vhangup),
4462                         prefix, yes_no(c->tty_vt_disallocate));
4463
4464         if (IN_SET(c->std_output,
4465                    EXEC_OUTPUT_SYSLOG,
4466                    EXEC_OUTPUT_KMSG,
4467                    EXEC_OUTPUT_JOURNAL,
4468                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4469                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4470                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4471             IN_SET(c->std_error,
4472                    EXEC_OUTPUT_SYSLOG,
4473                    EXEC_OUTPUT_KMSG,
4474                    EXEC_OUTPUT_JOURNAL,
4475                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4476                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4477                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4478
4479                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4480
4481                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4482                 if (r >= 0)
4483                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4484
4485                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4486                 if (r >= 0)
4487                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4488         }
4489
4490         if (c->log_level_max >= 0) {
4491                 _cleanup_free_ char *t = NULL;
4492
4493                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4494
4495                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4496         }
4497
4498         if (c->log_rate_limit_interval_usec > 0) {
4499                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4500
4501                 fprintf(f,
4502                         "%sLogRateLimitIntervalSec: %s\n",
4503                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4504         }
4505
4506         if (c->log_rate_limit_burst > 0)
4507                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4508
4509         if (c->n_log_extra_fields > 0) {
4510                 size_t j;
4511
4512                 for (j = 0; j < c->n_log_extra_fields; j++) {
4513                         fprintf(f, "%sLogExtraFields: ", prefix);
4514                         fwrite(c->log_extra_fields[j].iov_base,
4515                                1, c->log_extra_fields[j].iov_len,
4516                                f);
4517                         fputc('\n', f);
4518                 }
4519         }
4520
4521         if (c->secure_bits) {
4522                 _cleanup_free_ char *str = NULL;
4523
4524                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4525                 if (r >= 0)
4526                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4527         }
4528
4529         if (c->capability_bounding_set != CAP_ALL) {
4530                 _cleanup_free_ char *str = NULL;
4531
4532                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4533                 if (r >= 0)
4534                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4535         }
4536
4537         if (c->capability_ambient_set != 0) {
4538                 _cleanup_free_ char *str = NULL;
4539
4540                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4541                 if (r >= 0)
4542                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4543         }
4544
4545         if (c->user)
4546                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4547         if (c->group)
4548                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4549
4550         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4551
4552         if (!strv_isempty(c->supplementary_groups)) {
4553                 fprintf(f, "%sSupplementaryGroups:", prefix);
4554                 strv_fprintf(f, c->supplementary_groups);
4555                 fputs("\n", f);
4556         }
4557
4558         if (c->pam_name)
4559                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4560
4561         if (!strv_isempty(c->read_write_paths)) {
4562                 fprintf(f, "%sReadWritePaths:", prefix);
4563                 strv_fprintf(f, c->read_write_paths);
4564                 fputs("\n", f);
4565         }
4566
4567         if (!strv_isempty(c->read_only_paths)) {
4568                 fprintf(f, "%sReadOnlyPaths:", prefix);
4569                 strv_fprintf(f, c->read_only_paths);
4570                 fputs("\n", f);
4571         }
4572
4573         if (!strv_isempty(c->inaccessible_paths)) {
4574                 fprintf(f, "%sInaccessiblePaths:", prefix);
4575                 strv_fprintf(f, c->inaccessible_paths);
4576                 fputs("\n", f);
4577         }
4578
4579         if (c->n_bind_mounts > 0)
4580                 for (i = 0; i < c->n_bind_mounts; i++)
4581                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4582                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4583                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4584                                 c->bind_mounts[i].source,
4585                                 c->bind_mounts[i].destination,
4586                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4587
4588         if (c->n_temporary_filesystems > 0)
4589                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4590                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4591
4592                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4593                                 t->path,
4594                                 isempty(t->options) ? "" : ":",
4595                                 strempty(t->options));
4596                 }
4597
4598         if (c->utmp_id)
4599                 fprintf(f,
4600                         "%sUtmpIdentifier: %s\n",
4601                         prefix, c->utmp_id);
4602
4603         if (c->selinux_context)
4604                 fprintf(f,
4605                         "%sSELinuxContext: %s%s\n",
4606                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4607
4608         if (c->apparmor_profile)
4609                 fprintf(f,
4610                         "%sAppArmorProfile: %s%s\n",
4611                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4612
4613         if (c->smack_process_label)
4614                 fprintf(f,
4615                         "%sSmackProcessLabel: %s%s\n",
4616                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4617
4618         if (c->personality != PERSONALITY_INVALID)
4619                 fprintf(f,
4620                         "%sPersonality: %s\n",
4621                         prefix, strna(personality_to_string(c->personality)));
4622
4623         fprintf(f,
4624                 "%sLockPersonality: %s\n",
4625                 prefix, yes_no(c->lock_personality));
4626
4627         if (c->syscall_filter) {
4628 #if HAVE_SECCOMP
4629                 Iterator j;
4630                 void *id, *val;
4631                 bool first = true;
4632 #endif
4633
4634                 fprintf(f,
4635                         "%sSystemCallFilter: ",
4636                         prefix);
4637
4638                 if (!c->syscall_whitelist)
4639                         fputc('~', f);
4640
4641 #if HAVE_SECCOMP
4642                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4643                         _cleanup_free_ char *name = NULL;
4644                         const char *errno_name = NULL;
4645                         int num = PTR_TO_INT(val);
4646
4647                         if (first)
4648                                 first = false;
4649                         else
4650                                 fputc(' ', f);
4651
4652                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4653                         fputs(strna(name), f);
4654
4655                         if (num >= 0) {
4656                                 errno_name = errno_to_name(num);
4657                                 if (errno_name)
4658                                         fprintf(f, ":%s", errno_name);
4659                                 else
4660                                         fprintf(f, ":%d", num);
4661                         }
4662                 }
4663 #endif
4664
4665                 fputc('\n', f);
4666         }
4667
4668         if (c->syscall_archs) {
4669 #if HAVE_SECCOMP
4670                 Iterator j;
4671                 void *id;
4672 #endif
4673
4674                 fprintf(f,
4675                         "%sSystemCallArchitectures:",
4676                         prefix);
4677
4678 #if HAVE_SECCOMP
4679                 SET_FOREACH(id, c->syscall_archs, j)
4680                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4681 #endif
4682                 fputc('\n', f);
4683         }
4684
4685         if (exec_context_restrict_namespaces_set(c)) {
4686                 _cleanup_free_ char *s = NULL;
4687
4688                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4689                 if (r >= 0)
4690                         fprintf(f, "%sRestrictNamespaces: %s\n",
4691                                 prefix, s);
4692         }
4693
4694         if (c->network_namespace_path)
4695                 fprintf(f,
4696                         "%sNetworkNamespacePath: %s\n",
4697                         prefix, c->network_namespace_path);
4698
4699         if (c->syscall_errno > 0) {
4700                 const char *errno_name;
4701
4702                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4703
4704                 errno_name = errno_to_name(c->syscall_errno);
4705                 if (errno_name)
4706                         fprintf(f, "%s\n", errno_name);
4707                 else
4708                         fprintf(f, "%d\n", c->syscall_errno);
4709         }
4710 }
4711
4712 bool exec_context_maintains_privileges(const ExecContext *c) {
4713         assert(c);
4714
4715         /* Returns true if the process forked off would run under
4716          * an unchanged UID or as root. */
4717
4718         if (!c->user)
4719                 return true;
4720
4721         if (streq(c->user, "root") || streq(c->user, "0"))
4722                 return true;
4723
4724         return false;
4725 }
4726
4727 int exec_context_get_effective_ioprio(const ExecContext *c) {
4728         int p;
4729
4730         assert(c);
4731
4732         if (c->ioprio_set)
4733                 return c->ioprio;
4734
4735         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4736         if (p < 0)
4737                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4738
4739         return p;
4740 }
4741
4742 void exec_context_free_log_extra_fields(ExecContext *c) {
4743         size_t l;
4744
4745         assert(c);
4746
4747         for (l = 0; l < c->n_log_extra_fields; l++)
4748                 free(c->log_extra_fields[l].iov_base);
4749         c->log_extra_fields = mfree(c->log_extra_fields);
4750         c->n_log_extra_fields = 0;
4751 }
4752
4753 void exec_context_revert_tty(ExecContext *c) {
4754         int r;
4755
4756         assert(c);
4757
4758         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4759         exec_context_tty_reset(c, NULL);
4760
4761         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4762          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4763          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4764
4765         if (exec_context_may_touch_tty(c)) {
4766                 const char *path;
4767
4768                 path = exec_context_tty_path(c);
4769                 if (path) {
4770                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4771                         if (r < 0 && r != -ENOENT)
4772                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4773                 }
4774         }
4775 }
4776
4777 int exec_context_get_clean_directories(
4778                 ExecContext *c,
4779                 char **prefix,
4780                 ExecCleanMask mask,
4781                 char ***ret) {
4782
4783         _cleanup_strv_free_ char **l = NULL;
4784         ExecDirectoryType t;
4785         int r;
4786
4787         assert(c);
4788         assert(prefix);
4789         assert(ret);
4790
4791         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4792                 char **i;
4793
4794                 if (!FLAGS_SET(mask, 1U << t))
4795                         continue;
4796
4797                 if (!prefix[t])
4798                         continue;
4799
4800                 STRV_FOREACH(i, c->directories[t].paths) {
4801                         char *j;
4802
4803                         j = path_join(prefix[t], *i);
4804                         if (!j)
4805                                 return -ENOMEM;
4806
4807                         r = strv_consume(&l, j);
4808                         if (r < 0)
4809                                 return r;
4810                 }
4811         }
4812
4813         *ret = TAKE_PTR(l);
4814         return 0;
4815 }
4816
4817 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4818         ExecCleanMask mask = 0;
4819
4820         assert(c);
4821         assert(ret);
4822
4823         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4824                 if (!strv_isempty(c->directories[t].paths))
4825                         mask |= 1U << t;
4826
4827         *ret = mask;
4828         return 0;
4829 }
4830
4831 void exec_status_start(ExecStatus *s, pid_t pid) {
4832         assert(s);
4833
4834         *s = (ExecStatus) {
4835                 .pid = pid,
4836         };
4837
4838         dual_timestamp_get(&s->start_timestamp);
4839 }
4840
4841 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4842         assert(s);
4843
4844         if (s->pid != pid) {
4845                 *s = (ExecStatus) {
4846                         .pid = pid,
4847                 };
4848         }
4849
4850         dual_timestamp_get(&s->exit_timestamp);
4851
4852         s->code = code;
4853         s->status = status;
4854
4855         if (context && context->utmp_id)
4856                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4857 }
4858
4859 void exec_status_reset(ExecStatus *s) {
4860         assert(s);
4861
4862         *s = (ExecStatus) {};
4863 }
4864
4865 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4866         char buf[FORMAT_TIMESTAMP_MAX];
4867
4868         assert(s);
4869         assert(f);
4870
4871         if (s->pid <= 0)
4872                 return;
4873
4874         prefix = strempty(prefix);
4875
4876         fprintf(f,
4877                 "%sPID: "PID_FMT"\n",
4878                 prefix, s->pid);
4879
4880         if (dual_timestamp_is_set(&s->start_timestamp))
4881                 fprintf(f,
4882                         "%sStart Timestamp: %s\n",
4883                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4884
4885         if (dual_timestamp_is_set(&s->exit_timestamp))
4886                 fprintf(f,
4887                         "%sExit Timestamp: %s\n"
4888                         "%sExit Code: %s\n"
4889                         "%sExit Status: %i\n",
4890                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4891                         prefix, sigchld_code_to_string(s->code),
4892                         prefix, s->status);
4893 }
4894
4895 static char *exec_command_line(char **argv) {
4896         size_t k;
4897         char *n, *p, **a;
4898         bool first = true;
4899
4900         assert(argv);
4901
4902         k = 1;
4903         STRV_FOREACH(a, argv)
4904                 k += strlen(*a)+3;
4905
4906         n = new(char, k);
4907         if (!n)
4908                 return NULL;
4909
4910         p = n;
4911         STRV_FOREACH(a, argv) {
4912
4913                 if (!first)
4914                         *(p++) = ' ';
4915                 else
4916                         first = false;
4917
4918                 if (strpbrk(*a, WHITESPACE)) {
4919                         *(p++) = '\'';
4920                         p = stpcpy(p, *a);
4921                         *(p++) = '\'';
4922                 } else
4923                         p = stpcpy(p, *a);
4924
4925         }
4926
4927         *p = 0;
4928
4929         /* FIXME: this doesn't really handle arguments that have
4930          * spaces and ticks in them */
4931
4932         return n;
4933 }
4934
4935 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4936         _cleanup_free_ char *cmd = NULL;
4937         const char *prefix2;
4938
4939         assert(c);
4940         assert(f);
4941
4942         prefix = strempty(prefix);
4943         prefix2 = strjoina(prefix, "\t");
4944
4945         cmd = exec_command_line(c->argv);
4946         fprintf(f,
4947                 "%sCommand Line: %s\n",
4948                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4949
4950         exec_status_dump(&c->exec_status, f, prefix2);
4951 }
4952
4953 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4954         assert(f);
4955
4956         prefix = strempty(prefix);
4957
4958         LIST_FOREACH(command, c, c)
4959                 exec_command_dump(c, f, prefix);
4960 }
4961
4962 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4963         ExecCommand *end;
4964
4965         assert(l);
4966         assert(e);
4967
4968         if (*l) {
4969                 /* It's kind of important, that we keep the order here */
4970                 LIST_FIND_TAIL(command, *l, end);
4971                 LIST_INSERT_AFTER(command, *l, end, e);
4972         } else
4973               *l = e;
4974 }
4975
4976 int exec_command_set(ExecCommand *c, const char *path, ...) {
4977         va_list ap;
4978         char **l, *p;
4979
4980         assert(c);
4981         assert(path);
4982
4983         va_start(ap, path);
4984         l = strv_new_ap(path, ap);
4985         va_end(ap);
4986
4987         if (!l)
4988                 return -ENOMEM;
4989
4990         p = strdup(path);
4991         if (!p) {
4992                 strv_free(l);
4993                 return -ENOMEM;
4994         }
4995
4996         free_and_replace(c->path, p);
4997
4998         return strv_free_and_replace(c->argv, l);
4999 }
5000
5001 int exec_command_append(ExecCommand *c, const char *path, ...) {
5002         _cleanup_strv_free_ char **l = NULL;
5003         va_list ap;
5004         int r;
5005
5006         assert(c);
5007         assert(path);
5008
5009         va_start(ap, path);
5010         l = strv_new_ap(path, ap);
5011         va_end(ap);
5012
5013         if (!l)
5014                 return -ENOMEM;
5015
5016         r = strv_extend_strv(&c->argv, l, false);
5017         if (r < 0)
5018                 return r;
5019
5020         return 0;
5021 }
5022
5023 static void *remove_tmpdir_thread(void *p) {
5024         _cleanup_free_ char *path = p;
5025
5026         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5027         return NULL;
5028 }
5029
5030 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5031         int r;
5032
5033         if (!rt)
5034                 return NULL;
5035
5036         if (rt->manager)
5037                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5038
5039         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5040         if (destroy && rt->tmp_dir) {
5041                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5042
5043                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5044                 if (r < 0) {
5045                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5046                         free(rt->tmp_dir);
5047                 }
5048
5049                 rt->tmp_dir = NULL;
5050         }
5051
5052         if (destroy && rt->var_tmp_dir) {
5053                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5054
5055                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5056                 if (r < 0) {
5057                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5058                         free(rt->var_tmp_dir);
5059                 }
5060
5061                 rt->var_tmp_dir = NULL;
5062         }
5063
5064         rt->id = mfree(rt->id);
5065         rt->tmp_dir = mfree(rt->tmp_dir);
5066         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5067         safe_close_pair(rt->netns_storage_socket);
5068         return mfree(rt);
5069 }
5070
5071 static void exec_runtime_freep(ExecRuntime **rt) {
5072         (void) exec_runtime_free(*rt, false);
5073 }
5074
5075 static int exec_runtime_allocate(ExecRuntime **ret) {
5076         ExecRuntime *n;
5077
5078         assert(ret);
5079
5080         n = new(ExecRuntime, 1);
5081         if (!n)
5082                 return -ENOMEM;
5083
5084         *n = (ExecRuntime) {
5085                 .netns_storage_socket = { -1, -1 },
5086         };
5087
5088         *ret = n;
5089         return 0;
5090 }
5091
5092 static int exec_runtime_add(
5093                 Manager *m,
5094                 const char *id,
5095                 const char *tmp_dir,
5096                 const char *var_tmp_dir,
5097                 const int netns_storage_socket[2],
5098                 ExecRuntime **ret) {
5099
5100         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5101         int r;
5102
5103         assert(m);
5104         assert(id);
5105
5106         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5107         if (r < 0)
5108                 return r;
5109
5110         r = exec_runtime_allocate(&rt);
5111         if (r < 0)
5112                 return r;
5113
5114         rt->id = strdup(id);
5115         if (!rt->id)
5116                 return -ENOMEM;
5117
5118         if (tmp_dir) {
5119                 rt->tmp_dir = strdup(tmp_dir);
5120                 if (!rt->tmp_dir)
5121                         return -ENOMEM;
5122
5123                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5124                 assert(var_tmp_dir);
5125                 rt->var_tmp_dir = strdup(var_tmp_dir);
5126                 if (!rt->var_tmp_dir)
5127                         return -ENOMEM;
5128         }
5129
5130         if (netns_storage_socket) {
5131                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5132                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5133         }
5134
5135         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5136         if (r < 0)
5137                 return r;
5138
5139         rt->manager = m;
5140
5141         if (ret)
5142                 *ret = rt;
5143
5144         /* do not remove created ExecRuntime object when the operation succeeds. */
5145         rt = NULL;
5146         return 0;
5147 }
5148
5149 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5150         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5151         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5152         int r;
5153
5154         assert(m);
5155         assert(c);
5156         assert(id);
5157
5158         /* It is not necessary to create ExecRuntime object. */
5159         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5160                 return 0;
5161
5162         if (c->private_tmp) {
5163                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5164                 if (r < 0)
5165                         return r;
5166         }
5167
5168         if (c->private_network || c->network_namespace_path) {
5169                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5170                         return -errno;
5171         }
5172
5173         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5174         if (r < 0)
5175                 return r;
5176
5177         /* Avoid cleanup */
5178         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5179         return 1;
5180 }
5181
5182 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5183         ExecRuntime *rt;
5184         int r;
5185
5186         assert(m);
5187         assert(id);
5188         assert(ret);
5189
5190         rt = hashmap_get(m->exec_runtime_by_id, id);
5191         if (rt)
5192                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5193                 goto ref;
5194
5195         if (!create)
5196                 return 0;
5197
5198         /* If not found, then create a new object. */
5199         r = exec_runtime_make(m, c, id, &rt);
5200         if (r <= 0)
5201                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5202                 return r;
5203
5204 ref:
5205         /* increment reference counter. */
5206         rt->n_ref++;
5207         *ret = rt;
5208         return 1;
5209 }
5210
5211 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5212         if (!rt)
5213                 return NULL;
5214
5215         assert(rt->n_ref > 0);
5216
5217         rt->n_ref--;
5218         if (rt->n_ref > 0)
5219                 return NULL;
5220
5221         return exec_runtime_free(rt, destroy);
5222 }
5223
5224 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5225         ExecRuntime *rt;
5226         Iterator i;
5227
5228         assert(m);
5229         assert(f);
5230         assert(fds);
5231
5232         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5233                 fprintf(f, "exec-runtime=%s", rt->id);
5234
5235                 if (rt->tmp_dir)
5236                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5237
5238                 if (rt->var_tmp_dir)
5239                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5240
5241                 if (rt->netns_storage_socket[0] >= 0) {
5242                         int copy;
5243
5244                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5245                         if (copy < 0)
5246                                 return copy;
5247
5248                         fprintf(f, " netns-socket-0=%i", copy);
5249                 }
5250
5251                 if (rt->netns_storage_socket[1] >= 0) {
5252                         int copy;
5253
5254                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5255                         if (copy < 0)
5256                                 return copy;
5257
5258                         fprintf(f, " netns-socket-1=%i", copy);
5259                 }
5260
5261                 fputc('\n', f);
5262         }
5263
5264         return 0;
5265 }
5266
5267 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5268         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5269         ExecRuntime *rt;
5270         int r;
5271
5272         /* This is for the migration from old (v237 or earlier) deserialization text.
5273          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5274          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5275          * so or not from the serialized text, then we always creates a new object owned by this. */
5276
5277         assert(u);
5278         assert(key);
5279         assert(value);
5280
5281         /* Manager manages ExecRuntime objects by the unit id.
5282          * So, we omit the serialized text when the unit does not have id (yet?)... */
5283         if (isempty(u->id)) {
5284                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5285                 return 0;
5286         }
5287
5288         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5289         if (r < 0) {
5290                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5291                 return 0;
5292         }
5293
5294         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5295         if (!rt) {
5296                 r = exec_runtime_allocate(&rt_create);
5297                 if (r < 0)
5298                         return log_oom();
5299
5300                 rt_create->id = strdup(u->id);
5301                 if (!rt_create->id)
5302                         return log_oom();
5303
5304                 rt = rt_create;
5305         }
5306
5307         if (streq(key, "tmp-dir")) {
5308                 char *copy;
5309
5310                 copy = strdup(value);
5311                 if (!copy)
5312                         return log_oom();
5313
5314                 free_and_replace(rt->tmp_dir, copy);
5315
5316         } else if (streq(key, "var-tmp-dir")) {
5317                 char *copy;
5318
5319                 copy = strdup(value);
5320                 if (!copy)
5321                         return log_oom();
5322
5323                 free_and_replace(rt->var_tmp_dir, copy);
5324
5325         } else if (streq(key, "netns-socket-0")) {
5326                 int fd;
5327
5328                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5329                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5330                         return 0;
5331                 }
5332
5333                 safe_close(rt->netns_storage_socket[0]);
5334                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5335
5336         } else if (streq(key, "netns-socket-1")) {
5337                 int fd;
5338
5339                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5340                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5341                         return 0;
5342                 }
5343
5344                 safe_close(rt->netns_storage_socket[1]);
5345                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5346         } else
5347                 return 0;
5348
5349         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5350         if (rt_create) {
5351                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5352                 if (r < 0) {
5353                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5354                         return 0;
5355                 }
5356
5357                 rt_create->manager = u->manager;
5358
5359                 /* Avoid cleanup */
5360                 rt_create = NULL;
5361         }
5362
5363         return 1;
5364 }
5365
5366 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5367         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5368         int r, fd0 = -1, fd1 = -1;
5369         const char *p, *v = value;
5370         size_t n;
5371
5372         assert(m);
5373         assert(value);
5374         assert(fds);
5375
5376         n = strcspn(v, " ");
5377         id = strndupa(v, n);
5378         if (v[n] != ' ')
5379                 goto finalize;
5380         p = v + n + 1;
5381
5382         v = startswith(p, "tmp-dir=");
5383         if (v) {
5384                 n = strcspn(v, " ");
5385                 tmp_dir = strndupa(v, n);
5386                 if (v[n] != ' ')
5387                         goto finalize;
5388                 p = v + n + 1;
5389         }
5390
5391         v = startswith(p, "var-tmp-dir=");
5392         if (v) {
5393                 n = strcspn(v, " ");
5394                 var_tmp_dir = strndupa(v, n);
5395                 if (v[n] != ' ')
5396                         goto finalize;
5397                 p = v + n + 1;
5398         }
5399
5400         v = startswith(p, "netns-socket-0=");
5401         if (v) {
5402                 char *buf;
5403
5404                 n = strcspn(v, " ");
5405                 buf = strndupa(v, n);
5406                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5407                         log_debug("Unable to process exec-runtime netns fd specification.");
5408                         return;
5409                 }
5410                 fd0 = fdset_remove(fds, fd0);
5411                 if (v[n] != ' ')
5412                         goto finalize;
5413                 p = v + n + 1;
5414         }
5415
5416         v = startswith(p, "netns-socket-1=");
5417         if (v) {
5418                 char *buf;
5419
5420                 n = strcspn(v, " ");
5421                 buf = strndupa(v, n);
5422                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5423                         log_debug("Unable to process exec-runtime netns fd specification.");
5424                         return;
5425                 }
5426                 fd1 = fdset_remove(fds, fd1);
5427         }
5428
5429 finalize:
5430
5431         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5432         if (r < 0)
5433                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5434 }
5435
5436 void exec_runtime_vacuum(Manager *m) {
5437         ExecRuntime *rt;
5438         Iterator i;
5439
5440         assert(m);
5441
5442         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5443
5444         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5445                 if (rt->n_ref > 0)
5446                         continue;
5447
5448                 (void) exec_runtime_free(rt, false);
5449         }
5450 }
5451
5452 void exec_params_clear(ExecParameters *p) {
5453         if (!p)
5454                 return;
5455
5456         strv_free(p->environment);
5457 }
5458
5459 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5460         [EXEC_INPUT_NULL] = "null",
5461         [EXEC_INPUT_TTY] = "tty",
5462         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5463         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5464         [EXEC_INPUT_SOCKET] = "socket",
5465         [EXEC_INPUT_NAMED_FD] = "fd",
5466         [EXEC_INPUT_DATA] = "data",
5467         [EXEC_INPUT_FILE] = "file",
5468 };
5469
5470 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5471
5472 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5473         [EXEC_OUTPUT_INHERIT] = "inherit",
5474         [EXEC_OUTPUT_NULL] = "null",
5475         [EXEC_OUTPUT_TTY] = "tty",
5476         [EXEC_OUTPUT_SYSLOG] = "syslog",
5477         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5478         [EXEC_OUTPUT_KMSG] = "kmsg",
5479         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5480         [EXEC_OUTPUT_JOURNAL] = "journal",
5481         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5482         [EXEC_OUTPUT_SOCKET] = "socket",
5483         [EXEC_OUTPUT_NAMED_FD] = "fd",
5484         [EXEC_OUTPUT_FILE] = "file",
5485         [EXEC_OUTPUT_FILE_APPEND] = "append",
5486 };
5487
5488 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5489
5490 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5491         [EXEC_UTMP_INIT] = "init",
5492         [EXEC_UTMP_LOGIN] = "login",
5493         [EXEC_UTMP_USER] = "user",
5494 };
5495
5496 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5497
5498 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5499         [EXEC_PRESERVE_NO] = "no",
5500         [EXEC_PRESERVE_YES] = "yes",
5501         [EXEC_PRESERVE_RESTART] = "restart",
5502 };
5503
5504 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5505
5506 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5507 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5508         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5509         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5510         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5511         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5512         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5513 };
5514
5515 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5516
5517 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5518  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5519  * directories, specifically .timer units with their timestamp touch file. */
5520 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5521         [EXEC_DIRECTORY_RUNTIME] = "runtime",
5522         [EXEC_DIRECTORY_STATE] = "state",
5523         [EXEC_DIRECTORY_CACHE] = "cache",
5524         [EXEC_DIRECTORY_LOGS] = "logs",
5525         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5526 };
5527
5528 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5529
5530 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5531  * the service payload in. */
5532 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5533         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5534         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5535         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5536         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5537         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5538 };
5539
5540 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5541
5542 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5543         [EXEC_KEYRING_INHERIT] = "inherit",
5544         [EXEC_KEYRING_PRIVATE] = "private",
5545         [EXEC_KEYRING_SHARED] = "shared",
5546 };
5547
5548 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);