src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "execute.h"
  57 #include "exit-status.h"
  58 #include "fd-util.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "memory-util.h"
  69 #include "missing.h"
  70 #include "mkdir.h"
  71 #include "namespace.h"
  72 #include "parse-util.h"
  73 #include "path-util.h"
  74 #include "process-util.h"
  75 #include "rlimit-util.h"
  76 #include "rm-rf.h"
  77 #if HAVE_SECCOMP
  78 #include "seccomp-util.h"
  79 #endif
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "utmp-wtmp.h"
  96
/* Idle timeouts of 5x and 1x USEC_PER_SEC (presumably microseconds per second, i.e. 5 s and 1 s).
 * NOTE(review): the code using these is outside the part of the file shown here — confirm their purpose. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Size in bytes (8 * 1024 * 1024) passed to fd_inc_sndbuf() for the journal stream socket
 * opened in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 101
 102 static int shift_fds(int fds[], size_t n_fds) {
 103         int start, restart_from;
 104
 105         if (n_fds <= 0)
 106                 return 0;
 107
 108         /* Modifies the fds array! (sorts it) */
 109
 110         assert(fds);
 111
 112         start = 0;
 113         for (;;) {
 114                 int i;
 115
 116                 restart_from = -1;
 117
 118                 for (i = start; i < (int) n_fds; i++) {
 119                         int nfd;
 120
 121                         /* Already at right index? */
 122                         if (fds[i] == i+3)
 123                                 continue;
 124
 125                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 126                         if (nfd < 0)
 127                                 return -errno;
 128
 129                         safe_close(fds[i]);
 130                         fds[i] = nfd;
 131
 132                         /* Hmm, the fd we wanted isn't free? Then
 133                          * let's remember that and try again from here */
 134                         if (nfd != i+3 && restart_from < 0)
 135                                 restart_from = i;
 136                 }
 137
 138                 if (restart_from < 0)
 139                         break;
 140
 141                 start = restart_from;
 142         }
 143
 144         return 0;
 145 }
 146
 147 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 148         size_t i, n_fds;
 149         int r;
 150
 151         n_fds = n_socket_fds + n_storage_fds;
 152         if (n_fds <= 0)
 153                 return 0;
 154
 155         assert(fds);
 156
 157         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 158          * O_NONBLOCK only applies to socket activation though. */
 159
 160         for (i = 0; i < n_fds; i++) {
 161
 162                 if (i < n_socket_fds) {
 163                         r = fd_nonblock(fds[i], nonblock);
 164                         if (r < 0)
 165                                 return r;
 166                 }
 167
 168                 /* We unconditionally drop FD_CLOEXEC from the fds,
 169                  * since after all we want to pass these fds to our
 170                  * children */
 171
 172                 r = fd_cloexec(fds[i], false);
 173                 if (r < 0)
 174                         return r;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static const char *exec_context_tty_path(const ExecContext *context) {
 181         assert(context);
 182
 183         if (context->stdio_as_fds)
 184                 return NULL;
 185
 186         if (context->tty_path)
 187                 return context->tty_path;
 188
 189         return "/dev/console";
 190 }
 191
 192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 193         const char *path;
 194
 195         assert(context);
 196
 197         path = exec_context_tty_path(context);
 198
 199         if (context->tty_vhangup) {
 200                 if (p && p->stdin_fd >= 0)
 201                         (void) terminal_vhangup_fd(p->stdin_fd);
 202                 else if (path)
 203                         (void) terminal_vhangup(path);
 204         }
 205
 206         if (context->tty_reset) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) reset_terminal_fd(p->stdin_fd, true);
 209                 else if (path)
 210                         (void) reset_terminal(path);
 211         }
 212
 213         if (context->tty_vt_disallocate && path)
 214                 (void) vt_disallocate(path);
 215 }
 216
 217 static bool is_terminal_input(ExecInput i) {
 218         return IN_SET(i,
 219                       EXEC_INPUT_TTY,
 220                       EXEC_INPUT_TTY_FORCE,
 221                       EXEC_INPUT_TTY_FAIL);
 222 }
 223
 224 static bool is_terminal_output(ExecOutput o) {
 225         return IN_SET(o,
 226                       EXEC_OUTPUT_TTY,
 227                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 228                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 229                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 230 }
 231
 232 static bool is_syslog_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_SYSLOG,
 235                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 236 }
 237
 238 static bool is_kmsg_output(ExecOutput o) {
 239         return IN_SET(o,
 240                       EXEC_OUTPUT_KMSG,
 241                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 242 }
 243
 244 static bool exec_context_needs_term(const ExecContext *c) {
 245         assert(c);
 246
 247         /* Return true if the execution context suggests we should set $TERM to something useful. */
 248
 249         if (is_terminal_input(c->std_input))
 250                 return true;
 251
 252         if (is_terminal_output(c->std_output))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_error))
 256                 return true;
 257
 258         return !!c->tty_path;
 259 }
 260
 261 static int open_null_as(int flags, int nfd) {
 262         int fd;
 263
 264         assert(nfd >= 0);
 265
 266         fd = open("/dev/null", flags|O_NOCTTY);
 267         if (fd < 0)
 268                 return -errno;
 269
 270         return move_fd(fd, nfd, false);
 271 }
 272
 273 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 274         static const union sockaddr_union sa = {
 275                 .un.sun_family = AF_UNIX,
 276                 .un.sun_path = "/run/systemd/journal/stdout",
 277         };
 278         uid_t olduid = UID_INVALID;
 279         gid_t oldgid = GID_INVALID;
 280         int r;
 281
 282         if (gid_is_valid(gid)) {
 283                 oldgid = getgid();
 284
 285                 if (setegid(gid) < 0)
 286                         return -errno;
 287         }
 288
 289         if (uid_is_valid(uid)) {
 290                 olduid = getuid();
 291
 292                 if (seteuid(uid) < 0) {
 293                         r = -errno;
 294                         goto restore_gid;
 295                 }
 296         }
 297
 298         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 299
 300         /* If we fail to restore the uid or gid, things will likely
 301            fail later on. This should only happen if an LSM interferes. */
 302
 303         if (uid_is_valid(uid))
 304                 (void) seteuid(olduid);
 305
 306  restore_gid:
 307         if (gid_is_valid(gid))
 308                 (void) setegid(oldgid);
 309
 310         return r;
 311 }
 312
 313 static int connect_logger_as(
 314                 const Unit *unit,
 315                 const ExecContext *context,
 316                 const ExecParameters *params,
 317                 ExecOutput output,
 318                 const char *ident,
 319                 int nfd,
 320                 uid_t uid,
 321                 gid_t gid) {
 322
 323         _cleanup_close_ int fd = -1;
 324         int r;
 325
 326         assert(context);
 327         assert(params);
 328         assert(output < _EXEC_OUTPUT_MAX);
 329         assert(ident);
 330         assert(nfd >= 0);
 331
 332         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 333         if (fd < 0)
 334                 return -errno;
 335
 336         r = connect_journal_socket(fd, uid, gid);
 337         if (r < 0)
 338                 return r;
 339
 340         if (shutdown(fd, SHUT_RD) < 0)
 341                 return -errno;
 342
 343         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 344
 345         if (dprintf(fd,
 346                 "%s\n"
 347                 "%s\n"
 348                 "%i\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n",
 353                 context->syslog_identifier ?: ident,
 354                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 355                 context->syslog_priority,
 356                 !!context->syslog_level_prefix,
 357                 is_syslog_output(output),
 358                 is_kmsg_output(output),
 359                 is_terminal_output(output)) < 0)
 360                 return -errno;
 361
 362         return move_fd(TAKE_FD(fd), nfd, false);
 363 }
 364
 365 static int open_terminal_as(const char *path, int flags, int nfd) {
 366         int fd;
 367
 368         assert(path);
 369         assert(nfd >= 0);
 370
 371         fd = open_terminal(path, flags | O_NOCTTY);
 372         if (fd < 0)
 373                 return fd;
 374
 375         return move_fd(fd, nfd, false);
 376 }
 377
 378 static int acquire_path(const char *path, int flags, mode_t mode) {
 379         union sockaddr_union sa = {};
 380         _cleanup_close_ int fd = -1;
 381         int r, salen;
 382
 383         assert(path);
 384
 385         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 386                 flags |= O_CREAT;
 387
 388         fd = open(path, flags|O_NOCTTY, mode);
 389         if (fd >= 0)
 390                 return TAKE_FD(fd);
 391
 392         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 393                 return -errno;
 394         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 395                 return -ENXIO;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 400         if (fd < 0)
 401                 return -errno;
 402
 403         salen = sockaddr_un_set_path(&sa.un, path);
 404         if (salen < 0)
 405                 return salen;
 406
 407         if (connect(fd, &sa.sa, salen) < 0)
 408                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 409                                                            * indication that his wasn't an AF_UNIX socket after all */
 410
 411         if ((flags & O_ACCMODE) == O_RDONLY)
 412                 r = shutdown(fd, SHUT_WR);
 413         else if ((flags & O_ACCMODE) == O_WRONLY)
 414                 r = shutdown(fd, SHUT_RD);
 415         else
 416                 return TAKE_FD(fd);
 417         if (r < 0)
 418                 return -errno;
 419
 420         return TAKE_FD(fd);
 421 }
 422
 423 static int fixup_input(
 424                 const ExecContext *context,
 425                 int socket_fd,
 426                 bool apply_tty_stdin) {
 427
 428         ExecInput std_input;
 429
 430         assert(context);
 431
 432         std_input = context->std_input;
 433
 434         if (is_terminal_input(std_input) && !apply_tty_stdin)
 435                 return EXEC_INPUT_NULL;
 436
 437         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         return std_input;
 444 }
 445
 446 static int fixup_output(ExecOutput std_output, int socket_fd) {
 447
 448         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 449                 return EXEC_OUTPUT_INHERIT;
 450
 451         return std_output;
 452 }
 453
 454 static int setup_input(
 455                 const ExecContext *context,
 456                 const ExecParameters *params,
 457                 int socket_fd,
 458                 const int named_iofds[static 3]) {
 459
 460         ExecInput i;
 461
 462         assert(context);
 463         assert(params);
 464         assert(named_iofds);
 465
 466         if (params->stdin_fd >= 0) {
 467                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 468                         return -errno;
 469
 470                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 471                 if (isatty(STDIN_FILENO)) {
 472                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 473                         (void) reset_terminal_fd(STDIN_FILENO, true);
 474                 }
 475
 476                 return STDIN_FILENO;
 477         }
 478
 479         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 480
 481         switch (i) {
 482
 483         case EXEC_INPUT_NULL:
 484                 return open_null_as(O_RDONLY, STDIN_FILENO);
 485
 486         case EXEC_INPUT_TTY:
 487         case EXEC_INPUT_TTY_FORCE:
 488         case EXEC_INPUT_TTY_FAIL: {
 489                 int fd;
 490
 491                 fd = acquire_terminal(exec_context_tty_path(context),
 492                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 493                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 494                                                                   ACQUIRE_TERMINAL_WAIT,
 495                                       USEC_INFINITY);
 496                 if (fd < 0)
 497                         return fd;
 498
 499                 return move_fd(fd, STDIN_FILENO, false);
 500         }
 501
 502         case EXEC_INPUT_SOCKET:
 503                 assert(socket_fd >= 0);
 504
 505                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 506
 507         case EXEC_INPUT_NAMED_FD:
 508                 assert(named_iofds[STDIN_FILENO] >= 0);
 509
 510                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 511                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 512
 513         case EXEC_INPUT_DATA: {
 514                 int fd;
 515
 516                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 517                 if (fd < 0)
 518                         return fd;
 519
 520                 return move_fd(fd, STDIN_FILENO, false);
 521         }
 522
 523         case EXEC_INPUT_FILE: {
 524                 bool rw;
 525                 int fd;
 526
 527                 assert(context->stdio_file[STDIN_FILENO]);
 528
 529                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 530                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 531
 532                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 533                 if (fd < 0)
 534                         return fd;
 535
 536                 return move_fd(fd, STDIN_FILENO, false);
 537         }
 538
 539         default:
 540                 assert_not_reached("Unknown input type");
 541         }
 542 }
 543
 544 static bool can_inherit_stderr_from_stdout(
 545                 const ExecContext *context,
 546                 ExecOutput o,
 547                 ExecOutput e) {
 548
 549         assert(context);
 550
 551         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 552          * stderr fd */
 553
 554         if (e == EXEC_OUTPUT_INHERIT)
 555                 return true;
 556         if (e != o)
 557                 return false;
 558
 559         if (e == EXEC_OUTPUT_NAMED_FD)
 560                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 561
 562         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 563                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 564
 565         return true;
 566 }
 567
 568 static int setup_output(
 569                 const Unit *unit,
 570                 const ExecContext *context,
 571                 const ExecParameters *params,
 572                 int fileno,
 573                 int socket_fd,
 574                 const int named_iofds[static 3],
 575                 const char *ident,
 576                 uid_t uid,
 577                 gid_t gid,
 578                 dev_t *journal_stream_dev,
 579                 ino_t *journal_stream_ino) {
 580
 581         ExecOutput o;
 582         ExecInput i;
 583         int r;
 584
 585         assert(unit);
 586         assert(context);
 587         assert(params);
 588         assert(ident);
 589         assert(journal_stream_dev);
 590         assert(journal_stream_ino);
 591
 592         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 593
 594                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 595                         return -errno;
 596
 597                 return STDOUT_FILENO;
 598         }
 599
 600         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 601                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 602                         return -errno;
 603
 604                 return STDERR_FILENO;
 605         }
 606
 607         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 608         o = fixup_output(context->std_output, socket_fd);
 609
 610         if (fileno == STDERR_FILENO) {
 611                 ExecOutput e;
 612                 e = fixup_output(context->std_error, socket_fd);
 613
 614                 /* This expects the input and output are already set up */
 615
 616                 /* Don't change the stderr file descriptor if we inherit all
 617                  * the way and are not on a tty */
 618                 if (e == EXEC_OUTPUT_INHERIT &&
 619                     o == EXEC_OUTPUT_INHERIT &&
 620                     i == EXEC_INPUT_NULL &&
 621                     !is_terminal_input(context->std_input) &&
 622                     getppid () != 1)
 623                         return fileno;
 624
 625                 /* Duplicate from stdout if possible */
 626                 if (can_inherit_stderr_from_stdout(context, o, e))
 627                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 628
 629                 o = e;
 630
 631         } else if (o == EXEC_OUTPUT_INHERIT) {
 632                 /* If input got downgraded, inherit the original value */
 633                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 634                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 635
 636                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 637                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 638                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 639
 640                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 641                 if (getppid() != 1)
 642                         return fileno;
 643
 644                 /* We need to open /dev/null here anew, to get the right access mode. */
 645                 return open_null_as(O_WRONLY, fileno);
 646         }
 647
 648         switch (o) {
 649
 650         case EXEC_OUTPUT_NULL:
 651                 return open_null_as(O_WRONLY, fileno);
 652
 653         case EXEC_OUTPUT_TTY:
 654                 if (is_terminal_input(i))
 655                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 656
 657                 /* We don't reset the terminal if this is just about output */
 658                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 659
 660         case EXEC_OUTPUT_SYSLOG:
 661         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 662         case EXEC_OUTPUT_KMSG:
 663         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 664         case EXEC_OUTPUT_JOURNAL:
 665         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 666                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 667                 if (r < 0) {
 668                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 669                         r = open_null_as(O_WRONLY, fileno);
 670                 } else {
 671                         struct stat st;
 672
 673                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 674                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 675                          * services to detect whether they are connected to the journal or not.
 676                          *
 677                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 678                          * about STDERR as that's usually the best way to do logging. */
 679
 680                         if (fstat(fileno, &st) >= 0 &&
 681                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 682                                 *journal_stream_dev = st.st_dev;
 683                                 *journal_stream_ino = st.st_ino;
 684                         }
 685                 }
 686                 return r;
 687
 688         case EXEC_OUTPUT_SOCKET:
 689                 assert(socket_fd >= 0);
 690
 691                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 692
 693         case EXEC_OUTPUT_NAMED_FD:
 694                 assert(named_iofds[fileno] >= 0);
 695
 696                 (void) fd_nonblock(named_iofds[fileno], false);
 697                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 698
 699         case EXEC_OUTPUT_FILE:
 700         case EXEC_OUTPUT_FILE_APPEND: {
 701                 bool rw;
 702                 int fd, flags;
 703
 704                 assert(context->stdio_file[fileno]);
 705
 706                 rw = context->std_input == EXEC_INPUT_FILE &&
 707                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 708
 709                 if (rw)
 710                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 711
 712                 flags = O_WRONLY;
 713                 if (o == EXEC_OUTPUT_FILE_APPEND)
 714                         flags |= O_APPEND;
 715
 716                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 717                 if (fd < 0)
 718                         return fd;
 719
 720                 return move_fd(fd, fileno, 0);
 721         }
 722
 723         default:
 724                 assert_not_reached("Unknown error type");
 725         }
 726 }
 727
 728 static int chown_terminal(int fd, uid_t uid) {
 729         int r;
 730
 731         assert(fd >= 0);
 732
 733         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 734         if (isatty(fd) < 1) {
 735                 if (IN_SET(errno, EINVAL, ENOTTY))
 736                         return 0; /* not a tty */
 737
 738                 return -errno;
 739         }
 740
 741         /* This might fail. What matters are the results. */
 742         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 743         if (r < 0)
 744                 return r;
 745
 746         return 1;
 747 }
 748
 749 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 750         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 751         int r;
 752
 753         assert(_saved_stdin);
 754         assert(_saved_stdout);
 755
 756         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 757         if (saved_stdin < 0)
 758                 return -errno;
 759
 760         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 761         if (saved_stdout < 0)
 762                 return -errno;
 763
 764         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 765         if (fd < 0)
 766                 return fd;
 767
 768         r = chown_terminal(fd, getuid());
 769         if (r < 0)
 770                 return r;
 771
 772         r = reset_terminal_fd(fd, true);
 773         if (r < 0)
 774                 return r;
 775
 776         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 777         fd = -1;
 778         if (r < 0)
 779                 return r;
 780
 781         *_saved_stdin = saved_stdin;
 782         *_saved_stdout = saved_stdout;
 783
 784         saved_stdin = saved_stdout = -1;
 785
 786         return 0;
 787 }
 788
 789 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 790         assert(err < 0);
 791
 792         if (err == -ETIMEDOUT)
 793                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 794         else {
 795                 errno = -err;
 796                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 797         }
 798 }
 799
 800 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 801         _cleanup_close_ int fd = -1;
 802
 803         assert(vc);
 804
 805         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 806         if (fd < 0)
 807                 return;
 808
 809         write_confirm_error_fd(err, fd, u);
 810 }
 811
 812 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 813         int r = 0;
 814
 815         assert(saved_stdin);
 816         assert(saved_stdout);
 817
 818         release_terminal();
 819
 820         if (*saved_stdin >= 0)
 821                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 822                         r = -errno;
 823
 824         if (*saved_stdout >= 0)
 825                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 826                         r = -errno;
 827
 828         *saved_stdin = safe_close(*saved_stdin);
 829         *saved_stdout = safe_close(*saved_stdout);
 830
 831         return r;
 832 }
 833
 834 enum {
 835         CONFIRM_PRETEND_FAILURE = -1,
 836         CONFIRM_PRETEND_SUCCESS =  0,
 837         CONFIRM_EXECUTE = 1,
 838 };
 839
 840 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 841         int saved_stdout = -1, saved_stdin = -1, r;
 842         _cleanup_free_ char *e = NULL;
 843         char c;
 844
 845         /* For any internal errors, assume a positive response. */
 846         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 847         if (r < 0) {
 848                 write_confirm_error(r, vc, u);
 849                 return CONFIRM_EXECUTE;
 850         }
 851
 852         /* confirm_spawn might have been disabled while we were sleeping. */
 853         if (manager_is_confirm_spawn_disabled(u->manager)) {
 854                 r = 1;
 855                 goto restore_stdio;
 856         }
 857
 858         e = ellipsize(cmdline, 60, 100);
 859         if (!e) {
 860                 log_oom();
 861                 r = CONFIRM_EXECUTE;
 862                 goto restore_stdio;
 863         }
 864
 865         for (;;) {
 866                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 867                 if (r < 0) {
 868                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 869                         r = CONFIRM_EXECUTE;
 870                         goto restore_stdio;
 871                 }
 872
 873                 switch (c) {
 874                 case 'c':
 875                         printf("Resuming normal execution.\n");
 876                         manager_disable_confirm_spawn();
 877                         r = 1;
 878                         break;
 879                 case 'D':
 880                         unit_dump(u, stdout, "  ");
 881                         continue; /* ask again */
 882                 case 'f':
 883                         printf("Failing execution.\n");
 884                         r = CONFIRM_PRETEND_FAILURE;
 885                         break;
 886                 case 'h':
 887                         printf("  c - continue, proceed without asking anymore\n"
 888                                "  D - dump, show the state of the unit\n"
 889                                "  f - fail, don't execute the command and pretend it failed\n"
 890                                "  h - help\n"
 891                                "  i - info, show a short summary of the unit\n"
 892                                "  j - jobs, show jobs that are in progress\n"
 893                                "  s - skip, don't execute the command and pretend it succeeded\n"
 894                                "  y - yes, execute the command\n");
 895                         continue; /* ask again */
 896                 case 'i':
 897                         printf("  Description: %s\n"
 898                                "  Unit:        %s\n"
 899                                "  Command:     %s\n",
 900                                u->id, u->description, cmdline);
 901                         continue; /* ask again */
 902                 case 'j':
 903                         manager_dump_jobs(u->manager, stdout, "  ");
 904                         continue; /* ask again */
 905                 case 'n':
 906                         /* 'n' was removed in favor of 'f'. */
 907                         printf("Didn't understand 'n', did you mean 'f'?\n");
 908                         continue; /* ask again */
 909                 case 's':
 910                         printf("Skipping execution.\n");
 911                         r = CONFIRM_PRETEND_SUCCESS;
 912                         break;
 913                 case 'y':
 914                         r = CONFIRM_EXECUTE;
 915                         break;
 916                 default:
 917                         assert_not_reached("Unhandled choice");
 918                 }
 919                 break;
 920         }
 921
 922 restore_stdio:
 923         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 924         return r;
 925 }
 926
 927 static int get_fixed_user(const ExecContext *c, const char **user,
 928                           uid_t *uid, gid_t *gid,
 929                           const char **home, const char **shell) {
 930         int r;
 931         const char *name;
 932
 933         assert(c);
 934
 935         if (!c->user)
 936                 return 0;
 937
 938         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 939          * (i.e. are "/" or "/bin/nologin"). */
 940
 941         name = c->user;
 942         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 943         if (r < 0)
 944                 return r;
 945
 946         *user = name;
 947         return 0;
 948 }
 949
 950 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 951         int r;
 952         const char *name;
 953
 954         assert(c);
 955
 956         if (!c->group)
 957                 return 0;
 958
 959         name = c->group;
 960         r = get_group_creds(&name, gid, 0);
 961         if (r < 0)
 962                 return r;
 963
 964         *group = name;
 965         return 0;
 966 }
 967
 968 static int get_supplementary_groups(const ExecContext *c, const char *user,
 969                                     const char *group, gid_t gid,
 970                                     gid_t **supplementary_gids, int *ngids) {
 971         char **i;
 972         int r, k = 0;
 973         int ngroups_max;
 974         bool keep_groups = false;
 975         gid_t *groups = NULL;
 976         _cleanup_free_ gid_t *l_gids = NULL;
 977
 978         assert(c);
 979
 980         /*
 981          * If user is given, then lookup GID and supplementary groups list.
 982          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 983          * here and as early as possible so we keep the list of supplementary
 984          * groups of the caller.
 985          */
 986         if (user && gid_is_valid(gid) && gid != 0) {
 987                 /* First step, initialize groups from /etc/groups */
 988                 if (initgroups(user, gid) < 0)
 989                         return -errno;
 990
 991                 keep_groups = true;
 992         }
 993
 994         if (strv_isempty(c->supplementary_groups))
 995                 return 0;
 996
 997         /*
 998          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 999          * be positive, otherwise fail.
1000          */
1001         errno = 0;
1002         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1003         if (ngroups_max <= 0)
1004                 return errno_or_else(EOPNOTSUPP);
1005
1006         l_gids = new(gid_t, ngroups_max);
1007         if (!l_gids)
1008                 return -ENOMEM;
1009
1010         if (keep_groups) {
1011                 /*
1012                  * Lookup the list of groups that the user belongs to, we
1013                  * avoid NSS lookups here too for gid=0.
1014                  */
1015                 k = ngroups_max;
1016                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017                         return -EINVAL;
1018         } else
1019                 k = 0;
1020
1021         STRV_FOREACH(i, c->supplementary_groups) {
1022                 const char *g;
1023
1024                 if (k >= ngroups_max)
1025                         return -E2BIG;
1026
1027                 g = *i;
1028                 r = get_group_creds(&g, l_gids+k, 0);
1029                 if (r < 0)
1030                         return r;
1031
1032                 k++;
1033         }
1034
1035         /*
1036          * Sets ngids to zero to drop all supplementary groups, happens
1037          * when we are under root and SupplementaryGroups= is empty.
1038          */
1039         if (k == 0) {
1040                 *ngids = 0;
1041                 return 0;
1042         }
1043
1044         /* Otherwise get the final list of supplementary groups */
1045         groups = memdup(l_gids, sizeof(gid_t) * k);
1046         if (!groups)
1047                 return -ENOMEM;
1048
1049         *supplementary_gids = groups;
1050         *ngids = k;
1051
1052         groups = NULL;
1053
1054         return 0;
1055 }
1056
/* Applies the given supplementary groups and then the primary group to the calling process.
 *
 * The supplementary list is only installed (via maybe_setgroups()) when ngids is positive, and
 * the real/effective/saved gid is only changed when gid is valid. Returns 0 on success, the
 * negative result of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1075
1076 static int enforce_user(const ExecContext *context, uid_t uid) {
1077         assert(context);
1078
1079         if (!uid_is_valid(uid))
1080                 return 0;
1081
1082         /* Sets (but doesn't look up) the uid and make sure we keep the
1083          * capabilities while doing so. */
1084
1085         if (context->capability_ambient_set != 0) {
1086
1087                 /* First step: If we need to keep capabilities but
1088                  * drop privileges we need to make sure we keep our
1089                  * caps, while we drop privileges. */
1090                 if (uid != 0) {
1091                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1092
1093                         if (prctl(PR_GET_SECUREBITS) != sb)
1094                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095                                         return -errno;
1096                 }
1097         }
1098
1099         /* Second step: actually set the uids */
1100         if (setresuid(uid, uid, uid) < 0)
1101                 return -errno;
1102
1103         /* At this point we should have all necessary capabilities but
1104            are otherwise a normal user. However, the caps might got
1105            corrupted due to the setresuid() so we need clean them up
1106            later. This is done outside of this call. */
1107
1108         return 0;
1109 }
1110
1111 #if HAVE_PAM
1112
1113 static int null_conv(
1114                 int num_msg,
1115                 const struct pam_message **msg,
1116                 struct pam_response **resp,
1117                 void *appdata_ptr) {
1118
1119         /* We don't support conversations */
1120
1121         return PAM_CONV_ERR;
1122 }
1123
1124 #endif
1125
/* Opens a PAM session for the service about to be executed.
 *
 * name     - PAM service name passed to pam_start()
 * user     - user name passed to pam_start()
 * uid, gid - credentials the forked-off helper process switches to
 * tty      - TTY to report to PAM, or NULL to derive it from stdin if stdin is a TTY
 * env      - in: environment exported to PAM via pam_putenv(); out (on success): replaced by the
 *            environment list returned by pam_getenvlist()
 * fds      - file descriptors (n_fds of them) that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On a PAM failure -EPERM is returned
 * (PAM errors do not map to errno); on other failures the negative errno-style error of the
 * failing call is returned. When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller's environment to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * never returns a negative value and does not set errno, so the result
                                 * has to be checked directly. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1341
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * the given path. If that component is empty, "(...)" is used. Only the final 8 characters of
 * the component are kept, so that the resulting string fits in 10 characters (i.e. the length
 * of "/sbin/init") and looks pretty in /bin/ps. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *tail;
        size_t len;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        len = strlen(tail);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit
                 * might just be "systemd-" */
                tail += len - 8;
                len = 8;
        }

        /* '(' + up to 8 characters + ')' + NUL is at most 11 bytes */
        process_name[0] = '(';
        memcpy(process_name + 1, tail, len);
        process_name[len + 1] = ')';
        process_name[len + 2] = 0;

        rename_process(process_name);
}
1372
1373 static bool context_has_address_families(const ExecContext *c) {
1374         assert(c);
1375
1376         return c->address_families_whitelist ||
1377                 !set_isempty(c->address_families);
1378 }
1379
1380 static bool context_has_syscall_filters(const ExecContext *c) {
1381         assert(c);
1382
1383         return c->syscall_whitelist ||
1384                 !hashmap_isempty(c->syscall_filter);
1385 }
1386
1387 static bool context_has_no_new_privileges(const ExecContext *c) {
1388         assert(c);
1389
1390         if (c->no_new_privileges)
1391                 return true;
1392
1393         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1394                 return false;
1395
1396         /* We need NNP if we have any form of seccomp and are unprivileged */
1397         return context_has_address_families(c) ||
1398                 c->memory_deny_write_execute ||
1399                 c->restrict_realtime ||
1400                 c->restrict_suid_sgid ||
1401                 exec_context_restrict_namespaces_set(c) ||
1402                 c->protect_kernel_tunables ||
1403                 c->protect_kernel_modules ||
1404                 c->private_devices ||
1405                 context_has_syscall_filters(c) ||
1406                 !set_isempty(c->syscall_archs) ||
1407                 c->lock_personality ||
1408                 c->protect_hostname;
1409 }
1410
1411 #if HAVE_SECCOMP
1412
1413 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1414
1415         if (is_seccomp_available())
1416                 return false;
1417
1418         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1419         return true;
1420 }
1421
1422 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1423         uint32_t negative_action, default_action, action;
1424         int r;
1425
1426         assert(u);
1427         assert(c);
1428
1429         if (!context_has_syscall_filters(c))
1430                 return 0;
1431
1432         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1433                 return 0;
1434
1435         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1436
1437         if (c->syscall_whitelist) {
1438                 default_action = negative_action;
1439                 action = SCMP_ACT_ALLOW;
1440         } else {
1441                 default_action = SCMP_ACT_ALLOW;
1442                 action = negative_action;
1443         }
1444
1445         if (needs_ambient_hack) {
1446                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1447                 if (r < 0)
1448                         return r;
1449         }
1450
1451         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1452 }
1453
1454 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1455         assert(u);
1456         assert(c);
1457
1458         if (set_isempty(c->syscall_archs))
1459                 return 0;
1460
1461         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1462                 return 0;
1463
1464         return seccomp_restrict_archs(c->syscall_archs);
1465 }
1466
1467 static int apply_address_families(const Unit* u, const ExecContext *c) {
1468         assert(u);
1469         assert(c);
1470
1471         if (!context_has_address_families(c))
1472                 return 0;
1473
1474         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1475                 return 0;
1476
1477         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1478 }
1479
1480 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1481         assert(u);
1482         assert(c);
1483
1484         if (!c->memory_deny_write_execute)
1485                 return 0;
1486
1487         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1488                 return 0;
1489
1490         return seccomp_memory_deny_write_execute();
1491 }
1492
1493 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1494         assert(u);
1495         assert(c);
1496
1497         if (!c->restrict_realtime)
1498                 return 0;
1499
1500         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1501                 return 0;
1502
1503         return seccomp_restrict_realtime();
1504 }
1505
1506 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1507         assert(u);
1508         assert(c);
1509
1510         if (!c->restrict_suid_sgid)
1511                 return 0;
1512
1513         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1514                 return 0;
1515
1516         return seccomp_restrict_suid_sgid();
1517 }
1518
1519 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1520         assert(u);
1521         assert(c);
1522
1523         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524          * let's protect even those systems where this is left on in the kernel. */
1525
1526         if (!c->protect_kernel_tunables)
1527                 return 0;
1528
1529         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1530                 return 0;
1531
1532         return seccomp_protect_sysctl();
1533 }
1534
1535 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1536         assert(u);
1537         assert(c);
1538
1539         /* Turn off module syscalls on ProtectKernelModules=yes */
1540
1541         if (!c->protect_kernel_modules)
1542                 return 0;
1543
1544         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1545                 return 0;
1546
1547         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1548 }
1549
1550 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1551         assert(u);
1552         assert(c);
1553
1554         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1555
1556         if (!c->private_devices)
1557                 return 0;
1558
1559         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1560                 return 0;
1561
1562         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1563 }
1564
1565 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1566         assert(u);
1567         assert(c);
1568
1569         if (!exec_context_restrict_namespaces_set(c))
1570                 return 0;
1571
1572         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1573                 return 0;
1574
1575         return seccomp_restrict_namespaces(c->restrict_namespaces);
1576 }
1577
1578 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1579         unsigned long personality;
1580         int r;
1581
1582         assert(u);
1583         assert(c);
1584
1585         if (!c->lock_personality)
1586                 return 0;
1587
1588         if (skip_seccomp_unavailable(u, "LockPersonality="))
1589                 return 0;
1590
1591         personality = c->personality;
1592
1593         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594         if (personality == PERSONALITY_INVALID) {
1595
1596                 r = opinionated_personality(&personality);
1597                 if (r < 0)
1598                         return r;
1599         }
1600
1601         return seccomp_lock_personality(personality);
1602 }
1603
1604 #endif
1605
1606 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1607         assert(idle_pipe);
1608
1609         idle_pipe[1] = safe_close(idle_pipe[1]);
1610         idle_pipe[2] = safe_close(idle_pipe[2]);
1611
1612         if (idle_pipe[0] >= 0) {
1613                 int r;
1614
1615                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1616
1617                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1618                         ssize_t n;
1619
1620                         /* Signal systemd that we are bored and want to continue. */
1621                         n = write(idle_pipe[3], "x", 1);
1622                         if (n > 0)
1623                                 /* Wait for systemd to react to the signal above. */
1624                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1625                 }
1626
1627                 idle_pipe[0] = safe_close(idle_pipe[0]);
1628
1629         }
1630
1631         idle_pipe[3] = safe_close(idle_pipe[3]);
1632 }
1633
1634 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1635
1636 static int build_environment(
1637                 const Unit *u,
1638                 const ExecContext *c,
1639                 const ExecParameters *p,
1640                 size_t n_fds,
1641                 const char *home,
1642                 const char *username,
1643                 const char *shell,
1644                 dev_t journal_stream_dev,
1645                 ino_t journal_stream_ino,
1646                 char ***ret) {
1647
1648         _cleanup_strv_free_ char **our_env = NULL;
1649         ExecDirectoryType t;
1650         size_t n_env = 0;
1651         char *x;
1652
1653         assert(u);
1654         assert(c);
1655         assert(p);
1656         assert(ret);
1657
1658         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1659         if (!our_env)
1660                 return -ENOMEM;
1661
1662         if (n_fds > 0) {
1663                 _cleanup_free_ char *joined = NULL;
1664
1665                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1666                         return -ENOMEM;
1667                 our_env[n_env++] = x;
1668
1669                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1670                         return -ENOMEM;
1671                 our_env[n_env++] = x;
1672
1673                 joined = strv_join(p->fd_names, ":");
1674                 if (!joined)
1675                         return -ENOMEM;
1676
1677                 x = strjoin("LISTEN_FDNAMES=", joined);
1678                 if (!x)
1679                         return -ENOMEM;
1680                 our_env[n_env++] = x;
1681         }
1682
1683         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1684                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1685                         return -ENOMEM;
1686                 our_env[n_env++] = x;
1687
1688                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1689                         return -ENOMEM;
1690                 our_env[n_env++] = x;
1691         }
1692
1693         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1694          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695          * check the database directly. */
1696         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1697                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1698                 if (!x)
1699                         return -ENOMEM;
1700                 our_env[n_env++] = x;
1701         }
1702
1703         if (home) {
1704                 x = strjoin("HOME=", home);
1705                 if (!x)
1706                         return -ENOMEM;
1707
1708                 path_simplify(x + 5, true);
1709                 our_env[n_env++] = x;
1710         }
1711
1712         if (username) {
1713                 x = strjoin("LOGNAME=", username);
1714                 if (!x)
1715                         return -ENOMEM;
1716                 our_env[n_env++] = x;
1717
1718                 x = strjoin("USER=", username);
1719                 if (!x)
1720                         return -ENOMEM;
1721                 our_env[n_env++] = x;
1722         }
1723
1724         if (shell) {
1725                 x = strjoin("SHELL=", shell);
1726                 if (!x)
1727                         return -ENOMEM;
1728
1729                 path_simplify(x + 6, true);
1730                 our_env[n_env++] = x;
1731         }
1732
1733         if (!sd_id128_is_null(u->invocation_id)) {
1734                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1735                         return -ENOMEM;
1736
1737                 our_env[n_env++] = x;
1738         }
1739
1740         if (exec_context_needs_term(c)) {
1741                 const char *tty_path, *term = NULL;
1742
1743                 tty_path = exec_context_tty_path(c);
1744
1745                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747                  * passes to PID 1 ends up all the way in the console login shown. */
1748
1749                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1750                         term = getenv("TERM");
1751                 if (!term)
1752                         term = default_term_for_tty(tty_path);
1753
1754                 x = strjoin("TERM=", term);
1755                 if (!x)
1756                         return -ENOMEM;
1757                 our_env[n_env++] = x;
1758         }
1759
1760         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1761                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1762                         return -ENOMEM;
1763
1764                 our_env[n_env++] = x;
1765         }
1766
1767         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1768                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1769                 const char *n;
1770
1771                 if (!p->prefix[t])
1772                         continue;
1773
1774                 if (strv_isempty(c->directories[t].paths))
1775                         continue;
1776
1777                 n = exec_directory_env_name_to_string(t);
1778                 if (!n)
1779                         continue;
1780
1781                 pre = strjoin(p->prefix[t], "/");
1782                 if (!pre)
1783                         return -ENOMEM;
1784
1785                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1786                 if (!joined)
1787                         return -ENOMEM;
1788
1789                 x = strjoin(n, "=", joined);
1790                 if (!x)
1791                         return -ENOMEM;
1792
1793                 our_env[n_env++] = x;
1794         }
1795
1796         our_env[n_env++] = NULL;
1797         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1798
1799         *ret = TAKE_PTR(our_env);
1800
1801         return 0;
1802 }
1803
1804 static int build_pass_environment(const ExecContext *c, char ***ret) {
1805         _cleanup_strv_free_ char **pass_env = NULL;
1806         size_t n_env = 0, n_bufsize = 0;
1807         char **i;
1808
1809         STRV_FOREACH(i, c->pass_environment) {
1810                 _cleanup_free_ char *x = NULL;
1811                 char *v;
1812
1813                 v = getenv(*i);
1814                 if (!v)
1815                         continue;
1816                 x = strjoin(*i, "=", v);
1817                 if (!x)
1818                         return -ENOMEM;
1819
1820                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1821                         return -ENOMEM;
1822
1823                 pass_env[n_env++] = TAKE_PTR(x);
1824                 pass_env[n_env] = NULL;
1825         }
1826
1827         *ret = TAKE_PTR(pass_env);
1828
1829         return 0;
1830 }
1831
1832 static bool exec_needs_mount_namespace(
1833                 const ExecContext *context,
1834                 const ExecParameters *params,
1835                 const ExecRuntime *runtime) {
1836
1837         assert(context);
1838         assert(params);
1839
1840         if (context->root_image)
1841                 return true;
1842
1843         if (!strv_isempty(context->read_write_paths) ||
1844             !strv_isempty(context->read_only_paths) ||
1845             !strv_isempty(context->inaccessible_paths))
1846                 return true;
1847
1848         if (context->n_bind_mounts > 0)
1849                 return true;
1850
1851         if (context->n_temporary_filesystems > 0)
1852                 return true;
1853
1854         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1855                 return true;
1856
1857         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1858                 return true;
1859
1860         if (context->private_devices ||
1861             context->private_mounts ||
1862             context->protect_system != PROTECT_SYSTEM_NO ||
1863             context->protect_home != PROTECT_HOME_NO ||
1864             context->protect_kernel_tunables ||
1865             context->protect_kernel_modules ||
1866             context->protect_control_groups)
1867                 return true;
1868
1869         if (context->root_directory) {
1870                 ExecDirectoryType t;
1871
1872                 if (context->mount_apivfs)
1873                         return true;
1874
1875                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1876                         if (!params->prefix[t])
1877                                 continue;
1878
1879                         if (!strv_isempty(context->directories[t].paths))
1880                                 return true;
1881                 }
1882         }
1883
1884         if (context->dynamic_user &&
1885             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1886              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1887              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1888                 return true;
1889
1890         return false;
1891 }
1892
/* Moves the calling process into a new user namespace and installs UID/GID maps for it.
 *
 * The maps always contain root (0 → 0); if 'uid' (resp. 'gid') is valid and not 0 it is additionally
 * mapped to itself. The maps are written by a short-lived helper child, see the comment below for why.
 *
 * Returns 0 on success and a negative errno-style value on failure. An error the helper ran into is
 * forwarded through a pipe and returned as is; -EIO is returned if the helper sends something
 * unexpected or exits with a non-zero status. */
static int setup_private_users(uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1;
        ssize_t n;
        int r;

        /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally. */

        if (uid != 0 && uid_is_valid(uid)) {
                r = asprintf(&uid_map,
                             "0 0 1\n"                      /* Map root → root */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             uid, uid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!uid_map)
                        return -ENOMEM;
        }

        if (gid != 0 && gid_is_valid(gid)) {
                r = asprintf(&gid_map,
                             "0 0 1\n"                      /* Map root → root */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             gid, gid);
                if (r < 0)
                        return -ENOMEM;
        } else {
                gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
                if (!gid_map)
                        return -ENOMEM;
        }

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe, the parent then reads 0 bytes */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Forward the (negative) error code to the parent, best effort */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: drop our copy of the write end, so that reading from the pipe below does not wait on
         * ourselves */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* disarm the sigkill_waitp cleanup handler, we just waited for the child ourselves */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2047
2048 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2049         if (!context->dynamic_user)
2050                 return false;
2051
2052         if (type == EXEC_DIRECTORY_CONFIGURATION)
2053                 return false;
2054
2055         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2056                 return false;
2057
2058         return true;
2059 }
2060
/* Creates and prepares all directories of the given type that are listed in
 * context->directories[type].paths, each one below params->prefix[type].
 *
 * For directory types that are private for this context (see exec_directory_is_private()) the actual
 * directory is placed below a "private" subdirectory of the prefix and a symlink is put at the regular
 * location; pre-existing directories are migrated in either direction if the layout found on disk does
 * not match. Afterwards access mode and ownership are adjusted.
 *
 * Returns 0 on success, or right away if no prefix is set for this type. On failure a negative
 * errno-style value is returned and *exit_status is set to the exit status that belongs to the directory
 * type; *exit_status is not written on success. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Maps each directory type to the exit status reported when setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix for this directory type: nothing to do */
        if (!params->prefix[type])
                return 0;

        /* If the chown flag is set, fall back to root for any UID/GID that is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the regular location of the directory, pp the location below "private" (only set
                 * in the private case below) */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (exec_directory_is_private(context, type)) {
                        _cleanup_free_ char *private_root = NULL;

                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        private_root = path_join(params->prefix[type], "private");
                        if (!private_root) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        pp = path_join(private_root, *rt);
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                q = path_join(params->prefix[type], "private", *rt);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                if (path_equal(q, target)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the real directory into its place */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), *rt,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for this entry */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist.*/
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Tell the caller which directory type failed, via the matching exit status */
        *exit_status = exit_status_table[type];
        return r;
}
2276
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from the command's
 * binary is applied, or SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;
                const char *label;

                /* A missing attribute or missing support is fine, we use the default label then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                label = exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL;

                r = mac_smack_apply_pid(0, label);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2309
2310 static int compile_bind_mounts(
2311                 const ExecContext *context,
2312                 const ExecParameters *params,
2313                 BindMount **ret_bind_mounts,
2314                 size_t *ret_n_bind_mounts,
2315                 char ***ret_empty_directories) {
2316
2317         _cleanup_strv_free_ char **empty_directories = NULL;
2318         BindMount *bind_mounts;
2319         size_t n, h = 0, i;
2320         ExecDirectoryType t;
2321         int r;
2322
2323         assert(context);
2324         assert(params);
2325         assert(ret_bind_mounts);
2326         assert(ret_n_bind_mounts);
2327         assert(ret_empty_directories);
2328
2329         n = context->n_bind_mounts;
2330         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2331                 if (!params->prefix[t])
2332                         continue;
2333
2334                 n += strv_length(context->directories[t].paths);
2335         }
2336
2337         if (n <= 0) {
2338                 *ret_bind_mounts = NULL;
2339                 *ret_n_bind_mounts = 0;
2340                 *ret_empty_directories = NULL;
2341                 return 0;
2342         }
2343
2344         bind_mounts = new(BindMount, n);
2345         if (!bind_mounts)
2346                 return -ENOMEM;
2347
2348         for (i = 0; i < context->n_bind_mounts; i++) {
2349                 BindMount *item = context->bind_mounts + i;
2350                 char *s, *d;
2351
2352                 s = strdup(item->source);
2353                 if (!s) {
2354                         r = -ENOMEM;
2355                         goto finish;
2356                 }
2357
2358                 d = strdup(item->destination);
2359                 if (!d) {
2360                         free(s);
2361                         r = -ENOMEM;
2362                         goto finish;
2363                 }
2364
2365                 bind_mounts[h++] = (BindMount) {
2366                         .source = s,
2367                         .destination = d,
2368                         .read_only = item->read_only,
2369                         .recursive = item->recursive,
2370                         .ignore_enoent = item->ignore_enoent,
2371                 };
2372         }
2373
2374         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2375                 char **suffix;
2376
2377                 if (!params->prefix[t])
2378                         continue;
2379
2380                 if (strv_isempty(context->directories[t].paths))
2381                         continue;
2382
2383                 if (exec_directory_is_private(context, t) &&
2384                     !(context->root_directory || context->root_image)) {
2385                         char *private_root;
2386
2387                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2388                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2389                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2390
2391                         private_root = path_join(params->prefix[t], "private");
2392                         if (!private_root) {
2393                                 r = -ENOMEM;
2394                                 goto finish;
2395                         }
2396
2397                         r = strv_consume(&empty_directories, private_root);
2398                         if (r < 0)
2399                                 goto finish;
2400                 }
2401
2402                 STRV_FOREACH(suffix, context->directories[t].paths) {
2403                         char *s, *d;
2404
2405                         if (exec_directory_is_private(context, t))
2406                                 s = path_join(params->prefix[t], "private", *suffix);
2407                         else
2408                                 s = path_join(params->prefix[t], *suffix);
2409                         if (!s) {
2410                                 r = -ENOMEM;
2411                                 goto finish;
2412                         }
2413
2414                         if (exec_directory_is_private(context, t) &&
2415                             (context->root_directory || context->root_image))
2416                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2417                                  * directory is not created on the root directory. So, let's bind-mount the directory
2418                                  * on the 'non-private' place. */
2419                                 d = path_join(params->prefix[t], *suffix);
2420                         else
2421                                 d = strdup(s);
2422                         if (!d) {
2423                                 free(s);
2424                                 r = -ENOMEM;
2425                                 goto finish;
2426                         }
2427
2428                         bind_mounts[h++] = (BindMount) {
2429                                 .source = s,
2430                                 .destination = d,
2431                                 .read_only = false,
2432                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2433                                 .recursive = true,
2434                                 .ignore_enoent = false,
2435                         };
2436                 }
2437         }
2438
2439         assert(h == n);
2440
2441         *ret_bind_mounts = bind_mounts;
2442         *ret_n_bind_mounts = n;
2443         *ret_empty_directories = TAKE_PTR(empty_directories);
2444
2445         return (int) n;
2446
2447 finish:
2448         bind_mount_free_many(bind_mounts, h);
2449         return r;
2450 }
2451
2452 static int apply_mount_namespace(
2453                 const Unit *u,
2454                 const ExecCommand *command,
2455                 const ExecContext *context,
2456                 const ExecParameters *params,
2457                 const ExecRuntime *runtime,
2458                 char **error_path) {
2459
2460         _cleanup_strv_free_ char **empty_directories = NULL;
2461         char *tmp = NULL, *var = NULL;
2462         const char *root_dir = NULL, *root_image = NULL;
2463         NamespaceInfo ns_info;
2464         bool needs_sandboxing;
2465         BindMount *bind_mounts = NULL;
2466         size_t n_bind_mounts = 0;
2467         int r;
2468
2469         assert(context);
2470
2471         /* The runtime struct only contains the parent of the private /tmp,
2472          * which is non-accessible to world users. Inside of it there's a /tmp
2473          * that is sticky, and that's the one we want to use here. */
2474
2475         if (context->private_tmp && runtime) {
2476                 if (runtime->tmp_dir)
2477                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2478                 if (runtime->var_tmp_dir)
2479                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2480         }
2481
2482         if (params->flags & EXEC_APPLY_CHROOT) {
2483                 root_image = context->root_image;
2484
2485                 if (!root_image)
2486                         root_dir = context->root_directory;
2487         }
2488
2489         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2490         if (r < 0)
2491                 return r;
2492
2493         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2494         if (needs_sandboxing)
2495                 ns_info = (NamespaceInfo) {
2496                         .ignore_protect_paths = false,
2497                         .private_dev = context->private_devices,
2498                         .protect_control_groups = context->protect_control_groups,
2499                         .protect_kernel_tunables = context->protect_kernel_tunables,
2500                         .protect_kernel_modules = context->protect_kernel_modules,
2501                         .protect_hostname = context->protect_hostname,
2502                         .mount_apivfs = context->mount_apivfs,
2503                         .private_mounts = context->private_mounts,
2504                 };
2505         else if (!context->dynamic_user && root_dir)
2506                 /*
2507                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2508                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2509                  * fail if we are enable to apply the sandbox inside the mount namespace.
2510                  */
2511                 ns_info = (NamespaceInfo) {
2512                         .ignore_protect_paths = true,
2513                 };
2514         else
2515                 ns_info = (NamespaceInfo) {};
2516
2517         if (context->mount_flags == MS_SHARED)
2518                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2519
2520         r = setup_namespace(root_dir, root_image,
2521                             &ns_info, context->read_write_paths,
2522                             needs_sandboxing ? context->read_only_paths : NULL,
2523                             needs_sandboxing ? context->inaccessible_paths : NULL,
2524                             empty_directories,
2525                             bind_mounts,
2526                             n_bind_mounts,
2527                             context->temporary_filesystems,
2528                             context->n_temporary_filesystems,
2529                             tmp,
2530                             var,
2531                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2532                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2533                             context->mount_flags,
2534                             DISSECT_IMAGE_DISCARD_ON_LOOP,
2535                             error_path);
2536
2537         bind_mount_free_many(bind_mounts, n_bind_mounts);
2538
2539         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2540          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2541          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2542          * completely different execution environment. */
2543         if (r == -ENOANO) {
2544                 if (n_bind_mounts == 0 &&
2545                     context->n_temporary_filesystems == 0 &&
2546                     !root_dir && !root_image &&
2547                     !context->dynamic_user) {
2548                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2549                         return 0;
2550                 }
2551
2552                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2553                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2554                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2555
2556                 return -EOPNOTSUPP;
2557         }
2558
2559         return r;
2560 }
2561
2562 static int apply_working_directory(
2563                 const ExecContext *context,
2564                 const ExecParameters *params,
2565                 const char *home,
2566                 int *exit_status) {
2567
2568         const char *d, *wd;
2569
2570         assert(context);
2571         assert(exit_status);
2572
2573         if (context->working_directory_home) {
2574
2575                 if (!home) {
2576                         *exit_status = EXIT_CHDIR;
2577                         return -ENXIO;
2578                 }
2579
2580                 wd = home;
2581
2582         } else if (context->working_directory)
2583                 wd = context->working_directory;
2584         else
2585                 wd = "/";
2586
2587         if (params->flags & EXEC_APPLY_CHROOT)
2588                 d = wd;
2589         else
2590                 d = prefix_roota(context->root_directory, wd);
2591
2592         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2593                 *exit_status = EXIT_CHDIR;
2594                 return -errno;
2595         }
2596
2597         return 0;
2598 }
2599
2600 static int apply_root_directory(
2601                 const ExecContext *context,
2602                 const ExecParameters *params,
2603                 const bool needs_mount_ns,
2604                 int *exit_status) {
2605
2606         assert(context);
2607         assert(exit_status);
2608
2609         if (params->flags & EXEC_APPLY_CHROOT) {
2610                 if (!needs_mount_ns && context->root_directory)
2611                         if (chroot(context->root_directory) < 0) {
2612                                 *exit_status = EXIT_CHROOT;
2613                                 return -errno;
2614                         }
2615         }
2616
2617         return 0;
2618 }
2619
2620 static int setup_keyring(
2621                 const Unit *u,
2622                 const ExecContext *context,
2623                 const ExecParameters *p,
2624                 uid_t uid, gid_t gid) {
2625
2626         key_serial_t keyring;
2627         int r = 0;
2628         uid_t saved_uid;
2629         gid_t saved_gid;
2630
2631         assert(u);
2632         assert(context);
2633         assert(p);
2634
2635         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2636          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2637          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2638          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2639          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2640          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2641
2642         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2643                 return 0;
2644
2645         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2646          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2647          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2648          * & group is just as nasty as acquiring a reference to the user keyring. */
2649
2650         saved_uid = getuid();
2651         saved_gid = getgid();
2652
2653         if (gid_is_valid(gid) && gid != saved_gid) {
2654                 if (setregid(gid, -1) < 0)
2655                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2656         }
2657
2658         if (uid_is_valid(uid) && uid != saved_uid) {
2659                 if (setreuid(uid, -1) < 0) {
2660                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2661                         goto out;
2662                 }
2663         }
2664
2665         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2666         if (keyring == -1) {
2667                 if (errno == ENOSYS)
2668                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2669                 else if (IN_SET(errno, EACCES, EPERM))
2670                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2671                 else if (errno == EDQUOT)
2672                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2673                 else
2674                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2675
2676                 goto out;
2677         }
2678
2679         /* When requested link the user keyring into the session keyring. */
2680         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2681
2682                 if (keyctl(KEYCTL_LINK,
2683                            KEY_SPEC_USER_KEYRING,
2684                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2685                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2686                         goto out;
2687                 }
2688         }
2689
2690         /* Restore uid/gid back */
2691         if (uid_is_valid(uid) && uid != saved_uid) {
2692                 if (setreuid(saved_uid, -1) < 0) {
2693                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2694                         goto out;
2695                 }
2696         }
2697
2698         if (gid_is_valid(gid) && gid != saved_gid) {
2699                 if (setregid(saved_gid, -1) < 0)
2700                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2701         }
2702
2703         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2704         if (!sd_id128_is_null(u->invocation_id)) {
2705                 key_serial_t key;
2706
2707                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2708                 if (key == -1)
2709                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2710                 else {
2711                         if (keyctl(KEYCTL_SETPERM, key,
2712                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2713                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2714                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2715                 }
2716         }
2717
2718 out:
2719         /* Revert back uid & gid for the the last time, and exit */
2720         /* no extra logging, as only the first already reported error matters */
2721         if (getuid() != saved_uid)
2722                 (void) setreuid(saved_uid, -1);
2723
2724         if (getgid() != saved_gid)
2725                 (void) setregid(saved_gid, -1);
2726
2727         return r;
2728 }
2729
/* Appends the valid (i.e. non-negative) file descriptors of 'pair' to 'array', in order, advancing the
 * element counter *n for each one added. The caller must make sure the array has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t i;

        assert(array);
        assert(n);
        assert(pair);

        for (i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
2740
2741 static int close_remaining_fds(
2742                 const ExecParameters *params,
2743                 const ExecRuntime *runtime,
2744                 const DynamicCreds *dcreds,
2745                 int user_lookup_fd,
2746                 int socket_fd,
2747                 int exec_fd,
2748                 int *fds, size_t n_fds) {
2749
2750         size_t n_dont_close = 0;
2751         int dont_close[n_fds + 12];
2752
2753         assert(params);
2754
2755         if (params->stdin_fd >= 0)
2756                 dont_close[n_dont_close++] = params->stdin_fd;
2757         if (params->stdout_fd >= 0)
2758                 dont_close[n_dont_close++] = params->stdout_fd;
2759         if (params->stderr_fd >= 0)
2760                 dont_close[n_dont_close++] = params->stderr_fd;
2761
2762         if (socket_fd >= 0)
2763                 dont_close[n_dont_close++] = socket_fd;
2764         if (exec_fd >= 0)
2765                 dont_close[n_dont_close++] = exec_fd;
2766         if (n_fds > 0) {
2767                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2768                 n_dont_close += n_fds;
2769         }
2770
2771         if (runtime)
2772                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2773
2774         if (dcreds) {
2775                 if (dcreds->user)
2776                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2777                 if (dcreds->group)
2778                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2779         }
2780
2781         if (user_lookup_fd >= 0)
2782                 dont_close[n_dont_close++] = user_lookup_fd;
2783
2784         return close_all_fds(dont_close, n_dont_close);
2785 }
2786
2787 static int send_user_lookup(
2788                 Unit *unit,
2789                 int user_lookup_fd,
2790                 uid_t uid,
2791                 gid_t gid) {
2792
2793         assert(unit);
2794
2795         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2796          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2797          * specified. */
2798
2799         if (user_lookup_fd < 0)
2800                 return 0;
2801
2802         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2803                 return 0;
2804
2805         if (writev(user_lookup_fd,
2806                (struct iovec[]) {
2807                            IOVEC_INIT(&uid, sizeof(uid)),
2808                            IOVEC_INIT(&gid, sizeof(gid)),
2809                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2810                 return -errno;
2811
2812         return 0;
2813 }
2814
2815 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2816         int r;
2817
2818         assert(c);
2819         assert(home);
2820         assert(buf);
2821
2822         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2823
2824         if (*home)
2825                 return 0;
2826
2827         if (!c->working_directory_home)
2828                 return 0;
2829
2830         r = get_home_dir(buf);
2831         if (r < 0)
2832                 return r;
2833
2834         *home = *buf;
2835         return 1;
2836 }
2837
2838 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2839         _cleanup_strv_free_ char ** list = NULL;
2840         ExecDirectoryType t;
2841         int r;
2842
2843         assert(c);
2844         assert(p);
2845         assert(ret);
2846
2847         assert(c->dynamic_user);
2848
2849         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2850          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2851          * directories. */
2852
2853         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2854                 char **i;
2855
2856                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2857                         continue;
2858
2859                 if (!p->prefix[t])
2860                         continue;
2861
2862                 STRV_FOREACH(i, c->directories[t].paths) {
2863                         char *e;
2864
2865                         if (exec_directory_is_private(c, t))
2866                                 e = path_join(p->prefix[t], "private", *i);
2867                         else
2868                                 e = path_join(p->prefix[t], *i);
2869                         if (!e)
2870                                 return -ENOMEM;
2871
2872                         r = strv_consume(&list, e);
2873                         if (r < 0)
2874                                 return r;
2875                 }
2876         }
2877
2878         *ret = TAKE_PTR(list);
2879
2880         return 0;
2881 }
2882
2883 static char *exec_command_line(char **argv);
2884
2885 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2886         bool using_subcgroup;
2887         char *p;
2888
2889         assert(params);
2890         assert(ret);
2891
2892         if (!params->cgroup_path)
2893                 return -EINVAL;
2894
2895         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2896          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2897          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2898          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2899          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2900          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2901          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2902          * flag, which is only passed for the former statements, not for the latter. */
2903
2904         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2905         if (using_subcgroup)
2906                 p = path_join(params->cgroup_path, ".control");
2907         else
2908                 p = strdup(params->cgroup_path);
2909         if (!p)
2910                 return -ENOMEM;
2911
2912         *ret = p;
2913         return using_subcgroup;
2914 }
2915
2916 static int exec_child(
2917                 Unit *unit,
2918                 const ExecCommand *command,
2919                 const ExecContext *context,
2920                 const ExecParameters *params,
2921                 ExecRuntime *runtime,
2922                 DynamicCreds *dcreds,
2923                 int socket_fd,
2924                 const int named_iofds[static 3],
2925                 int *fds,
2926                 size_t n_socket_fds,
2927                 size_t n_storage_fds,
2928                 char **files_env,
2929                 int user_lookup_fd,
2930                 int *exit_status) {
2931
2932         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2933         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2934         _cleanup_free_ gid_t *supplementary_gids = NULL;
2935         const char *username = NULL, *groupname = NULL;
2936         _cleanup_free_ char *home_buffer = NULL;
2937         const char *home = NULL, *shell = NULL;
2938         char **final_argv = NULL;
2939         dev_t journal_stream_dev = 0;
2940         ino_t journal_stream_ino = 0;
2941         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2942                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2943                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2944                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2945 #if HAVE_SELINUX
2946         _cleanup_free_ char *mac_selinux_context_net = NULL;
2947         bool use_selinux = false;
2948 #endif
2949 #if ENABLE_SMACK
2950         bool use_smack = false;
2951 #endif
2952 #if HAVE_APPARMOR
2953         bool use_apparmor = false;
2954 #endif
2955         uid_t uid = UID_INVALID;
2956         gid_t gid = GID_INVALID;
2957         size_t n_fds;
2958         ExecDirectoryType dt;
2959         int secure_bits;
2960
2961         assert(unit);
2962         assert(command);
2963         assert(context);
2964         assert(params);
2965         assert(exit_status);
2966
2967         rename_process_from_path(command->path);
2968
2969         /* We reset exactly these signals, since they are the
2970          * only ones we set to SIG_IGN in the main daemon. All
2971          * others we leave untouched because we set them to
2972          * SIG_DFL or a valid handler initially, both of which
2973          * will be demoted to SIG_DFL. */
2974         (void) default_signals(SIGNALS_CRASH_HANDLER,
2975                                SIGNALS_IGNORE, -1);
2976
2977         if (context->ignore_sigpipe)
2978                 (void) ignore_signals(SIGPIPE, -1);
2979
2980         r = reset_signal_mask();
2981         if (r < 0) {
2982                 *exit_status = EXIT_SIGNAL_MASK;
2983                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2984         }
2985
2986         if (params->idle_pipe)
2987                 do_idle_pipe_dance(params->idle_pipe);
2988
2989         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2990          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2991          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2992          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2993
2994         log_forget_fds();
2995         log_set_open_when_needed(true);
2996
2997         /* In case anything used libc syslog(), close this here, too */
2998         closelog();
2999
3000         n_fds = n_socket_fds + n_storage_fds;
3001         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3002         if (r < 0) {
3003                 *exit_status = EXIT_FDS;
3004                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3005         }
3006
3007         if (!context->same_pgrp)
3008                 if (setsid() < 0) {
3009                         *exit_status = EXIT_SETSID;
3010                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3011                 }
3012
3013         exec_context_tty_reset(context, params);
3014
3015         if (unit_shall_confirm_spawn(unit)) {
3016                 const char *vc = params->confirm_spawn;
3017                 _cleanup_free_ char *cmdline = NULL;
3018
3019                 cmdline = exec_command_line(command->argv);
3020                 if (!cmdline) {
3021                         *exit_status = EXIT_MEMORY;
3022                         return log_oom();
3023                 }
3024
3025                 r = ask_for_confirmation(vc, unit, cmdline);
3026                 if (r != CONFIRM_EXECUTE) {
3027                         if (r == CONFIRM_PRETEND_SUCCESS) {
3028                                 *exit_status = EXIT_SUCCESS;
3029                                 return 0;
3030                         }
3031                         *exit_status = EXIT_CONFIRM;
3032                         log_unit_error(unit, "Execution cancelled by the user");
3033                         return -ECANCELED;
3034                 }
3035         }
3036
3037         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3038          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3039          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3040          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3041          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3042         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3043             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3044                 *exit_status = EXIT_MEMORY;
3045                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3046         }
3047
3048         if (context->dynamic_user && dcreds) {
3049                 _cleanup_strv_free_ char **suggested_paths = NULL;
3050
3051                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3052                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3053                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3054                         *exit_status = EXIT_USER;
3055                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3056                 }
3057
3058                 r = compile_suggested_paths(context, params, &suggested_paths);
3059                 if (r < 0) {
3060                         *exit_status = EXIT_MEMORY;
3061                         return log_oom();
3062                 }
3063
3064                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3065                 if (r < 0) {
3066                         *exit_status = EXIT_USER;
3067                         if (r == -EILSEQ) {
3068                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3069                                 return -EOPNOTSUPP;
3070                         }
3071                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3072                 }
3073
3074                 if (!uid_is_valid(uid)) {
3075                         *exit_status = EXIT_USER;
3076                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3077                         return -ESRCH;
3078                 }
3079
3080                 if (!gid_is_valid(gid)) {
3081                         *exit_status = EXIT_USER;
3082                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3083                         return -ESRCH;
3084                 }
3085
3086                 if (dcreds->user)
3087                         username = dcreds->user->name;
3088
3089         } else {
3090                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3091                 if (r < 0) {
3092                         *exit_status = EXIT_USER;
3093                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3094                 }
3095
3096                 r = get_fixed_group(context, &groupname, &gid);
3097                 if (r < 0) {
3098                         *exit_status = EXIT_GROUP;
3099                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3100                 }
3101         }
3102
3103         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3104         r = get_supplementary_groups(context, username, groupname, gid,
3105                                      &supplementary_gids, &ngids);
3106         if (r < 0) {
3107                 *exit_status = EXIT_GROUP;
3108                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3109         }
3110
3111         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3112         if (r < 0) {
3113                 *exit_status = EXIT_USER;
3114                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3115         }
3116
3117         user_lookup_fd = safe_close(user_lookup_fd);
3118
3119         r = acquire_home(context, uid, &home, &home_buffer);
3120         if (r < 0) {
3121                 *exit_status = EXIT_CHDIR;
3122                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3123         }
3124
3125         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3126          * must sure to drop O_NONBLOCK */
3127         if (socket_fd >= 0)
3128                 (void) fd_nonblock(socket_fd, false);
3129
3130         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3131          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3132         if (params->cgroup_path) {
3133                 _cleanup_free_ char *p = NULL;
3134
3135                 r = exec_parameters_get_cgroup_path(params, &p);
3136                 if (r < 0) {
3137                         *exit_status = EXIT_CGROUP;
3138                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3139                 }
3140
3141                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3142                 if (r < 0) {
3143                         *exit_status = EXIT_CGROUP;
3144                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3145                 }
3146         }
3147
3148         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3149                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3150                 if (r < 0) {
3151                         *exit_status = EXIT_NETWORK;
3152                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3153                 }
3154         }
3155
3156         r = setup_input(context, params, socket_fd, named_iofds);
3157         if (r < 0) {
3158                 *exit_status = EXIT_STDIN;
3159                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3160         }
3161
3162         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3163         if (r < 0) {
3164                 *exit_status = EXIT_STDOUT;
3165                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3166         }
3167
3168         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3169         if (r < 0) {
3170                 *exit_status = EXIT_STDERR;
3171                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3172         }
3173
3174         if (context->oom_score_adjust_set) {
3175                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3176                  * prohibit write access to this file, and we shouldn't trip up over that. */
3177                 r = set_oom_score_adjust(context->oom_score_adjust);
3178                 if (IN_SET(r, -EPERM, -EACCES))
3179                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3180                 else if (r < 0) {
3181                         *exit_status = EXIT_OOM_ADJUST;
3182                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3183                 }
3184         }
3185
3186         if (context->nice_set)
3187                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3188                         *exit_status = EXIT_NICE;
3189                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3190                 }
3191
3192         if (context->cpu_sched_set) {
3193                 struct sched_param param = {
3194                         .sched_priority = context->cpu_sched_priority,
3195                 };
3196
3197                 r = sched_setscheduler(0,
3198                                        context->cpu_sched_policy |
3199                                        (context->cpu_sched_reset_on_fork ?
3200                                         SCHED_RESET_ON_FORK : 0),
3201                                        &param);
3202                 if (r < 0) {
3203                         *exit_status = EXIT_SETSCHEDULER;
3204                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3205                 }
3206         }
3207
3208         if (context->cpu_set.set)
3209                 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3210                         *exit_status = EXIT_CPUAFFINITY;
3211                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3212                 }
3213
3214         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3215                 r = apply_numa_policy(&context->numa_policy);
3216                 if (r == -EOPNOTSUPP)
3217                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3218                 else if (r < 0) {
3219                         *exit_status = EXIT_NUMA_POLICY;
3220                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3221                 }
3222         }
3223
3224         if (context->ioprio_set)
3225                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3226                         *exit_status = EXIT_IOPRIO;
3227                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3228                 }
3229
3230         if (context->timer_slack_nsec != NSEC_INFINITY)
3231                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3232                         *exit_status = EXIT_TIMERSLACK;
3233                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3234                 }
3235
3236         if (context->personality != PERSONALITY_INVALID) {
3237                 r = safe_personality(context->personality);
3238                 if (r < 0) {
3239                         *exit_status = EXIT_PERSONALITY;
3240                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3241                 }
3242         }
3243
3244         if (context->utmp_id)
3245                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3246                                       context->tty_path,
3247                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3248                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3249                                       USER_PROCESS,
3250                                       username);
3251
3252         if (uid_is_valid(uid)) {
3253                 r = chown_terminal(STDIN_FILENO, uid);
3254                 if (r < 0) {
3255                         *exit_status = EXIT_STDIN;
3256                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3257                 }
3258         }
3259
3260         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3261          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3262          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3263          * touch a single hierarchy too. */
3264         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3265                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3266                 if (r < 0) {
3267                         *exit_status = EXIT_CGROUP;
3268                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3269                 }
3270         }
3271
3272         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3273                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3274                 if (r < 0)
3275                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3276         }
3277
3278         r = build_environment(
3279                         unit,
3280                         context,
3281                         params,
3282                         n_fds,
3283                         home,
3284                         username,
3285                         shell,
3286                         journal_stream_dev,
3287                         journal_stream_ino,
3288                         &our_env);
3289         if (r < 0) {
3290                 *exit_status = EXIT_MEMORY;
3291                 return log_oom();
3292         }
3293
3294         r = build_pass_environment(context, &pass_env);
3295         if (r < 0) {
3296                 *exit_status = EXIT_MEMORY;
3297                 return log_oom();
3298         }
3299
3300         accum_env = strv_env_merge(5,
3301                                    params->environment,
3302                                    our_env,
3303                                    pass_env,
3304                                    context->environment,
3305                                    files_env,
3306                                    NULL);
3307         if (!accum_env) {
3308                 *exit_status = EXIT_MEMORY;
3309                 return log_oom();
3310         }
3311         accum_env = strv_env_clean(accum_env);
3312
3313         (void) umask(context->umask);
3314
3315         r = setup_keyring(unit, context, params, uid, gid);
3316         if (r < 0) {
3317                 *exit_status = EXIT_KEYRING;
3318                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3319         }
3320
3321         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3322         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3323
3324         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3325         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3326
3327         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3328         if (needs_ambient_hack)
3329                 needs_setuid = false;
3330         else
3331                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3332
3333         if (needs_sandboxing) {
3334                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3335                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3336                  * impacting our own code paths. */
3337
3338 #if HAVE_SELINUX
3339                 use_selinux = mac_selinux_use();
3340 #endif
3341 #if ENABLE_SMACK
3342                 use_smack = mac_smack_use();
3343 #endif
3344 #if HAVE_APPARMOR
3345                 use_apparmor = mac_apparmor_use();
3346 #endif
3347         }
3348
3349         if (needs_sandboxing) {
3350                 int which_failed;
3351
3352                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3353                  * is set here. (See below.) */
3354
3355                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3356                 if (r < 0) {
3357                         *exit_status = EXIT_LIMITS;
3358                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3359                 }
3360         }
3361
3362         if (needs_setuid) {
3363
3364                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3365                  * wins here. (See above.) */
3366
3367                 if (context->pam_name && username) {
3368                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3369                         if (r < 0) {
3370                                 *exit_status = EXIT_PAM;
3371                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3372                         }
3373                 }
3374         }
3375
3376         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3377
3378                 if (ns_type_supported(NAMESPACE_NET)) {
3379                         r = setup_netns(runtime->netns_storage_socket);
3380                         if (r < 0) {
3381                                 *exit_status = EXIT_NETWORK;
3382                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3383                         }
3384                 } else if (context->network_namespace_path) {
3385                         *exit_status = EXIT_NETWORK;
3386                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3387                 } else
3388                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3389         }
3390
3391         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3392         if (needs_mount_namespace) {
3393                 _cleanup_free_ char *error_path = NULL;
3394
3395                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3396                 if (r < 0) {
3397                         *exit_status = EXIT_NAMESPACE;
3398                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3399                                                     error_path ? ": " : "", strempty(error_path));
3400                 }
3401         }
3402
3403         if (context->protect_hostname) {
3404                 if (ns_type_supported(NAMESPACE_UTS)) {
3405                         if (unshare(CLONE_NEWUTS) < 0) {
3406                                 *exit_status = EXIT_NAMESPACE;
3407                                 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3408                         }
3409                 } else
3410                         log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3411 #if HAVE_SECCOMP
3412                 r = seccomp_protect_hostname();
3413                 if (r < 0) {
3414                         *exit_status = EXIT_SECCOMP;
3415                         return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3416                 }
3417 #endif
3418         }
3419
3420         /* Drop groups as early as possible */
3421         if (needs_setuid) {
3422                 r = enforce_groups(gid, supplementary_gids, ngids);
3423                 if (r < 0) {
3424                         *exit_status = EXIT_GROUP;
3425                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3426                 }
3427         }
3428
3429         if (needs_sandboxing) {
3430 #if HAVE_SELINUX
3431                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3432                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3433                         if (r < 0) {
3434                                 *exit_status = EXIT_SELINUX_CONTEXT;
3435                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3436                         }
3437                 }
3438 #endif
3439
3440                 if (context->private_users) {
3441                         r = setup_private_users(uid, gid);
3442                         if (r < 0) {
3443                                 *exit_status = EXIT_USER;
3444                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3445                         }
3446                 }
3447         }
3448
3449         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3450          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3451          * however if we have it as we want to keep it open until the final execve(). */
3452
3453         if (params->exec_fd >= 0) {
3454                 exec_fd = params->exec_fd;
3455
3456                 if (exec_fd < 3 + (int) n_fds) {
3457                         int moved_fd;
3458
3459                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3460                          * process we are about to execute. */
3461
3462                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3463                         if (moved_fd < 0) {
3464                                 *exit_status = EXIT_FDS;
3465                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3466                         }
3467
3468                         safe_close(exec_fd);
3469                         exec_fd = moved_fd;
3470                 } else {
3471                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3472                         r = fd_cloexec(exec_fd, true);
3473                         if (r < 0) {
3474                                 *exit_status = EXIT_FDS;
3475                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3476                         }
3477                 }
3478
3479                 fds_with_exec_fd = newa(int, n_fds + 1);
3480                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3481                 fds_with_exec_fd[n_fds] = exec_fd;
3482                 n_fds_with_exec_fd = n_fds + 1;
3483         } else {
3484                 fds_with_exec_fd = fds;
3485                 n_fds_with_exec_fd = n_fds;
3486         }
3487
3488         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3489         if (r >= 0)
3490                 r = shift_fds(fds, n_fds);
3491         if (r >= 0)
3492                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3493         if (r < 0) {
3494                 *exit_status = EXIT_FDS;
3495                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3496         }
3497
3498         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3499          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3500          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3501          * came this far. */
3502
3503         secure_bits = context->secure_bits;
3504
3505         if (needs_sandboxing) {
3506                 uint64_t bset;
3507
3508                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3509                  * requested. (Note this is placed after the general resource limit initialization, see
3510                  * above, in order to take precedence.) */
3511                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3512                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3513                                 *exit_status = EXIT_LIMITS;
3514                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3515                         }
3516                 }
3517
3518 #if ENABLE_SMACK
3519                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3520                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3521                 if (use_smack) {
3522                         r = setup_smack(context, command);
3523                         if (r < 0) {
3524                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3525                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3526                         }
3527                 }
3528 #endif
3529
3530                 bset = context->capability_bounding_set;
3531                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3532                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3533                  * instead of us doing that */
3534                 if (needs_ambient_hack)
3535                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3536                                 (UINT64_C(1) << CAP_SETUID) |
3537                                 (UINT64_C(1) << CAP_SETGID);
3538
3539                 if (!cap_test_all(bset)) {
3540                         r = capability_bounding_set_drop(bset, false);
3541                         if (r < 0) {
3542                                 *exit_status = EXIT_CAPABILITIES;
3543                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3544                         }
3545                 }
3546
3547                 /* This is done before enforce_user, but ambient set
3548                  * does not survive over setresuid() if keep_caps is not set. */
3549                 if (!needs_ambient_hack &&
3550                     context->capability_ambient_set != 0) {
3551                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3552                         if (r < 0) {
3553                                 *exit_status = EXIT_CAPABILITIES;
3554                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3555                         }
3556                 }
3557         }
3558
3559         /* chroot to root directory first, before we lose the ability to chroot */
3560         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3561         if (r < 0)
3562                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3563
3564         if (needs_setuid) {
3565                 if (uid_is_valid(uid)) {
3566                         r = enforce_user(context, uid);
3567                         if (r < 0) {
3568                                 *exit_status = EXIT_USER;
3569                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3570                         }
3571
3572                         if (!needs_ambient_hack &&
3573                             context->capability_ambient_set != 0) {
3574
3575                                 /* Fix the ambient capabilities after user change. */
3576                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3577                                 if (r < 0) {
3578                                         *exit_status = EXIT_CAPABILITIES;
3579                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3580                                 }
3581
3582                                 /* If we were asked to change user and ambient capabilities
3583                                  * were requested, we had to add keep-caps to the securebits
3584                                  * so that we would maintain the inherited capability set
3585                                  * through the setresuid(). Make sure that the bit is added
3586                                  * also to the context secure_bits so that we don't try to
3587                                  * drop the bit away next. */
3588
3589                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3590                         }
3591                 }
3592         }
3593
3594         /* Apply working directory here, because the working directory might be on NFS and only the user running
3595          * this service might have the correct privilege to change to the working directory */
3596         r = apply_working_directory(context, params, home, exit_status);
3597         if (r < 0)
3598                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3599
3600         if (needs_sandboxing) {
3601                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3602                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3603                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3604                  * are restricted. */
3605
3606 #if HAVE_SELINUX
3607                 if (use_selinux) {
3608                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3609
3610                         if (exec_context) {
3611                                 r = setexeccon(exec_context);
3612                                 if (r < 0) {
3613                                         *exit_status = EXIT_SELINUX_CONTEXT;
3614                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3615                                 }
3616                         }
3617                 }
3618 #endif
3619
3620 #if HAVE_APPARMOR
3621                 if (use_apparmor && context->apparmor_profile) {
3622                         r = aa_change_onexec(context->apparmor_profile);
3623                         if (r < 0 && !context->apparmor_profile_ignore) {
3624                                 *exit_status = EXIT_APPARMOR_PROFILE;
3625                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3626                         }
3627                 }
3628 #endif
3629
3630                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3631                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3632                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3633                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3634                                 *exit_status = EXIT_SECUREBITS;
3635                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3636                         }
3637
3638                 if (context_has_no_new_privileges(context))
3639                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3640                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3641                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3642                         }
3643
3644 #if HAVE_SECCOMP
3645                 r = apply_address_families(unit, context);
3646                 if (r < 0) {
3647                         *exit_status = EXIT_ADDRESS_FAMILIES;
3648                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3649                 }
3650
3651                 r = apply_memory_deny_write_execute(unit, context);
3652                 if (r < 0) {
3653                         *exit_status = EXIT_SECCOMP;
3654                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3655                 }
3656
3657                 r = apply_restrict_realtime(unit, context);
3658                 if (r < 0) {
3659                         *exit_status = EXIT_SECCOMP;
3660                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3661                 }
3662
3663                 r = apply_restrict_suid_sgid(unit, context);
3664                 if (r < 0) {
3665                         *exit_status = EXIT_SECCOMP;
3666                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3667                 }
3668
3669                 r = apply_restrict_namespaces(unit, context);
3670                 if (r < 0) {
3671                         *exit_status = EXIT_SECCOMP;
3672                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3673                 }
3674
3675                 r = apply_protect_sysctl(unit, context);
3676                 if (r < 0) {
3677                         *exit_status = EXIT_SECCOMP;
3678                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3679                 }
3680
3681                 r = apply_protect_kernel_modules(unit, context);
3682                 if (r < 0) {
3683                         *exit_status = EXIT_SECCOMP;
3684                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3685                 }
3686
3687                 r = apply_private_devices(unit, context);
3688                 if (r < 0) {
3689                         *exit_status = EXIT_SECCOMP;
3690                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3691                 }
3692
3693                 r = apply_syscall_archs(unit, context);
3694                 if (r < 0) {
3695                         *exit_status = EXIT_SECCOMP;
3696                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3697                 }
3698
3699                 r = apply_lock_personality(unit, context);
3700                 if (r < 0) {
3701                         *exit_status = EXIT_SECCOMP;
3702                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3703                 }
3704
3705                 /* This really should remain the last step before the execve(), to make sure our own code is affected
3706                  * by the filter as little as possible. */
3707                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3708                 if (r < 0) {
3709                         *exit_status = EXIT_SECCOMP;
3710                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3711                 }
3712 #endif
3713         }
3714
3715         if (!strv_isempty(context->unset_environment)) {
3716                 char **ee = NULL;
3717
3718                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3719                 if (!ee) {
3720                         *exit_status = EXIT_MEMORY;
3721                         return log_oom();
3722                 }
3723
3724                 strv_free_and_replace(accum_env, ee);
3725         }
3726
3727         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3728                 replaced_argv = replace_env_argv(command->argv, accum_env);
3729                 if (!replaced_argv) {
3730                         *exit_status = EXIT_MEMORY;
3731                         return log_oom();
3732                 }
3733                 final_argv = replaced_argv;
3734         } else
3735                 final_argv = command->argv;
3736
3737         if (DEBUG_LOGGING) {
3738                 _cleanup_free_ char *line;
3739
3740                 line = exec_command_line(final_argv);
3741                 if (line)
3742                         log_struct(LOG_DEBUG,
3743                                    "EXECUTABLE=%s", command->path,
3744                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3745                                    LOG_UNIT_ID(unit),
3746                                    LOG_UNIT_INVOCATION_ID(unit));
3747         }
3748
3749         if (exec_fd >= 0) {
3750                 uint8_t hot = 1;
3751
3752                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3753                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3754
3755                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3756                         *exit_status = EXIT_EXEC;
3757                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3758                 }
3759         }
3760
3761         execve(command->path, final_argv, accum_env);
3762         r = -errno;
3763
3764         if (exec_fd >= 0) {
3765                 uint8_t hot = 0;
3766
3767                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3768                  * that POLLHUP on it no longer means execve() succeeded. */
3769
3770                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3771                         *exit_status = EXIT_EXEC;
3772                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3773                 }
3774         }
3775
3776         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3777                 log_struct_errno(LOG_INFO, r,
3778                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3779                                  LOG_UNIT_ID(unit),
3780                                  LOG_UNIT_INVOCATION_ID(unit),
3781                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3782                                                   command->path),
3783                                  "EXECUTABLE=%s", command->path);
3784                 return 0;
3785         }
3786
3787         *exit_status = EXIT_EXEC;
3788         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3789 }
3790
3791 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); /* Forward declaration: called by exec_spawn() below. */
3792 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); /* Forward declaration: called by exec_spawn() below, which treats a negative return as failure. */
3793
3794 int exec_spawn(Unit *unit,
3795                ExecCommand *command,
3796                const ExecContext *context,
3797                const ExecParameters *params,
3798                ExecRuntime *runtime,
3799                DynamicCreds *dcreds,
3800                pid_t *ret) {
3801
3802         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3803         _cleanup_free_ char *subcgroup_path = NULL;
3804         _cleanup_strv_free_ char **files_env = NULL;
3805         size_t n_storage_fds = 0, n_socket_fds = 0;
3806         _cleanup_free_ char *line = NULL;
3807         pid_t pid;
3808
3809         assert(unit);
3810         assert(command);
3811         assert(context);
3812         assert(ret);
3813         assert(params);
3814         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3815
3816         if (context->std_input == EXEC_INPUT_SOCKET ||
3817             context->std_output == EXEC_OUTPUT_SOCKET ||
3818             context->std_error == EXEC_OUTPUT_SOCKET) {
3819
3820                 if (params->n_socket_fds > 1) {
3821                         log_unit_error(unit, "Got more than one socket.");
3822                         return -EINVAL;
3823                 }
3824
3825                 if (params->n_socket_fds == 0) {
3826                         log_unit_error(unit, "Got no socket.");
3827                         return -EINVAL;
3828                 }
3829
3830                 socket_fd = params->fds[0];
3831         } else {
3832                 socket_fd = -1;
3833                 fds = params->fds;
3834                 n_socket_fds = params->n_socket_fds;
3835                 n_storage_fds = params->n_storage_fds;
3836         }
3837
3838         r = exec_context_named_iofds(context, params, named_iofds);
3839         if (r < 0)
3840                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3841
3842         r = exec_context_load_environment(unit, context, &files_env);
3843         if (r < 0)
3844                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3845
3846         line = exec_command_line(command->argv);
3847         if (!line)
3848                 return log_oom();
3849
3850         log_struct(LOG_DEBUG,
3851                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3852                    "EXECUTABLE=%s", command->path,
3853                    LOG_UNIT_ID(unit),
3854                    LOG_UNIT_INVOCATION_ID(unit));
3855
3856         if (params->cgroup_path) {
3857                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3858                 if (r < 0)
3859                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3860                 if (r > 0) { /* We are using a child cgroup */
3861                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3862                         if (r < 0)
3863                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3864                 }
3865         }
3866
3867         pid = fork();
3868         if (pid < 0)
3869                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3870
3871         if (pid == 0) {
3872                 int exit_status = EXIT_SUCCESS;
3873
3874                 r = exec_child(unit,
3875                                command,
3876                                context,
3877                                params,
3878                                runtime,
3879                                dcreds,
3880                                socket_fd,
3881                                named_iofds,
3882                                fds,
3883                                n_socket_fds,
3884                                n_storage_fds,
3885                                files_env,
3886                                unit->manager->user_lookup_fds[1],
3887                                &exit_status);
3888
3889                 if (r < 0) {
3890                         const char *status =
3891                                 exit_status_to_string(exit_status,
3892                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
3893
3894                         log_struct_errno(LOG_ERR, r,
3895                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3896                                          LOG_UNIT_ID(unit),
3897                                          LOG_UNIT_INVOCATION_ID(unit),
3898                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3899                                                           status, command->path),
3900                                          "EXECUTABLE=%s", command->path);
3901                 }
3902
3903                 _exit(exit_status);
3904         }
3905
3906         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3907
3908         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3909          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3910          * process will be killed too). */
3911         if (subcgroup_path)
3912                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3913
3914         exec_status_start(&command->exec_status, pid);
3915
3916         *ret = pid;
3917         return 0;
3918 }
3919
3920 void exec_context_init(ExecContext *c) {
3921         ExecDirectoryType i;
3922
3923         assert(c);
3924
3925         c->umask = 0022;
3926         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3927         c->cpu_sched_policy = SCHED_OTHER;
3928         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3929         c->syslog_level_prefix = true;
3930         c->ignore_sigpipe = true;
3931         c->timer_slack_nsec = NSEC_INFINITY;
3932         c->personality = PERSONALITY_INVALID;
3933         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3934                 c->directories[i].mode = 0755;
3935         c->timeout_clean_usec = USEC_INFINITY;
3936         c->capability_bounding_set = CAP_ALL;
3937         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3938         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3939         c->log_level_max = -1;
3940         numa_policy_reset(&c->numa_policy);
3941 }
3942
3943 void exec_context_done(ExecContext *c) {
3944         ExecDirectoryType i;
3945         size_t l;
3946
3947         assert(c);
3948
3949         c->environment = strv_free(c->environment);
3950         c->environment_files = strv_free(c->environment_files);
3951         c->pass_environment = strv_free(c->pass_environment);
3952         c->unset_environment = strv_free(c->unset_environment);
3953
3954         rlimit_free_all(c->rlimit);
3955
3956         for (l = 0; l < 3; l++) {
3957                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3958                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3959         }
3960
3961         c->working_directory = mfree(c->working_directory);
3962         c->root_directory = mfree(c->root_directory);
3963         c->root_image = mfree(c->root_image);
3964         c->tty_path = mfree(c->tty_path);
3965         c->syslog_identifier = mfree(c->syslog_identifier);
3966         c->user = mfree(c->user);
3967         c->group = mfree(c->group);
3968
3969         c->supplementary_groups = strv_free(c->supplementary_groups);
3970
3971         c->pam_name = mfree(c->pam_name);
3972
3973         c->read_only_paths = strv_free(c->read_only_paths);
3974         c->read_write_paths = strv_free(c->read_write_paths);
3975         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3976
3977         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3978         c->bind_mounts = NULL;
3979         c->n_bind_mounts = 0;
3980         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3981         c->temporary_filesystems = NULL;
3982         c->n_temporary_filesystems = 0;
3983
3984         cpu_set_reset(&c->cpu_set);
3985         numa_policy_reset(&c->numa_policy);
3986
3987         c->utmp_id = mfree(c->utmp_id);
3988         c->selinux_context = mfree(c->selinux_context);
3989         c->apparmor_profile = mfree(c->apparmor_profile);
3990         c->smack_process_label = mfree(c->smack_process_label);
3991
3992         c->syscall_filter = hashmap_free(c->syscall_filter);
3993         c->syscall_archs = set_free(c->syscall_archs);
3994         c->address_families = set_free(c->address_families);
3995
3996         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3997                 c->directories[i].paths = strv_free(c->directories[i].paths);
3998
3999         c->log_level_max = -1;
4000
4001         exec_context_free_log_extra_fields(c);
4002
4003         c->log_rate_limit_interval_usec = 0;
4004         c->log_rate_limit_burst = 0;
4005
4006         c->stdin_data = mfree(c->stdin_data);
4007         c->stdin_data_size = 0;
4008
4009         c->network_namespace_path = mfree(c->network_namespace_path);
4010 }
4011
4012 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4013         char **i;
4014
4015         assert(c);
4016
4017         if (!runtime_prefix)
4018                 return 0;
4019
4020         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4021                 _cleanup_free_ char *p;
4022
4023                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4024                         p = path_join(runtime_prefix, "private", *i);
4025                 else
4026                         p = path_join(runtime_prefix, *i);
4027                 if (!p)
4028                         return -ENOMEM;
4029
4030                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4031                  * service next. */
4032                 (void) rm_rf(p, REMOVE_ROOT);
4033         }
4034
4035         return 0;
4036 }
4037
4038 static void exec_command_done(ExecCommand *c) {
4039         assert(c);
4040
4041         c->path = mfree(c->path);
4042         c->argv = strv_free(c->argv);
4043 }
4044
4045 void exec_command_done_array(ExecCommand *c, size_t n) {
4046         size_t i;
4047
4048         for (i = 0; i < n; i++)
4049                 exec_command_done(c+i);
4050 }
4051
4052 ExecCommand* exec_command_free_list(ExecCommand *c) {
4053         ExecCommand *i;
4054
4055         while ((i = c)) {
4056                 LIST_REMOVE(command, c, i);
4057                 exec_command_done(i);
4058                 free(i);
4059         }
4060
4061         return NULL;
4062 }
4063
4064 void exec_command_free_array(ExecCommand **c, size_t n) {
4065         size_t i;
4066
4067         for (i = 0; i < n; i++)
4068                 c[i] = exec_command_free_list(c[i]);
4069 }
4070
4071 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4072         size_t i;
4073
4074         for (i = 0; i < n; i++)
4075                 exec_status_reset(&c[i].exec_status);
4076 }
4077
4078 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4079         size_t i;
4080
4081         for (i = 0; i < n; i++) {
4082                 ExecCommand *z;
4083
4084                 LIST_FOREACH(command, z, c[i])
4085                         exec_status_reset(&z->exec_status);
4086         }
4087 }
4088
/* Context handed to invalid_env() through its userdata pointer. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* environment file the rejected assignment was read from */
} InvalidEnvInfo;
4093
4094 static void invalid_env(const char *p, void *userdata) {
4095         InvalidEnvInfo *info = userdata;
4096
4097         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4098 }
4099
4100 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4101         assert(c);
4102
4103         switch (fd_index) {
4104
4105         case STDIN_FILENO:
4106                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4107                         return NULL;
4108
4109                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4110
4111         case STDOUT_FILENO:
4112                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4113                         return NULL;
4114
4115                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4116
4117         case STDERR_FILENO:
4118                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4119                         return NULL;
4120
4121                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4122
4123         default:
4124                 return NULL;
4125         }
4126 }
4127
4128 static int exec_context_named_iofds(
4129                 const ExecContext *c,
4130                 const ExecParameters *p,
4131                 int named_iofds[static 3]) {
4132
4133         size_t i, targets;
4134         const char* stdio_fdname[3];
4135         size_t n_fds;
4136
4137         assert(c);
4138         assert(p);
4139         assert(named_iofds);
4140
4141         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4142                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4143                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4144
4145         for (i = 0; i < 3; i++)
4146                 stdio_fdname[i] = exec_context_fdname(c, i);
4147
4148         n_fds = p->n_storage_fds + p->n_socket_fds;
4149
4150         for (i = 0; i < n_fds  && targets > 0; i++)
4151                 if (named_iofds[STDIN_FILENO] < 0 &&
4152                     c->std_input == EXEC_INPUT_NAMED_FD &&
4153                     stdio_fdname[STDIN_FILENO] &&
4154                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4155
4156                         named_iofds[STDIN_FILENO] = p->fds[i];
4157                         targets--;
4158
4159                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4160                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4161                            stdio_fdname[STDOUT_FILENO] &&
4162                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4163
4164                         named_iofds[STDOUT_FILENO] = p->fds[i];
4165                         targets--;
4166
4167                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4168                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4169                            stdio_fdname[STDERR_FILENO] &&
4170                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4171
4172                         named_iofds[STDERR_FILENO] = p->fds[i];
4173                         targets--;
4174                 }
4175
4176         return targets == 0 ? 0 : -ENOENT;
4177 }
4178
4179 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4180         char **i, **r = NULL;
4181
4182         assert(c);
4183         assert(l);
4184
4185         STRV_FOREACH(i, c->environment_files) {
4186                 char *fn;
4187                 int k;
4188                 unsigned n;
4189                 bool ignore = false;
4190                 char **p;
4191                 _cleanup_globfree_ glob_t pglob = {};
4192
4193                 fn = *i;
4194
4195                 if (fn[0] == '-') {
4196                         ignore = true;
4197                         fn++;
4198                 }
4199
4200                 if (!path_is_absolute(fn)) {
4201                         if (ignore)
4202                                 continue;
4203
4204                         strv_free(r);
4205                         return -EINVAL;
4206                 }
4207
4208                 /* Filename supports globbing, take all matching files */
4209                 k = safe_glob(fn, 0, &pglob);
4210                 if (k < 0) {
4211                         if (ignore)
4212                                 continue;
4213
4214                         strv_free(r);
4215                         return k;
4216                 }
4217
4218                 /* When we don't match anything, -ENOENT should be returned */
4219                 assert(pglob.gl_pathc > 0);
4220
4221                 for (n = 0; n < pglob.gl_pathc; n++) {
4222                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4223                         if (k < 0) {
4224                                 if (ignore)
4225                                         continue;
4226
4227                                 strv_free(r);
4228                                 return k;
4229                         }
4230                         /* Log invalid environment variables with filename */
4231                         if (p) {
4232                                 InvalidEnvInfo info = {
4233                                         .unit = unit,
4234                                         .path = pglob.gl_pathv[n]
4235                                 };
4236
4237                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4238                         }
4239
4240                         if (!r)
4241                                 r = p;
4242                         else {
4243                                 char **m;
4244
4245                                 m = strv_env_merge(2, r, p);
4246                                 strv_free(r);
4247                                 strv_free(p);
4248                                 if (!m)
4249                                         return -ENOMEM;
4250
4251                                 r = m;
4252                         }
4253                 }
4254         }
4255
4256         *l = r;
4257
4258         return 0;
4259 }
4260
4261 static bool tty_may_match_dev_console(const char *tty) {
4262         _cleanup_free_ char *resolved = NULL;
4263
4264         if (!tty)
4265                 return true;
4266
4267         tty = skip_dev_prefix(tty);
4268
4269         /* trivial identity? */
4270         if (streq(tty, "console"))
4271                 return true;
4272
4273         if (resolve_dev_console(&resolved) < 0)
4274                 return true; /* if we could not resolve, assume it may */
4275
4276         /* "tty0" means the active VC, so it may be the same sometimes */
4277         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4278 }
4279
4280 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4281         assert(ec);
4282
4283         return ec->tty_reset ||
4284                 ec->tty_vhangup ||
4285                 ec->tty_vt_disallocate ||
4286                 is_terminal_input(ec->std_input) ||
4287                 is_terminal_output(ec->std_output) ||
4288                 is_terminal_output(ec->std_error);
4289 }
4290
4291 bool exec_context_may_touch_console(const ExecContext *ec) {
4292
4293         return exec_context_may_touch_tty(ec) &&
4294                tty_may_match_dev_console(exec_context_tty_path(ec));
4295 }
4296
4297 static void strv_fprintf(FILE *f, char **l) {
4298         char **g;
4299
4300         assert(f);
4301
4302         STRV_FOREACH(g, l)
4303                 fprintf(f, " %s", *g);
4304 }
4305
4306 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4307         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4308         ExecDirectoryType dt;
4309         unsigned i;
4310         int r;
4311
4312         assert(c);
4313         assert(f);
4314
4315         prefix = strempty(prefix);
4316
4317         fprintf(f,
4318                 "%sUMask: %04o\n"
4319                 "%sWorkingDirectory: %s\n"
4320                 "%sRootDirectory: %s\n"
4321                 "%sNonBlocking: %s\n"
4322                 "%sPrivateTmp: %s\n"
4323                 "%sPrivateDevices: %s\n"
4324                 "%sProtectKernelTunables: %s\n"
4325                 "%sProtectKernelModules: %s\n"
4326                 "%sProtectControlGroups: %s\n"
4327                 "%sPrivateNetwork: %s\n"
4328                 "%sPrivateUsers: %s\n"
4329                 "%sProtectHome: %s\n"
4330                 "%sProtectSystem: %s\n"
4331                 "%sMountAPIVFS: %s\n"
4332                 "%sIgnoreSIGPIPE: %s\n"
4333                 "%sMemoryDenyWriteExecute: %s\n"
4334                 "%sRestrictRealtime: %s\n"
4335                 "%sRestrictSUIDSGID: %s\n"
4336                 "%sKeyringMode: %s\n"
4337                 "%sProtectHostname: %s\n",
4338                 prefix, c->umask,
4339                 prefix, c->working_directory ? c->working_directory : "/",
4340                 prefix, c->root_directory ? c->root_directory : "/",
4341                 prefix, yes_no(c->non_blocking),
4342                 prefix, yes_no(c->private_tmp),
4343                 prefix, yes_no(c->private_devices),
4344                 prefix, yes_no(c->protect_kernel_tunables),
4345                 prefix, yes_no(c->protect_kernel_modules),
4346                 prefix, yes_no(c->protect_control_groups),
4347                 prefix, yes_no(c->private_network),
4348                 prefix, yes_no(c->private_users),
4349                 prefix, protect_home_to_string(c->protect_home),
4350                 prefix, protect_system_to_string(c->protect_system),
4351                 prefix, yes_no(c->mount_apivfs),
4352                 prefix, yes_no(c->ignore_sigpipe),
4353                 prefix, yes_no(c->memory_deny_write_execute),
4354                 prefix, yes_no(c->restrict_realtime),
4355                 prefix, yes_no(c->restrict_suid_sgid),
4356                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4357                 prefix, yes_no(c->protect_hostname));
4358
4359         if (c->root_image)
4360                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4361
4362         STRV_FOREACH(e, c->environment)
4363                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4364
4365         STRV_FOREACH(e, c->environment_files)
4366                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4367
4368         STRV_FOREACH(e, c->pass_environment)
4369                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4370
4371         STRV_FOREACH(e, c->unset_environment)
4372                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4373
4374         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4375
4376         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4377                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4378
4379                 STRV_FOREACH(d, c->directories[dt].paths)
4380                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4381         }
4382
4383         fprintf(f,
4384                 "%sTimeoutCleanSec: %s\n",
4385                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4386
4387         if (c->nice_set)
4388                 fprintf(f,
4389                         "%sNice: %i\n",
4390                         prefix, c->nice);
4391
4392         if (c->oom_score_adjust_set)
4393                 fprintf(f,
4394                         "%sOOMScoreAdjust: %i\n",
4395                         prefix, c->oom_score_adjust);
4396
4397         for (i = 0; i < RLIM_NLIMITS; i++)
4398                 if (c->rlimit[i]) {
4399                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4400                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4401                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4402                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4403                 }
4404
4405         if (c->ioprio_set) {
4406                 _cleanup_free_ char *class_str = NULL;
4407
4408                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4409                 if (r >= 0)
4410                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4411
4412                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4413         }
4414
4415         if (c->cpu_sched_set) {
4416                 _cleanup_free_ char *policy_str = NULL;
4417
4418                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4419                 if (r >= 0)
4420                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4421
4422                 fprintf(f,
4423                         "%sCPUSchedulingPriority: %i\n"
4424                         "%sCPUSchedulingResetOnFork: %s\n",
4425                         prefix, c->cpu_sched_priority,
4426                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4427         }
4428
4429         if (c->cpu_set.set) {
4430                 _cleanup_free_ char *affinity = NULL;
4431
4432                 affinity = cpu_set_to_range_string(&c->cpu_set);
4433                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4434         }
4435
4436         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4437                 _cleanup_free_ char *nodes = NULL;
4438
4439                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4440                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4441                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4442         }
4443
4444         if (c->timer_slack_nsec != NSEC_INFINITY)
4445                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4446
4447         fprintf(f,
4448                 "%sStandardInput: %s\n"
4449                 "%sStandardOutput: %s\n"
4450                 "%sStandardError: %s\n",
4451                 prefix, exec_input_to_string(c->std_input),
4452                 prefix, exec_output_to_string(c->std_output),
4453                 prefix, exec_output_to_string(c->std_error));
4454
4455         if (c->std_input == EXEC_INPUT_NAMED_FD)
4456                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4457         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4458                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4459         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4460                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4461
4462         if (c->std_input == EXEC_INPUT_FILE)
4463                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4464         if (c->std_output == EXEC_OUTPUT_FILE)
4465                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4466         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4467                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4468         if (c->std_error == EXEC_OUTPUT_FILE)
4469                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4470         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4471                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4472
4473         if (c->tty_path)
4474                 fprintf(f,
4475                         "%sTTYPath: %s\n"
4476                         "%sTTYReset: %s\n"
4477                         "%sTTYVHangup: %s\n"
4478                         "%sTTYVTDisallocate: %s\n",
4479                         prefix, c->tty_path,
4480                         prefix, yes_no(c->tty_reset),
4481                         prefix, yes_no(c->tty_vhangup),
4482                         prefix, yes_no(c->tty_vt_disallocate));
4483
4484         if (IN_SET(c->std_output,
4485                    EXEC_OUTPUT_SYSLOG,
4486                    EXEC_OUTPUT_KMSG,
4487                    EXEC_OUTPUT_JOURNAL,
4488                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4489                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4490                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4491             IN_SET(c->std_error,
4492                    EXEC_OUTPUT_SYSLOG,
4493                    EXEC_OUTPUT_KMSG,
4494                    EXEC_OUTPUT_JOURNAL,
4495                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4496                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4497                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4498
4499                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4500
4501                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4502                 if (r >= 0)
4503                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4504
4505                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4506                 if (r >= 0)
4507                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4508         }
4509
4510         if (c->log_level_max >= 0) {
4511                 _cleanup_free_ char *t = NULL;
4512
4513                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4514
4515                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4516         }
4517
4518         if (c->log_rate_limit_interval_usec > 0) {
4519                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4520
4521                 fprintf(f,
4522                         "%sLogRateLimitIntervalSec: %s\n",
4523                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4524         }
4525
4526         if (c->log_rate_limit_burst > 0)
4527                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4528
4529         if (c->n_log_extra_fields > 0) {
4530                 size_t j;
4531
4532                 for (j = 0; j < c->n_log_extra_fields; j++) {
4533                         fprintf(f, "%sLogExtraFields: ", prefix);
4534                         fwrite(c->log_extra_fields[j].iov_base,
4535                                1, c->log_extra_fields[j].iov_len,
4536                                f);
4537                         fputc('\n', f);
4538                 }
4539         }
4540
4541         if (c->secure_bits) {
4542                 _cleanup_free_ char *str = NULL;
4543
4544                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4545                 if (r >= 0)
4546                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4547         }
4548
4549         if (c->capability_bounding_set != CAP_ALL) {
4550                 _cleanup_free_ char *str = NULL;
4551
4552                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4553                 if (r >= 0)
4554                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4555         }
4556
4557         if (c->capability_ambient_set != 0) {
4558                 _cleanup_free_ char *str = NULL;
4559
4560                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4561                 if (r >= 0)
4562                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4563         }
4564
4565         if (c->user)
4566                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4567         if (c->group)
4568                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4569
4570         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4571
4572         if (!strv_isempty(c->supplementary_groups)) {
4573                 fprintf(f, "%sSupplementaryGroups:", prefix);
4574                 strv_fprintf(f, c->supplementary_groups);
4575                 fputs("\n", f);
4576         }
4577
4578         if (c->pam_name)
4579                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4580
4581         if (!strv_isempty(c->read_write_paths)) {
4582                 fprintf(f, "%sReadWritePaths:", prefix);
4583                 strv_fprintf(f, c->read_write_paths);
4584                 fputs("\n", f);
4585         }
4586
4587         if (!strv_isempty(c->read_only_paths)) {
4588                 fprintf(f, "%sReadOnlyPaths:", prefix);
4589                 strv_fprintf(f, c->read_only_paths);
4590                 fputs("\n", f);
4591         }
4592
4593         if (!strv_isempty(c->inaccessible_paths)) {
4594                 fprintf(f, "%sInaccessiblePaths:", prefix);
4595                 strv_fprintf(f, c->inaccessible_paths);
4596                 fputs("\n", f);
4597         }
4598
4599         if (c->n_bind_mounts > 0)
4600                 for (i = 0; i < c->n_bind_mounts; i++)
4601                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4602                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4603                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4604                                 c->bind_mounts[i].source,
4605                                 c->bind_mounts[i].destination,
4606                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4607
4608         if (c->n_temporary_filesystems > 0)
4609                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4610                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4611
4612                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4613                                 t->path,
4614                                 isempty(t->options) ? "" : ":",
4615                                 strempty(t->options));
4616                 }
4617
4618         if (c->utmp_id)
4619                 fprintf(f,
4620                         "%sUtmpIdentifier: %s\n",
4621                         prefix, c->utmp_id);
4622
4623         if (c->selinux_context)
4624                 fprintf(f,
4625                         "%sSELinuxContext: %s%s\n",
4626                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4627
4628         if (c->apparmor_profile)
4629                 fprintf(f,
4630                         "%sAppArmorProfile: %s%s\n",
4631                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4632
4633         if (c->smack_process_label)
4634                 fprintf(f,
4635                         "%sSmackProcessLabel: %s%s\n",
4636                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4637
4638         if (c->personality != PERSONALITY_INVALID)
4639                 fprintf(f,
4640                         "%sPersonality: %s\n",
4641                         prefix, strna(personality_to_string(c->personality)));
4642
4643         fprintf(f,
4644                 "%sLockPersonality: %s\n",
4645                 prefix, yes_no(c->lock_personality));
4646
4647         if (c->syscall_filter) {
4648 #if HAVE_SECCOMP
4649                 Iterator j;
4650                 void *id, *val;
4651                 bool first = true;
4652 #endif
4653
4654                 fprintf(f,
4655                         "%sSystemCallFilter: ",
4656                         prefix);
4657
4658                 if (!c->syscall_whitelist)
4659                         fputc('~', f);
4660
4661 #if HAVE_SECCOMP
4662                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4663                         _cleanup_free_ char *name = NULL;
4664                         const char *errno_name = NULL;
4665                         int num = PTR_TO_INT(val);
4666
4667                         if (first)
4668                                 first = false;
4669                         else
4670                                 fputc(' ', f);
4671
4672                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4673                         fputs(strna(name), f);
4674
4675                         if (num >= 0) {
4676                                 errno_name = errno_to_name(num);
4677                                 if (errno_name)
4678                                         fprintf(f, ":%s", errno_name);
4679                                 else
4680                                         fprintf(f, ":%d", num);
4681                         }
4682                 }
4683 #endif
4684
4685                 fputc('\n', f);
4686         }
4687
4688         if (c->syscall_archs) {
4689 #if HAVE_SECCOMP
4690                 Iterator j;
4691                 void *id;
4692 #endif
4693
4694                 fprintf(f,
4695                         "%sSystemCallArchitectures:",
4696                         prefix);
4697
4698 #if HAVE_SECCOMP
4699                 SET_FOREACH(id, c->syscall_archs, j)
4700                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4701 #endif
4702                 fputc('\n', f);
4703         }
4704
4705         if (exec_context_restrict_namespaces_set(c)) {
4706                 _cleanup_free_ char *s = NULL;
4707
4708                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4709                 if (r >= 0)
4710                         fprintf(f, "%sRestrictNamespaces: %s\n",
4711                                 prefix, s);
4712         }
4713
4714         if (c->network_namespace_path)
4715                 fprintf(f,
4716                         "%sNetworkNamespacePath: %s\n",
4717                         prefix, c->network_namespace_path);
4718
4719         if (c->syscall_errno > 0) {
4720                 const char *errno_name;
4721
4722                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4723
4724                 errno_name = errno_to_name(c->syscall_errno);
4725                 if (errno_name)
4726                         fprintf(f, "%s\n", errno_name);
4727                 else
4728                         fprintf(f, "%d\n", c->syscall_errno);
4729         }
4730 }
4731
4732 bool exec_context_maintains_privileges(const ExecContext *c) {
4733         assert(c);
4734
4735         /* Returns true if the process forked off would run under
4736          * an unchanged UID or as root. */
4737
4738         if (!c->user)
4739                 return true;
4740
4741         if (streq(c->user, "root") || streq(c->user, "0"))
4742                 return true;
4743
4744         return false;
4745 }
4746
4747 int exec_context_get_effective_ioprio(const ExecContext *c) {
4748         int p;
4749
4750         assert(c);
4751
4752         if (c->ioprio_set)
4753                 return c->ioprio;
4754
4755         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4756         if (p < 0)
4757                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4758
4759         return p;
4760 }
4761
4762 void exec_context_free_log_extra_fields(ExecContext *c) {
4763         size_t l;
4764
4765         assert(c);
4766
4767         for (l = 0; l < c->n_log_extra_fields; l++)
4768                 free(c->log_extra_fields[l].iov_base);
4769         c->log_extra_fields = mfree(c->log_extra_fields);
4770         c->n_log_extra_fields = 0;
4771 }
4772
4773 void exec_context_revert_tty(ExecContext *c) {
4774         int r;
4775
4776         assert(c);
4777
4778         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4779         exec_context_tty_reset(c, NULL);
4780
4781         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4782          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4783          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4784
4785         if (exec_context_may_touch_tty(c)) {
4786                 const char *path;
4787
4788                 path = exec_context_tty_path(c);
4789                 if (path) {
4790                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4791                         if (r < 0 && r != -ENOENT)
4792                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4793                 }
4794         }
4795 }
4796
4797 int exec_context_get_clean_directories(
4798                 ExecContext *c,
4799                 char **prefix,
4800                 ExecCleanMask mask,
4801                 char ***ret) {
4802
4803         _cleanup_strv_free_ char **l = NULL;
4804         ExecDirectoryType t;
4805         int r;
4806
4807         assert(c);
4808         assert(prefix);
4809         assert(ret);
4810
4811         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4812                 char **i;
4813
4814                 if (!FLAGS_SET(mask, 1U << t))
4815                         continue;
4816
4817                 if (!prefix[t])
4818                         continue;
4819
4820                 STRV_FOREACH(i, c->directories[t].paths) {
4821                         char *j;
4822
4823                         j = path_join(prefix[t], *i);
4824                         if (!j)
4825                                 return -ENOMEM;
4826
4827                         r = strv_consume(&l, j);
4828                         if (r < 0)
4829                                 return r;
4830
4831                         /* Also remove private directories unconditionally. */
4832                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
4833                                 j = path_join(prefix[t], "private", *i);
4834                                 if (!j)
4835                                         return -ENOMEM;
4836
4837                                 r = strv_consume(&l, j);
4838                                 if (r < 0)
4839                                         return r;
4840                         }
4841                 }
4842         }
4843
4844         *ret = TAKE_PTR(l);
4845         return 0;
4846 }
4847
4848 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4849         ExecCleanMask mask = 0;
4850
4851         assert(c);
4852         assert(ret);
4853
4854         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4855                 if (!strv_isempty(c->directories[t].paths))
4856                         mask |= 1U << t;
4857
4858         *ret = mask;
4859         return 0;
4860 }
4861
4862 void exec_status_start(ExecStatus *s, pid_t pid) {
4863         assert(s);
4864
4865         *s = (ExecStatus) {
4866                 .pid = pid,
4867         };
4868
4869         dual_timestamp_get(&s->start_timestamp);
4870 }
4871
4872 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4873         assert(s);
4874
4875         if (s->pid != pid) {
4876                 *s = (ExecStatus) {
4877                         .pid = pid,
4878                 };
4879         }
4880
4881         dual_timestamp_get(&s->exit_timestamp);
4882
4883         s->code = code;
4884         s->status = status;
4885
4886         if (context && context->utmp_id)
4887                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4888 }
4889
4890 void exec_status_reset(ExecStatus *s) {
4891         assert(s);
4892
4893         *s = (ExecStatus) {};
4894 }
4895
4896 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4897         char buf[FORMAT_TIMESTAMP_MAX];
4898
4899         assert(s);
4900         assert(f);
4901
4902         if (s->pid <= 0)
4903                 return;
4904
4905         prefix = strempty(prefix);
4906
4907         fprintf(f,
4908                 "%sPID: "PID_FMT"\n",
4909                 prefix, s->pid);
4910
4911         if (dual_timestamp_is_set(&s->start_timestamp))
4912                 fprintf(f,
4913                         "%sStart Timestamp: %s\n",
4914                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4915
4916         if (dual_timestamp_is_set(&s->exit_timestamp))
4917                 fprintf(f,
4918                         "%sExit Timestamp: %s\n"
4919                         "%sExit Code: %s\n"
4920                         "%sExit Status: %i\n",
4921                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4922                         prefix, sigchld_code_to_string(s->code),
4923                         prefix, s->status);
4924 }
4925
4926 static char *exec_command_line(char **argv) {
4927         size_t k;
4928         char *n, *p, **a;
4929         bool first = true;
4930
4931         assert(argv);
4932
4933         k = 1;
4934         STRV_FOREACH(a, argv)
4935                 k += strlen(*a)+3;
4936
4937         n = new(char, k);
4938         if (!n)
4939                 return NULL;
4940
4941         p = n;
4942         STRV_FOREACH(a, argv) {
4943
4944                 if (!first)
4945                         *(p++) = ' ';
4946                 else
4947                         first = false;
4948
4949                 if (strpbrk(*a, WHITESPACE)) {
4950                         *(p++) = '\'';
4951                         p = stpcpy(p, *a);
4952                         *(p++) = '\'';
4953                 } else
4954                         p = stpcpy(p, *a);
4955
4956         }
4957
4958         *p = 0;
4959
4960         /* FIXME: this doesn't really handle arguments that have
4961          * spaces and ticks in them */
4962
4963         return n;
4964 }
4965
4966 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4967         _cleanup_free_ char *cmd = NULL;
4968         const char *prefix2;
4969
4970         assert(c);
4971         assert(f);
4972
4973         prefix = strempty(prefix);
4974         prefix2 = strjoina(prefix, "\t");
4975
4976         cmd = exec_command_line(c->argv);
4977         fprintf(f,
4978                 "%sCommand Line: %s\n",
4979                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
4980
4981         exec_status_dump(&c->exec_status, f, prefix2);
4982 }
4983
4984 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4985         assert(f);
4986
4987         prefix = strempty(prefix);
4988
4989         LIST_FOREACH(command, c, c)
4990                 exec_command_dump(c, f, prefix);
4991 }
4992
4993 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4994         ExecCommand *end;
4995
4996         assert(l);
4997         assert(e);
4998
4999         if (*l) {
5000                 /* It's kind of important, that we keep the order here */
5001                 LIST_FIND_TAIL(command, *l, end);
5002                 LIST_INSERT_AFTER(command, *l, end, e);
5003         } else
5004               *l = e;
5005 }
5006
5007 int exec_command_set(ExecCommand *c, const char *path, ...) {
5008         va_list ap;
5009         char **l, *p;
5010
5011         assert(c);
5012         assert(path);
5013
5014         va_start(ap, path);
5015         l = strv_new_ap(path, ap);
5016         va_end(ap);
5017
5018         if (!l)
5019                 return -ENOMEM;
5020
5021         p = strdup(path);
5022         if (!p) {
5023                 strv_free(l);
5024                 return -ENOMEM;
5025         }
5026
5027         free_and_replace(c->path, p);
5028
5029         return strv_free_and_replace(c->argv, l);
5030 }
5031
5032 int exec_command_append(ExecCommand *c, const char *path, ...) {
5033         _cleanup_strv_free_ char **l = NULL;
5034         va_list ap;
5035         int r;
5036
5037         assert(c);
5038         assert(path);
5039
5040         va_start(ap, path);
5041         l = strv_new_ap(path, ap);
5042         va_end(ap);
5043
5044         if (!l)
5045                 return -ENOMEM;
5046
5047         r = strv_extend_strv(&c->argv, l, false);
5048         if (r < 0)
5049                 return r;
5050
5051         return 0;
5052 }
5053
5054 static void *remove_tmpdir_thread(void *p) {
5055         _cleanup_free_ char *path = p;
5056
5057         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5058         return NULL;
5059 }
5060
5061 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5062         int r;
5063
5064         if (!rt)
5065                 return NULL;
5066
5067         if (rt->manager)
5068                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5069
5070         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5071         if (destroy && rt->tmp_dir) {
5072                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5073
5074                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5075                 if (r < 0) {
5076                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5077                         free(rt->tmp_dir);
5078                 }
5079
5080                 rt->tmp_dir = NULL;
5081         }
5082
5083         if (destroy && rt->var_tmp_dir) {
5084                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5085
5086                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5087                 if (r < 0) {
5088                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5089                         free(rt->var_tmp_dir);
5090                 }
5091
5092                 rt->var_tmp_dir = NULL;
5093         }
5094
5095         rt->id = mfree(rt->id);
5096         rt->tmp_dir = mfree(rt->tmp_dir);
5097         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5098         safe_close_pair(rt->netns_storage_socket);
5099         return mfree(rt);
5100 }
5101
5102 static void exec_runtime_freep(ExecRuntime **rt) {
5103         (void) exec_runtime_free(*rt, false);
5104 }
5105
5106 static int exec_runtime_allocate(ExecRuntime **ret) {
5107         ExecRuntime *n;
5108
5109         assert(ret);
5110
5111         n = new(ExecRuntime, 1);
5112         if (!n)
5113                 return -ENOMEM;
5114
5115         *n = (ExecRuntime) {
5116                 .netns_storage_socket = { -1, -1 },
5117         };
5118
5119         *ret = n;
5120         return 0;
5121 }
5122
5123 static int exec_runtime_add(
5124                 Manager *m,
5125                 const char *id,
5126                 const char *tmp_dir,
5127                 const char *var_tmp_dir,
5128                 const int netns_storage_socket[2],
5129                 ExecRuntime **ret) {
5130
5131         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5132         int r;
5133
5134         assert(m);
5135         assert(id);
5136
5137         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5138         if (r < 0)
5139                 return r;
5140
5141         r = exec_runtime_allocate(&rt);
5142         if (r < 0)
5143                 return r;
5144
5145         rt->id = strdup(id);
5146         if (!rt->id)
5147                 return -ENOMEM;
5148
5149         if (tmp_dir) {
5150                 rt->tmp_dir = strdup(tmp_dir);
5151                 if (!rt->tmp_dir)
5152                         return -ENOMEM;
5153
5154                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5155                 assert(var_tmp_dir);
5156                 rt->var_tmp_dir = strdup(var_tmp_dir);
5157                 if (!rt->var_tmp_dir)
5158                         return -ENOMEM;
5159         }
5160
5161         if (netns_storage_socket) {
5162                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5163                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5164         }
5165
5166         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5167         if (r < 0)
5168                 return r;
5169
5170         rt->manager = m;
5171
5172         if (ret)
5173                 *ret = rt;
5174
5175         /* do not remove created ExecRuntime object when the operation succeeds. */
5176         rt = NULL;
5177         return 0;
5178 }
5179
5180 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5181         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5182         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5183         int r;
5184
5185         assert(m);
5186         assert(c);
5187         assert(id);
5188
5189         /* It is not necessary to create ExecRuntime object. */
5190         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5191                 return 0;
5192
5193         if (c->private_tmp) {
5194                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5195                 if (r < 0)
5196                         return r;
5197         }
5198
5199         if (c->private_network || c->network_namespace_path) {
5200                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5201                         return -errno;
5202         }
5203
5204         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5205         if (r < 0)
5206                 return r;
5207
5208         /* Avoid cleanup */
5209         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5210         return 1;
5211 }
5212
5213 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5214         ExecRuntime *rt;
5215         int r;
5216
5217         assert(m);
5218         assert(id);
5219         assert(ret);
5220
5221         rt = hashmap_get(m->exec_runtime_by_id, id);
5222         if (rt)
5223                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5224                 goto ref;
5225
5226         if (!create)
5227                 return 0;
5228
5229         /* If not found, then create a new object. */
5230         r = exec_runtime_make(m, c, id, &rt);
5231         if (r <= 0)
5232                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5233                 return r;
5234
5235 ref:
5236         /* increment reference counter. */
5237         rt->n_ref++;
5238         *ret = rt;
5239         return 1;
5240 }
5241
5242 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5243         if (!rt)
5244                 return NULL;
5245
5246         assert(rt->n_ref > 0);
5247
5248         rt->n_ref--;
5249         if (rt->n_ref > 0)
5250                 return NULL;
5251
5252         return exec_runtime_free(rt, destroy);
5253 }
5254
5255 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5256         ExecRuntime *rt;
5257         Iterator i;
5258
5259         assert(m);
5260         assert(f);
5261         assert(fds);
5262
5263         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5264                 fprintf(f, "exec-runtime=%s", rt->id);
5265
5266                 if (rt->tmp_dir)
5267                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5268
5269                 if (rt->var_tmp_dir)
5270                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5271
5272                 if (rt->netns_storage_socket[0] >= 0) {
5273                         int copy;
5274
5275                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5276                         if (copy < 0)
5277                                 return copy;
5278
5279                         fprintf(f, " netns-socket-0=%i", copy);
5280                 }
5281
5282                 if (rt->netns_storage_socket[1] >= 0) {
5283                         int copy;
5284
5285                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5286                         if (copy < 0)
5287                                 return copy;
5288
5289                         fprintf(f, " netns-socket-1=%i", copy);
5290                 }
5291
5292                 fputc('\n', f);
5293         }
5294
5295         return 0;
5296 }
5297
5298 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5299         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5300         ExecRuntime *rt;
5301         int r;
5302
5303         /* This is for the migration from old (v237 or earlier) deserialization text.
5304          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5305          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5306          * so or not from the serialized text, then we always creates a new object owned by this. */
5307
5308         assert(u);
5309         assert(key);
5310         assert(value);
5311
5312         /* Manager manages ExecRuntime objects by the unit id.
5313          * So, we omit the serialized text when the unit does not have id (yet?)... */
5314         if (isempty(u->id)) {
5315                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5316                 return 0;
5317         }
5318
5319         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5320         if (r < 0) {
5321                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5322                 return 0;
5323         }
5324
5325         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5326         if (!rt) {
5327                 r = exec_runtime_allocate(&rt_create);
5328                 if (r < 0)
5329                         return log_oom();
5330
5331                 rt_create->id = strdup(u->id);
5332                 if (!rt_create->id)
5333                         return log_oom();
5334
5335                 rt = rt_create;
5336         }
5337
5338         if (streq(key, "tmp-dir")) {
5339                 char *copy;
5340
5341                 copy = strdup(value);
5342                 if (!copy)
5343                         return log_oom();
5344
5345                 free_and_replace(rt->tmp_dir, copy);
5346
5347         } else if (streq(key, "var-tmp-dir")) {
5348                 char *copy;
5349
5350                 copy = strdup(value);
5351                 if (!copy)
5352                         return log_oom();
5353
5354                 free_and_replace(rt->var_tmp_dir, copy);
5355
5356         } else if (streq(key, "netns-socket-0")) {
5357                 int fd;
5358
5359                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5360                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5361                         return 0;
5362                 }
5363
5364                 safe_close(rt->netns_storage_socket[0]);
5365                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5366
5367         } else if (streq(key, "netns-socket-1")) {
5368                 int fd;
5369
5370                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5371                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5372                         return 0;
5373                 }
5374
5375                 safe_close(rt->netns_storage_socket[1]);
5376                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5377         } else
5378                 return 0;
5379
5380         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5381         if (rt_create) {
5382                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5383                 if (r < 0) {
5384                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5385                         return 0;
5386                 }
5387
5388                 rt_create->manager = u->manager;
5389
5390                 /* Avoid cleanup */
5391                 rt_create = NULL;
5392         }
5393
5394         return 1;
5395 }
5396
5397 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5398         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5399         int r, fd0 = -1, fd1 = -1;
5400         const char *p, *v = value;
5401         size_t n;
5402
5403         assert(m);
5404         assert(value);
5405         assert(fds);
5406
5407         n = strcspn(v, " ");
5408         id = strndupa(v, n);
5409         if (v[n] != ' ')
5410                 goto finalize;
5411         p = v + n + 1;
5412
5413         v = startswith(p, "tmp-dir=");
5414         if (v) {
5415                 n = strcspn(v, " ");
5416                 tmp_dir = strndupa(v, n);
5417                 if (v[n] != ' ')
5418                         goto finalize;
5419                 p = v + n + 1;
5420         }
5421
5422         v = startswith(p, "var-tmp-dir=");
5423         if (v) {
5424                 n = strcspn(v, " ");
5425                 var_tmp_dir = strndupa(v, n);
5426                 if (v[n] != ' ')
5427                         goto finalize;
5428                 p = v + n + 1;
5429         }
5430
5431         v = startswith(p, "netns-socket-0=");
5432         if (v) {
5433                 char *buf;
5434
5435                 n = strcspn(v, " ");
5436                 buf = strndupa(v, n);
5437                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5438                         log_debug("Unable to process exec-runtime netns fd specification.");
5439                         return;
5440                 }
5441                 fd0 = fdset_remove(fds, fd0);
5442                 if (v[n] != ' ')
5443                         goto finalize;
5444                 p = v + n + 1;
5445         }
5446
5447         v = startswith(p, "netns-socket-1=");
5448         if (v) {
5449                 char *buf;
5450
5451                 n = strcspn(v, " ");
5452                 buf = strndupa(v, n);
5453                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5454                         log_debug("Unable to process exec-runtime netns fd specification.");
5455                         return;
5456                 }
5457                 fd1 = fdset_remove(fds, fd1);
5458         }
5459
5460 finalize:
5461
5462         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5463         if (r < 0)
5464                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5465 }
5466
5467 void exec_runtime_vacuum(Manager *m) {
5468         ExecRuntime *rt;
5469         Iterator i;
5470
5471         assert(m);
5472
5473         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5474
5475         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5476                 if (rt->n_ref > 0)
5477                         continue;
5478
5479                 (void) exec_runtime_free(rt, false);
5480         }
5481 }
5482
5483 void exec_params_clear(ExecParameters *p) {
5484         if (!p)
5485                 return;
5486
5487         strv_free(p->environment);
5488 }
5489
5490 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5491         [EXEC_INPUT_NULL] = "null",
5492         [EXEC_INPUT_TTY] = "tty",
5493         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5494         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5495         [EXEC_INPUT_SOCKET] = "socket",
5496         [EXEC_INPUT_NAMED_FD] = "fd",
5497         [EXEC_INPUT_DATA] = "data",
5498         [EXEC_INPUT_FILE] = "file",
5499 };
5500
5501 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5502
5503 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5504         [EXEC_OUTPUT_INHERIT] = "inherit",
5505         [EXEC_OUTPUT_NULL] = "null",
5506         [EXEC_OUTPUT_TTY] = "tty",
5507         [EXEC_OUTPUT_SYSLOG] = "syslog",
5508         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5509         [EXEC_OUTPUT_KMSG] = "kmsg",
5510         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5511         [EXEC_OUTPUT_JOURNAL] = "journal",
5512         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5513         [EXEC_OUTPUT_SOCKET] = "socket",
5514         [EXEC_OUTPUT_NAMED_FD] = "fd",
5515         [EXEC_OUTPUT_FILE] = "file",
5516         [EXEC_OUTPUT_FILE_APPEND] = "append",
5517 };
5518
5519 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5520
5521 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5522         [EXEC_UTMP_INIT] = "init",
5523         [EXEC_UTMP_LOGIN] = "login",
5524         [EXEC_UTMP_USER] = "user",
5525 };
5526
5527 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5528
5529 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5530         [EXEC_PRESERVE_NO] = "no",
5531         [EXEC_PRESERVE_YES] = "yes",
5532         [EXEC_PRESERVE_RESTART] = "restart",
5533 };
5534
5535 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5536
5537 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
5538 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5539         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5540         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5541         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5542         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5543         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5544 };
5545
5546 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5547
5548 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5549  * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5550  * directories, specifically .timer units with their timestamp touch file. */
5551 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5552         [EXEC_DIRECTORY_RUNTIME] = "runtime",
5553         [EXEC_DIRECTORY_STATE] = "state",
5554         [EXEC_DIRECTORY_CACHE] = "cache",
5555         [EXEC_DIRECTORY_LOGS] = "logs",
5556         [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5557 };
5558
5559 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5560
5561 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5562  * the service payload in. */
5563 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5564         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5565         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5566         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5567         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5568         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5569 };
5570
5571 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5572
5573 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5574         [EXEC_KEYRING_INHERIT] = "inherit",
5575         [EXEC_KEYRING_PRIVATE] = "private",
5576         [EXEC_KEYRING_SHARED] = "shared",
5577 };
5578
5579 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);