src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "execute.h"
  57 #include "exit-status.h"
  58 #include "fd-util.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "memory-util.h"
  69 #include "missing.h"
  70 #include "mkdir.h"
  71 #include "namespace.h"
  72 #include "parse-util.h"
  73 #include "path-util.h"
  74 #include "process-util.h"
  75 #include "rlimit-util.h"
  76 #include "rm-rf.h"
  77 #if HAVE_SECCOMP
  78 #include "seccomp-util.h"
  79 #endif
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "utmp-wtmp.h"
  96
  97 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  98 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  99
 100 /* This assumes there is a 'tty' group */
 101 #define TTY_MODE 0620
 102
 103 #define SNDBUF_SIZE (8*1024*1024)
 104
 105 static int shift_fds(int fds[], size_t n_fds) {
 106         int start, restart_from;
 107
 108         if (n_fds <= 0)
 109                 return 0;
 110
 111         /* Modifies the fds array! (sorts it) */
 112
 113         assert(fds);
 114
 115         start = 0;
 116         for (;;) {
 117                 int i;
 118
 119                 restart_from = -1;
 120
 121                 for (i = start; i < (int) n_fds; i++) {
 122                         int nfd;
 123
 124                         /* Already at right index? */
 125                         if (fds[i] == i+3)
 126                                 continue;
 127
 128                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 129                         if (nfd < 0)
 130                                 return -errno;
 131
 132                         safe_close(fds[i]);
 133                         fds[i] = nfd;
 134
 135                         /* Hmm, the fd we wanted isn't free? Then
 136                          * let's remember that and try again from here */
 137                         if (nfd != i+3 && restart_from < 0)
 138                                 restart_from = i;
 139                 }
 140
 141                 if (restart_from < 0)
 142                         break;
 143
 144                 start = restart_from;
 145         }
 146
 147         return 0;
 148 }
 149
 150 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 151         size_t i, n_fds;
 152         int r;
 153
 154         n_fds = n_socket_fds + n_storage_fds;
 155         if (n_fds <= 0)
 156                 return 0;
 157
 158         assert(fds);
 159
 160         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 161          * O_NONBLOCK only applies to socket activation though. */
 162
 163         for (i = 0; i < n_fds; i++) {
 164
 165                 if (i < n_socket_fds) {
 166                         r = fd_nonblock(fds[i], nonblock);
 167                         if (r < 0)
 168                                 return r;
 169                 }
 170
 171                 /* We unconditionally drop FD_CLOEXEC from the fds,
 172                  * since after all we want to pass these fds to our
 173                  * children */
 174
 175                 r = fd_cloexec(fds[i], false);
 176                 if (r < 0)
 177                         return r;
 178         }
 179
 180         return 0;
 181 }
 182
 183 static const char *exec_context_tty_path(const ExecContext *context) {
 184         assert(context);
 185
 186         if (context->stdio_as_fds)
 187                 return NULL;
 188
 189         if (context->tty_path)
 190                 return context->tty_path;
 191
 192         return "/dev/console";
 193 }
 194
 195 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 196         const char *path;
 197
 198         assert(context);
 199
 200         path = exec_context_tty_path(context);
 201
 202         if (context->tty_vhangup) {
 203                 if (p && p->stdin_fd >= 0)
 204                         (void) terminal_vhangup_fd(p->stdin_fd);
 205                 else if (path)
 206                         (void) terminal_vhangup(path);
 207         }
 208
 209         if (context->tty_reset) {
 210                 if (p && p->stdin_fd >= 0)
 211                         (void) reset_terminal_fd(p->stdin_fd, true);
 212                 else if (path)
 213                         (void) reset_terminal(path);
 214         }
 215
 216         if (context->tty_vt_disallocate && path)
 217                 (void) vt_disallocate(path);
 218 }
 219
 220 static bool is_terminal_input(ExecInput i) {
 221         return IN_SET(i,
 222                       EXEC_INPUT_TTY,
 223                       EXEC_INPUT_TTY_FORCE,
 224                       EXEC_INPUT_TTY_FAIL);
 225 }
 226
 227 static bool is_terminal_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_TTY,
 230                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 231                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 232                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 233 }
 234
 235 static bool is_syslog_output(ExecOutput o) {
 236         return IN_SET(o,
 237                       EXEC_OUTPUT_SYSLOG,
 238                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 239 }
 240
 241 static bool is_kmsg_output(ExecOutput o) {
 242         return IN_SET(o,
 243                       EXEC_OUTPUT_KMSG,
 244                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 245 }
 246
 247 static bool exec_context_needs_term(const ExecContext *c) {
 248         assert(c);
 249
 250         /* Return true if the execution context suggests we should set $TERM to something useful. */
 251
 252         if (is_terminal_input(c->std_input))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_output))
 256                 return true;
 257
 258         if (is_terminal_output(c->std_error))
 259                 return true;
 260
 261         return !!c->tty_path;
 262 }
 263
 264 static int open_null_as(int flags, int nfd) {
 265         int fd;
 266
 267         assert(nfd >= 0);
 268
 269         fd = open("/dev/null", flags|O_NOCTTY);
 270         if (fd < 0)
 271                 return -errno;
 272
 273         return move_fd(fd, nfd, false);
 274 }
 275
 276 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 277         static const union sockaddr_union sa = {
 278                 .un.sun_family = AF_UNIX,
 279                 .un.sun_path = "/run/systemd/journal/stdout",
 280         };
 281         uid_t olduid = UID_INVALID;
 282         gid_t oldgid = GID_INVALID;
 283         int r;
 284
 285         if (gid_is_valid(gid)) {
 286                 oldgid = getgid();
 287
 288                 if (setegid(gid) < 0)
 289                         return -errno;
 290         }
 291
 292         if (uid_is_valid(uid)) {
 293                 olduid = getuid();
 294
 295                 if (seteuid(uid) < 0) {
 296                         r = -errno;
 297                         goto restore_gid;
 298                 }
 299         }
 300
 301         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 302
 303         /* If we fail to restore the uid or gid, things will likely
 304            fail later on. This should only happen if an LSM interferes. */
 305
 306         if (uid_is_valid(uid))
 307                 (void) seteuid(olduid);
 308
 309  restore_gid:
 310         if (gid_is_valid(gid))
 311                 (void) setegid(oldgid);
 312
 313         return r;
 314 }
 315
 316 static int connect_logger_as(
 317                 const Unit *unit,
 318                 const ExecContext *context,
 319                 const ExecParameters *params,
 320                 ExecOutput output,
 321                 const char *ident,
 322                 int nfd,
 323                 uid_t uid,
 324                 gid_t gid) {
 325
 326         _cleanup_close_ int fd = -1;
 327         int r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0)
 344                 return -errno;
 345
 346         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 347
 348         if (dprintf(fd,
 349                 "%s\n"
 350                 "%s\n"
 351                 "%i\n"
 352                 "%i\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n",
 356                 context->syslog_identifier ?: ident,
 357                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 358                 context->syslog_priority,
 359                 !!context->syslog_level_prefix,
 360                 is_syslog_output(output),
 361                 is_kmsg_output(output),
 362                 is_terminal_output(output)) < 0)
 363                 return -errno;
 364
 365         return move_fd(TAKE_FD(fd), nfd, false);
 366 }
 367
 368 static int open_terminal_as(const char *path, int flags, int nfd) {
 369         int fd;
 370
 371         assert(path);
 372         assert(nfd >= 0);
 373
 374         fd = open_terminal(path, flags | O_NOCTTY);
 375         if (fd < 0)
 376                 return fd;
 377
 378         return move_fd(fd, nfd, false);
 379 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa = {};
 383         _cleanup_close_ int fd = -1;
 384         int r, salen;
 385
 386         assert(path);
 387
 388         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 389                 flags |= O_CREAT;
 390
 391         fd = open(path, flags|O_NOCTTY, mode);
 392         if (fd >= 0)
 393                 return TAKE_FD(fd);
 394
 395         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 396                 return -errno;
 397         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 398                 return -ENXIO;
 399
 400         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 401
 402         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 403         if (fd < 0)
 404                 return -errno;
 405
 406         salen = sockaddr_un_set_path(&sa.un, path);
 407         if (salen < 0)
 408                 return salen;
 409
 410         if (connect(fd, &sa.sa, salen) < 0)
 411                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 412                                                            * indication that his wasn't an AF_UNIX socket after all */
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 return TAKE_FD(fd);
 420         if (r < 0)
 421                 return -errno;
 422
 423         return TAKE_FD(fd);
 424 }
 425
 426 static int fixup_input(
 427                 const ExecContext *context,
 428                 int socket_fd,
 429                 bool apply_tty_stdin) {
 430
 431         ExecInput std_input;
 432
 433         assert(context);
 434
 435         std_input = context->std_input;
 436
 437         if (is_terminal_input(std_input) && !apply_tty_stdin)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         return std_input;
 447 }
 448
 449 static int fixup_output(ExecOutput std_output, int socket_fd) {
 450
 451         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 452                 return EXEC_OUTPUT_INHERIT;
 453
 454         return std_output;
 455 }
 456
 457 static int setup_input(
 458                 const ExecContext *context,
 459                 const ExecParameters *params,
 460                 int socket_fd,
 461                 int named_iofds[3]) {
 462
 463         ExecInput i;
 464
 465         assert(context);
 466         assert(params);
 467
 468         if (params->stdin_fd >= 0) {
 469                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 470                         return -errno;
 471
 472                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 473                 if (isatty(STDIN_FILENO)) {
 474                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 475                         (void) reset_terminal_fd(STDIN_FILENO, true);
 476                 }
 477
 478                 return STDIN_FILENO;
 479         }
 480
 481         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 482
 483         switch (i) {
 484
 485         case EXEC_INPUT_NULL:
 486                 return open_null_as(O_RDONLY, STDIN_FILENO);
 487
 488         case EXEC_INPUT_TTY:
 489         case EXEC_INPUT_TTY_FORCE:
 490         case EXEC_INPUT_TTY_FAIL: {
 491                 int fd;
 492
 493                 fd = acquire_terminal(exec_context_tty_path(context),
 494                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 495                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 496                                                                   ACQUIRE_TERMINAL_WAIT,
 497                                       USEC_INFINITY);
 498                 if (fd < 0)
 499                         return fd;
 500
 501                 return move_fd(fd, STDIN_FILENO, false);
 502         }
 503
 504         case EXEC_INPUT_SOCKET:
 505                 assert(socket_fd >= 0);
 506
 507                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 508
 509         case EXEC_INPUT_NAMED_FD:
 510                 assert(named_iofds[STDIN_FILENO] >= 0);
 511
 512                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 513                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 514
 515         case EXEC_INPUT_DATA: {
 516                 int fd;
 517
 518                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 519                 if (fd < 0)
 520                         return fd;
 521
 522                 return move_fd(fd, STDIN_FILENO, false);
 523         }
 524
 525         case EXEC_INPUT_FILE: {
 526                 bool rw;
 527                 int fd;
 528
 529                 assert(context->stdio_file[STDIN_FILENO]);
 530
 531                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 532                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 533
 534                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 535                 if (fd < 0)
 536                         return fd;
 537
 538                 return move_fd(fd, STDIN_FILENO, false);
 539         }
 540
 541         default:
 542                 assert_not_reached("Unknown input type");
 543         }
 544 }
 545
 546 static bool can_inherit_stderr_from_stdout(
 547                 const ExecContext *context,
 548                 ExecOutput o,
 549                 ExecOutput e) {
 550
 551         assert(context);
 552
 553         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 554          * stderr fd */
 555
 556         if (e == EXEC_OUTPUT_INHERIT)
 557                 return true;
 558         if (e != o)
 559                 return false;
 560
 561         if (e == EXEC_OUTPUT_NAMED_FD)
 562                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 563
 564         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 565                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 566
 567         return true;
 568 }
 569
 570 static int setup_output(
 571                 const Unit *unit,
 572                 const ExecContext *context,
 573                 const ExecParameters *params,
 574                 int fileno,
 575                 int socket_fd,
 576                 int named_iofds[3],
 577                 const char *ident,
 578                 uid_t uid,
 579                 gid_t gid,
 580                 dev_t *journal_stream_dev,
 581                 ino_t *journal_stream_ino) {
 582
 583         ExecOutput o;
 584         ExecInput i;
 585         int r;
 586
 587         assert(unit);
 588         assert(context);
 589         assert(params);
 590         assert(ident);
 591         assert(journal_stream_dev);
 592         assert(journal_stream_ino);
 593
 594         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 595
 596                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 597                         return -errno;
 598
 599                 return STDOUT_FILENO;
 600         }
 601
 602         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 603                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 604                         return -errno;
 605
 606                 return STDERR_FILENO;
 607         }
 608
 609         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 610         o = fixup_output(context->std_output, socket_fd);
 611
 612         if (fileno == STDERR_FILENO) {
 613                 ExecOutput e;
 614                 e = fixup_output(context->std_error, socket_fd);
 615
 616                 /* This expects the input and output are already set up */
 617
 618                 /* Don't change the stderr file descriptor if we inherit all
 619                  * the way and are not on a tty */
 620                 if (e == EXEC_OUTPUT_INHERIT &&
 621                     o == EXEC_OUTPUT_INHERIT &&
 622                     i == EXEC_INPUT_NULL &&
 623                     !is_terminal_input(context->std_input) &&
 624                     getppid () != 1)
 625                         return fileno;
 626
 627                 /* Duplicate from stdout if possible */
 628                 if (can_inherit_stderr_from_stdout(context, o, e))
 629                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 630
 631                 o = e;
 632
 633         } else if (o == EXEC_OUTPUT_INHERIT) {
 634                 /* If input got downgraded, inherit the original value */
 635                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 636                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 637
 638                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 639                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 640                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 641
 642                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 643                 if (getppid() != 1)
 644                         return fileno;
 645
 646                 /* We need to open /dev/null here anew, to get the right access mode. */
 647                 return open_null_as(O_WRONLY, fileno);
 648         }
 649
 650         switch (o) {
 651
 652         case EXEC_OUTPUT_NULL:
 653                 return open_null_as(O_WRONLY, fileno);
 654
 655         case EXEC_OUTPUT_TTY:
 656                 if (is_terminal_input(i))
 657                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 658
 659                 /* We don't reset the terminal if this is just about output */
 660                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 661
 662         case EXEC_OUTPUT_SYSLOG:
 663         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 664         case EXEC_OUTPUT_KMSG:
 665         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 666         case EXEC_OUTPUT_JOURNAL:
 667         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 668                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 669                 if (r < 0) {
 670                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 671                         r = open_null_as(O_WRONLY, fileno);
 672                 } else {
 673                         struct stat st;
 674
 675                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 676                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 677                          * services to detect whether they are connected to the journal or not.
 678                          *
 679                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 680                          * about STDERR as that's usually the best way to do logging. */
 681
 682                         if (fstat(fileno, &st) >= 0 &&
 683                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 684                                 *journal_stream_dev = st.st_dev;
 685                                 *journal_stream_ino = st.st_ino;
 686                         }
 687                 }
 688                 return r;
 689
 690         case EXEC_OUTPUT_SOCKET:
 691                 assert(socket_fd >= 0);
 692
 693                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 694
 695         case EXEC_OUTPUT_NAMED_FD:
 696                 assert(named_iofds[fileno] >= 0);
 697
 698                 (void) fd_nonblock(named_iofds[fileno], false);
 699                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 700
 701         case EXEC_OUTPUT_FILE:
 702         case EXEC_OUTPUT_FILE_APPEND: {
 703                 bool rw;
 704                 int fd, flags;
 705
 706                 assert(context->stdio_file[fileno]);
 707
 708                 rw = context->std_input == EXEC_INPUT_FILE &&
 709                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 710
 711                 if (rw)
 712                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 713
 714                 flags = O_WRONLY;
 715                 if (o == EXEC_OUTPUT_FILE_APPEND)
 716                         flags |= O_APPEND;
 717
 718                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 719                 if (fd < 0)
 720                         return fd;
 721
 722                 return move_fd(fd, fileno, 0);
 723         }
 724
 725         default:
 726                 assert_not_reached("Unknown error type");
 727         }
 728 }
 729
 730 static int chown_terminal(int fd, uid_t uid) {
 731         int r;
 732
 733         assert(fd >= 0);
 734
 735         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 736         if (isatty(fd) < 1) {
 737                 if (IN_SET(errno, EINVAL, ENOTTY))
 738                         return 0; /* not a tty */
 739
 740                 return -errno;
 741         }
 742
 743         /* This might fail. What matters are the results. */
 744         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 745         if (r < 0)
 746                 return r;
 747
 748         return 1;
 749 }
 750
 751 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 752         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 753         int r;
 754
 755         assert(_saved_stdin);
 756         assert(_saved_stdout);
 757
 758         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 759         if (saved_stdin < 0)
 760                 return -errno;
 761
 762         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 763         if (saved_stdout < 0)
 764                 return -errno;
 765
 766         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 767         if (fd < 0)
 768                 return fd;
 769
 770         r = chown_terminal(fd, getuid());
 771         if (r < 0)
 772                 return r;
 773
 774         r = reset_terminal_fd(fd, true);
 775         if (r < 0)
 776                 return r;
 777
 778         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 779         fd = -1;
 780         if (r < 0)
 781                 return r;
 782
 783         *_saved_stdin = saved_stdin;
 784         *_saved_stdout = saved_stdout;
 785
 786         saved_stdin = saved_stdout = -1;
 787
 788         return 0;
 789 }
 790
 791 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 792         assert(err < 0);
 793
 794         if (err == -ETIMEDOUT)
 795                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 796         else {
 797                 errno = -err;
 798                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 799         }
 800 }
 801
 802 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 803         _cleanup_close_ int fd = -1;
 804
 805         assert(vc);
 806
 807         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 808         if (fd < 0)
 809                 return;
 810
 811         write_confirm_error_fd(err, fd, u);
 812 }
 813
 814 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 815         int r = 0;
 816
 817         assert(saved_stdin);
 818         assert(saved_stdout);
 819
 820         release_terminal();
 821
 822         if (*saved_stdin >= 0)
 823                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 824                         r = -errno;
 825
 826         if (*saved_stdout >= 0)
 827                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 828                         r = -errno;
 829
 830         *saved_stdin = safe_close(*saved_stdin);
 831         *saved_stdout = safe_close(*saved_stdout);
 832
 833         return r;
 834 }
 835
 836 enum {
 837         CONFIRM_PRETEND_FAILURE = -1,
 838         CONFIRM_PRETEND_SUCCESS =  0,
 839         CONFIRM_EXECUTE = 1,
 840 };
 841
 842 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 843         int saved_stdout = -1, saved_stdin = -1, r;
 844         _cleanup_free_ char *e = NULL;
 845         char c;
 846
 847         /* For any internal errors, assume a positive response. */
 848         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 849         if (r < 0) {
 850                 write_confirm_error(r, vc, u);
 851                 return CONFIRM_EXECUTE;
 852         }
 853
 854         /* confirm_spawn might have been disabled while we were sleeping. */
 855         if (manager_is_confirm_spawn_disabled(u->manager)) {
 856                 r = 1;
 857                 goto restore_stdio;
 858         }
 859
 860         e = ellipsize(cmdline, 60, 100);
 861         if (!e) {
 862                 log_oom();
 863                 r = CONFIRM_EXECUTE;
 864                 goto restore_stdio;
 865         }
 866
 867         for (;;) {
 868                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 869                 if (r < 0) {
 870                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 871                         r = CONFIRM_EXECUTE;
 872                         goto restore_stdio;
 873                 }
 874
 875                 switch (c) {
 876                 case 'c':
 877                         printf("Resuming normal execution.\n");
 878                         manager_disable_confirm_spawn();
 879                         r = 1;
 880                         break;
 881                 case 'D':
 882                         unit_dump(u, stdout, "  ");
 883                         continue; /* ask again */
 884                 case 'f':
 885                         printf("Failing execution.\n");
 886                         r = CONFIRM_PRETEND_FAILURE;
 887                         break;
 888                 case 'h':
 889                         printf("  c - continue, proceed without asking anymore\n"
 890                                "  D - dump, show the state of the unit\n"
 891                                "  f - fail, don't execute the command and pretend it failed\n"
 892                                "  h - help\n"
 893                                "  i - info, show a short summary of the unit\n"
 894                                "  j - jobs, show jobs that are in progress\n"
 895                                "  s - skip, don't execute the command and pretend it succeeded\n"
 896                                "  y - yes, execute the command\n");
 897                         continue; /* ask again */
 898                 case 'i':
 899                         printf("  Description: %s\n"
 900                                "  Unit:        %s\n"
 901                                "  Command:     %s\n",
 902                                u->id, u->description, cmdline);
 903                         continue; /* ask again */
 904                 case 'j':
 905                         manager_dump_jobs(u->manager, stdout, "  ");
 906                         continue; /* ask again */
 907                 case 'n':
 908                         /* 'n' was removed in favor of 'f'. */
 909                         printf("Didn't understand 'n', did you mean 'f'?\n");
 910                         continue; /* ask again */
 911                 case 's':
 912                         printf("Skipping execution.\n");
 913                         r = CONFIRM_PRETEND_SUCCESS;
 914                         break;
 915                 case 'y':
 916                         r = CONFIRM_EXECUTE;
 917                         break;
 918                 default:
 919                         assert_not_reached("Unhandled choice");
 920                 }
 921                 break;
 922         }
 923
 924 restore_stdio:
 925         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 926         return r;
 927 }
 928
 929 static int get_fixed_user(const ExecContext *c, const char **user,
 930                           uid_t *uid, gid_t *gid,
 931                           const char **home, const char **shell) {
 932         int r;
 933         const char *name;
 934
 935         assert(c);
 936
 937         if (!c->user)
 938                 return 0;
 939
 940         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 941          * (i.e. are "/" or "/bin/nologin"). */
 942
 943         name = c->user;
 944         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 945         if (r < 0)
 946                 return r;
 947
 948         *user = name;
 949         return 0;
 950 }
 951
 952 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 953         int r;
 954         const char *name;
 955
 956         assert(c);
 957
 958         if (!c->group)
 959                 return 0;
 960
 961         name = c->group;
 962         r = get_group_creds(&name, gid, 0);
 963         if (r < 0)
 964                 return r;
 965
 966         *group = name;
 967         return 0;
 968 }
 969
 970 static int get_supplementary_groups(const ExecContext *c, const char *user,
 971                                     const char *group, gid_t gid,
 972                                     gid_t **supplementary_gids, int *ngids) {
 973         char **i;
 974         int r, k = 0;
 975         int ngroups_max;
 976         bool keep_groups = false;
 977         gid_t *groups = NULL;
 978         _cleanup_free_ gid_t *l_gids = NULL;
 979
 980         assert(c);
 981
 982         /*
 983          * If user is given, then lookup GID and supplementary groups list.
 984          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 985          * here and as early as possible so we keep the list of supplementary
 986          * groups of the caller.
 987          */
 988         if (user && gid_is_valid(gid) && gid != 0) {
 989                 /* First step, initialize groups from /etc/groups */
 990                 if (initgroups(user, gid) < 0)
 991                         return -errno;
 992
 993                 keep_groups = true;
 994         }
 995
 996         if (strv_isempty(c->supplementary_groups))
 997                 return 0;
 998
 999         /*
1000          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1001          * be positive, otherwise fail.
1002          */
1003         errno = 0;
1004         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1005         if (ngroups_max <= 0) {
1006                 if (errno > 0)
1007                         return -errno;
1008                 else
1009                         return -EOPNOTSUPP; /* For all other values */
1010         }
1011
1012         l_gids = new(gid_t, ngroups_max);
1013         if (!l_gids)
1014                 return -ENOMEM;
1015
1016         if (keep_groups) {
1017                 /*
1018                  * Lookup the list of groups that the user belongs to, we
1019                  * avoid NSS lookups here too for gid=0.
1020                  */
1021                 k = ngroups_max;
1022                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1023                         return -EINVAL;
1024         } else
1025                 k = 0;
1026
1027         STRV_FOREACH(i, c->supplementary_groups) {
1028                 const char *g;
1029
1030                 if (k >= ngroups_max)
1031                         return -E2BIG;
1032
1033                 g = *i;
1034                 r = get_group_creds(&g, l_gids+k, 0);
1035                 if (r < 0)
1036                         return r;
1037
1038                 k++;
1039         }
1040
1041         /*
1042          * Sets ngids to zero to drop all supplementary groups, happens
1043          * when we are under root and SupplementaryGroups= is empty.
1044          */
1045         if (k == 0) {
1046                 *ngids = 0;
1047                 return 0;
1048         }
1049
1050         /* Otherwise get the final list of supplementary groups */
1051         groups = memdup(l_gids, sizeof(gid_t) * k);
1052         if (!groups)
1053                 return -ENOMEM;
1054
1055         *supplementary_gids = groups;
1056         *ngids = k;
1057
1058         groups = NULL;
1059
1060         return 0;
1061 }
1062
/* Applies the supplementary groups (if any were collected) and then, if gid is valid, switches the
 * real, effective and saved GID to it. Returns 0 on success, a negative error code otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if SupplementaryGroups= yielded something */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1081
1082 static int enforce_user(const ExecContext *context, uid_t uid) {
1083         assert(context);
1084
1085         if (!uid_is_valid(uid))
1086                 return 0;
1087
1088         /* Sets (but doesn't look up) the uid and make sure we keep the
1089          * capabilities while doing so. */
1090
1091         if (context->capability_ambient_set != 0) {
1092
1093                 /* First step: If we need to keep capabilities but
1094                  * drop privileges we need to make sure we keep our
1095                  * caps, while we drop privileges. */
1096                 if (uid != 0) {
1097                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1098
1099                         if (prctl(PR_GET_SECUREBITS) != sb)
1100                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1101                                         return -errno;
1102                 }
1103         }
1104
1105         /* Second step: actually set the uids */
1106         if (setresuid(uid, uid, uid) < 0)
1107                 return -errno;
1108
1109         /* At this point we should have all necessary capabilities but
1110            are otherwise a normal user. However, the caps might got
1111            corrupted due to the setresuid() so we need clean them up
1112            later. This is done outside of this call. */
1113
1114         return 0;
1115 }
1116
1117 #if HAVE_PAM
1118
1119 static int null_conv(
1120                 int num_msg,
1121                 const struct pam_message **msg,
1122                 struct pam_response **resp,
1123                 void *appdata_ptr) {
1124
1125         /* We don't support conversations */
1126
1127         return PAM_CONV_ERR;
1128 }
1129
1130 #endif
1131
/* Opens a PAM session for the service that is about to be executed.
 *
 * name     PAM service name passed to pam_start()
 * user     user name the session is opened for
 * uid/gid  credentials the helper child drops to before waiting
 * tty      optional TTY path to register as PAM_TTY; if NULL, the TTY of stdin is used if there is one
 * env      environment block; its entries are exported to PAM, and on success it is replaced by the
 *          environment list PAM hands back
 * fds      file descriptors (n_fds entries) that are closed in the helper child
 *
 * After the session is opened a helper process "(sd-pam)" is forked off. It waits for SIGTERM (delivered
 * through the parent-death signal) and closes the PAM session once the parent is gone.
 *
 * Without PAM support compiled in this returns 0 and does nothing. Otherwise returns the result of
 * strv_free_and_replace() on success, -EPERM if a PAM call failed, or the negative error code of the
 * failing non-PAM operation. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure through a positive error number as its
                                 * return value; it neither returns a negative value nor sets errno. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1347
/* Renames the current process to "(<name>)", where <name> is derived from the last component of the
 * given path. Only the final 8 characters of that component are used, so that the result (including
 * the parentheses) fits into 10 characters. If the component is empty the name "(...)" is used. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *tail;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        /* Assemble "(" + tail + ")" + NUL */
        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1378
1379 static bool context_has_address_families(const ExecContext *c) {
1380         assert(c);
1381
1382         return c->address_families_whitelist ||
1383                 !set_isempty(c->address_families);
1384 }
1385
1386 static bool context_has_syscall_filters(const ExecContext *c) {
1387         assert(c);
1388
1389         return c->syscall_whitelist ||
1390                 !hashmap_isempty(c->syscall_filter);
1391 }
1392
/* Decides whether the no-new-privileges flag is needed for this execution context: either because it
 * was requested explicitly, or because we lack CAP_SYS_ADMIN and one of the seccomp-based settings is
 * enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c) ||
            c->memory_deny_write_execute ||
            c->restrict_realtime ||
            c->restrict_suid_sgid)
                return true;

        if (exec_context_restrict_namespaces_set(c) ||
            c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c) ||
            !set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality || c->protect_hostname;
}
1416
1417 #if HAVE_SECCOMP
1418
1419 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1420
1421         if (is_seccomp_available())
1422                 return false;
1423
1424         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1425         return true;
1426 }
1427
1428 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1429         uint32_t negative_action, default_action, action;
1430         int r;
1431
1432         assert(u);
1433         assert(c);
1434
1435         if (!context_has_syscall_filters(c))
1436                 return 0;
1437
1438         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1439                 return 0;
1440
1441         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1442
1443         if (c->syscall_whitelist) {
1444                 default_action = negative_action;
1445                 action = SCMP_ACT_ALLOW;
1446         } else {
1447                 default_action = SCMP_ACT_ALLOW;
1448                 action = negative_action;
1449         }
1450
1451         if (needs_ambient_hack) {
1452                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1453                 if (r < 0)
1454                         return r;
1455         }
1456
1457         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1458 }
1459
1460 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1461         assert(u);
1462         assert(c);
1463
1464         if (set_isempty(c->syscall_archs))
1465                 return 0;
1466
1467         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1468                 return 0;
1469
1470         return seccomp_restrict_archs(c->syscall_archs);
1471 }
1472
1473 static int apply_address_families(const Unit* u, const ExecContext *c) {
1474         assert(u);
1475         assert(c);
1476
1477         if (!context_has_address_families(c))
1478                 return 0;
1479
1480         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1481                 return 0;
1482
1483         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1484 }
1485
1486 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1487         assert(u);
1488         assert(c);
1489
1490         if (!c->memory_deny_write_execute)
1491                 return 0;
1492
1493         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1494                 return 0;
1495
1496         return seccomp_memory_deny_write_execute();
1497 }
1498
1499 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1500         assert(u);
1501         assert(c);
1502
1503         if (!c->restrict_realtime)
1504                 return 0;
1505
1506         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1507                 return 0;
1508
1509         return seccomp_restrict_realtime();
1510 }
1511
1512 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1513         assert(u);
1514         assert(c);
1515
1516         if (!c->restrict_suid_sgid)
1517                 return 0;
1518
1519         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1520                 return 0;
1521
1522         return seccomp_restrict_suid_sgid();
1523 }
1524
1525 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1526         assert(u);
1527         assert(c);
1528
1529         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1530          * let's protect even those systems where this is left on in the kernel. */
1531
1532         if (!c->protect_kernel_tunables)
1533                 return 0;
1534
1535         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1536                 return 0;
1537
1538         return seccomp_protect_sysctl();
1539 }
1540
1541 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1542         assert(u);
1543         assert(c);
1544
1545         /* Turn off module syscalls on ProtectKernelModules=yes */
1546
1547         if (!c->protect_kernel_modules)
1548                 return 0;
1549
1550         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1551                 return 0;
1552
1553         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1554 }
1555
1556 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1557         assert(u);
1558         assert(c);
1559
1560         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1561
1562         if (!c->private_devices)
1563                 return 0;
1564
1565         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1566                 return 0;
1567
1568         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1569 }
1570
1571 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1572         assert(u);
1573         assert(c);
1574
1575         if (!exec_context_restrict_namespaces_set(c))
1576                 return 0;
1577
1578         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1579                 return 0;
1580
1581         return seccomp_restrict_namespaces(c->restrict_namespaces);
1582 }
1583
1584 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1585         unsigned long personality;
1586         int r;
1587
1588         assert(u);
1589         assert(c);
1590
1591         if (!c->lock_personality)
1592                 return 0;
1593
1594         if (skip_seccomp_unavailable(u, "LockPersonality="))
1595                 return 0;
1596
1597         personality = c->personality;
1598
1599         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1600         if (personality == PERSONALITY_INVALID) {
1601
1602                 r = opinionated_personality(&personality);
1603                 if (r < 0)
1604                         return r;
1605         }
1606
1607         return seccomp_lock_personality(personality);
1608 }
1609
1610 #endif
1611
1612 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1613         assert(idle_pipe);
1614
1615         idle_pipe[1] = safe_close(idle_pipe[1]);
1616         idle_pipe[2] = safe_close(idle_pipe[2]);
1617
1618         if (idle_pipe[0] >= 0) {
1619                 int r;
1620
1621                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1622
1623                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1624                         ssize_t n;
1625
1626                         /* Signal systemd that we are bored and want to continue. */
1627                         n = write(idle_pipe[3], "x", 1);
1628                         if (n > 0)
1629                                 /* Wait for systemd to react to the signal above. */
1630                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1631                 }
1632
1633                 idle_pipe[0] = safe_close(idle_pipe[0]);
1634
1635         }
1636
1637         idle_pipe[3] = safe_close(idle_pipe[3]);
1638 }
1639
1640 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1641
1642 static int build_environment(
1643                 const Unit *u,
1644                 const ExecContext *c,
1645                 const ExecParameters *p,
1646                 size_t n_fds,
1647                 const char *home,
1648                 const char *username,
1649                 const char *shell,
1650                 dev_t journal_stream_dev,
1651                 ino_t journal_stream_ino,
1652                 char ***ret) {
1653
1654         _cleanup_strv_free_ char **our_env = NULL;
1655         ExecDirectoryType t;
1656         size_t n_env = 0;
1657         char *x;
1658
1659         assert(u);
1660         assert(c);
1661         assert(p);
1662         assert(ret);
1663
1664         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1665         if (!our_env)
1666                 return -ENOMEM;
1667
1668         if (n_fds > 0) {
1669                 _cleanup_free_ char *joined = NULL;
1670
1671                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1672                         return -ENOMEM;
1673                 our_env[n_env++] = x;
1674
1675                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1676                         return -ENOMEM;
1677                 our_env[n_env++] = x;
1678
1679                 joined = strv_join(p->fd_names, ":");
1680                 if (!joined)
1681                         return -ENOMEM;
1682
1683                 x = strjoin("LISTEN_FDNAMES=", joined);
1684                 if (!x)
1685                         return -ENOMEM;
1686                 our_env[n_env++] = x;
1687         }
1688
1689         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1690                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1691                         return -ENOMEM;
1692                 our_env[n_env++] = x;
1693
1694                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1695                         return -ENOMEM;
1696                 our_env[n_env++] = x;
1697         }
1698
1699         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1700          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1701          * check the database directly. */
1702         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1703                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1704                 if (!x)
1705                         return -ENOMEM;
1706                 our_env[n_env++] = x;
1707         }
1708
1709         if (home) {
1710                 x = strappend("HOME=", home);
1711                 if (!x)
1712                         return -ENOMEM;
1713
1714                 path_simplify(x + 5, true);
1715                 our_env[n_env++] = x;
1716         }
1717
1718         if (username) {
1719                 x = strappend("LOGNAME=", username);
1720                 if (!x)
1721                         return -ENOMEM;
1722                 our_env[n_env++] = x;
1723
1724                 x = strappend("USER=", username);
1725                 if (!x)
1726                         return -ENOMEM;
1727                 our_env[n_env++] = x;
1728         }
1729
1730         if (shell) {
1731                 x = strappend("SHELL=", shell);
1732                 if (!x)
1733                         return -ENOMEM;
1734
1735                 path_simplify(x + 6, true);
1736                 our_env[n_env++] = x;
1737         }
1738
1739         if (!sd_id128_is_null(u->invocation_id)) {
1740                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1741                         return -ENOMEM;
1742
1743                 our_env[n_env++] = x;
1744         }
1745
1746         if (exec_context_needs_term(c)) {
1747                 const char *tty_path, *term = NULL;
1748
1749                 tty_path = exec_context_tty_path(c);
1750
1751                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1752                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1753                  * passes to PID 1 ends up all the way in the console login shown. */
1754
1755                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1756                         term = getenv("TERM");
1757                 if (!term)
1758                         term = default_term_for_tty(tty_path);
1759
1760                 x = strappend("TERM=", term);
1761                 if (!x)
1762                         return -ENOMEM;
1763                 our_env[n_env++] = x;
1764         }
1765
1766         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1767                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1768                         return -ENOMEM;
1769
1770                 our_env[n_env++] = x;
1771         }
1772
1773         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1774                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1775                 const char *n;
1776
1777                 if (!p->prefix[t])
1778                         continue;
1779
1780                 if (strv_isempty(c->directories[t].paths))
1781                         continue;
1782
1783                 n = exec_directory_env_name_to_string(t);
1784                 if (!n)
1785                         continue;
1786
1787                 pre = strjoin(p->prefix[t], "/");
1788                 if (!pre)
1789                         return -ENOMEM;
1790
1791                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1792                 if (!joined)
1793                         return -ENOMEM;
1794
1795                 x = strjoin(n, "=", joined);
1796                 if (!x)
1797                         return -ENOMEM;
1798
1799                 our_env[n_env++] = x;
1800         }
1801
1802         our_env[n_env++] = NULL;
1803         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1804
1805         *ret = TAKE_PTR(our_env);
1806
1807         return 0;
1808 }
1809
1810 static int build_pass_environment(const ExecContext *c, char ***ret) {
1811         _cleanup_strv_free_ char **pass_env = NULL;
1812         size_t n_env = 0, n_bufsize = 0;
1813         char **i;
1814
1815         STRV_FOREACH(i, c->pass_environment) {
1816                 _cleanup_free_ char *x = NULL;
1817                 char *v;
1818
1819                 v = getenv(*i);
1820                 if (!v)
1821                         continue;
1822                 x = strjoin(*i, "=", v);
1823                 if (!x)
1824                         return -ENOMEM;
1825
1826                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1827                         return -ENOMEM;
1828
1829                 pass_env[n_env++] = TAKE_PTR(x);
1830                 pass_env[n_env] = NULL;
1831         }
1832
1833         *ret = TAKE_PTR(pass_env);
1834
1835         return 0;
1836 }
1837
1838 static bool exec_needs_mount_namespace(
1839                 const ExecContext *context,
1840                 const ExecParameters *params,
1841                 const ExecRuntime *runtime) {
1842
1843         assert(context);
1844         assert(params);
1845
1846         if (context->root_image)
1847                 return true;
1848
1849         if (!strv_isempty(context->read_write_paths) ||
1850             !strv_isempty(context->read_only_paths) ||
1851             !strv_isempty(context->inaccessible_paths))
1852                 return true;
1853
1854         if (context->n_bind_mounts > 0)
1855                 return true;
1856
1857         if (context->n_temporary_filesystems > 0)
1858                 return true;
1859
1860         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1861                 return true;
1862
1863         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1864                 return true;
1865
1866         if (context->private_devices ||
1867             context->private_mounts ||
1868             context->protect_system != PROTECT_SYSTEM_NO ||
1869             context->protect_home != PROTECT_HOME_NO ||
1870             context->protect_kernel_tunables ||
1871             context->protect_kernel_modules ||
1872             context->protect_control_groups)
1873                 return true;
1874
1875         if (context->root_directory) {
1876                 ExecDirectoryType t;
1877
1878                 if (context->mount_apivfs)
1879                         return true;
1880
1881                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1882                         if (!params->prefix[t])
1883                                 continue;
1884
1885                         if (!strv_isempty(context->directories[t].paths))
1886                                 return true;
1887                 }
1888         }
1889
1890         if (context->dynamic_user &&
1891             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1892              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1893              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1894                 return true;
1895
1896         return false;
1897 }
1898
1899 static int setup_private_users(uid_t uid, gid_t gid) {
1900         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1901         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1902         _cleanup_close_ int unshare_ready_fd = -1;
1903         _cleanup_(sigkill_waitp) pid_t pid = 0;
1904         uint64_t c = 1;
1905         ssize_t n;
1906         int r;
1907
1908         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1909          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1910          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1911          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1912          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1913          * continues execution normally. */
1914
1915         if (uid != 0 && uid_is_valid(uid)) {
1916                 r = asprintf(&uid_map,
1917                              "0 0 1\n"                      /* Map root → root */
1918                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1919                              uid, uid);
1920                 if (r < 0)
1921                         return -ENOMEM;
1922         } else {
1923                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1924                 if (!uid_map)
1925                         return -ENOMEM;
1926         }
1927
1928         if (gid != 0 && gid_is_valid(gid)) {
1929                 r = asprintf(&gid_map,
1930                              "0 0 1\n"                      /* Map root → root */
1931                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1932                              gid, gid);
1933                 if (r < 0)
1934                         return -ENOMEM;
1935         } else {
1936                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1937                 if (!gid_map)
1938                         return -ENOMEM;
1939         }
1940
1941         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1942          * namespace. */
1943         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1944         if (unshare_ready_fd < 0)
1945                 return -errno;
1946
1947         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1948          * failed. */
1949         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1950                 return -errno;
1951
1952         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1953         if (r < 0)
1954                 return r;
1955         if (r == 0) {
1956                 _cleanup_close_ int fd = -1;
1957                 const char *a;
1958                 pid_t ppid;
1959
1960                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1961                  * here, after the parent opened its own user namespace. */
1962
1963                 ppid = getppid();
1964                 errno_pipe[0] = safe_close(errno_pipe[0]);
1965
1966                 /* Wait until the parent unshared the user namespace */
1967                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1968                         r = -errno;
1969                         goto child_fail;
1970                 }
1971
1972                 /* Disable the setgroups() system call in the child user namespace, for good. */
1973                 a = procfs_file_alloca(ppid, "setgroups");
1974                 fd = open(a, O_WRONLY|O_CLOEXEC);
1975                 if (fd < 0) {
1976                         if (errno != ENOENT) {
1977                                 r = -errno;
1978                                 goto child_fail;
1979                         }
1980
1981                         /* If the file is missing the kernel is too old, let's continue anyway. */
1982                 } else {
1983                         if (write(fd, "deny\n", 5) < 0) {
1984                                 r = -errno;
1985                                 goto child_fail;
1986                         }
1987
1988                         fd = safe_close(fd);
1989                 }
1990
1991                 /* First write the GID map */
1992                 a = procfs_file_alloca(ppid, "gid_map");
1993                 fd = open(a, O_WRONLY|O_CLOEXEC);
1994                 if (fd < 0) {
1995                         r = -errno;
1996                         goto child_fail;
1997                 }
1998                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1999                         r = -errno;
2000                         goto child_fail;
2001                 }
2002                 fd = safe_close(fd);
2003
2004                 /* The write the UID map */
2005                 a = procfs_file_alloca(ppid, "uid_map");
2006                 fd = open(a, O_WRONLY|O_CLOEXEC);
2007                 if (fd < 0) {
2008                         r = -errno;
2009                         goto child_fail;
2010                 }
2011                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2012                         r = -errno;
2013                         goto child_fail;
2014                 }
2015
2016                 _exit(EXIT_SUCCESS);
2017
2018         child_fail:
2019                 (void) write(errno_pipe[1], &r, sizeof(r));
2020                 _exit(EXIT_FAILURE);
2021         }
2022
2023         errno_pipe[1] = safe_close(errno_pipe[1]);
2024
2025         if (unshare(CLONE_NEWUSER) < 0)
2026                 return -errno;
2027
2028         /* Let the child know that the namespace is ready now */
2029         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2030                 return -errno;
2031
2032         /* Try to read an error code from the child */
2033         n = read(errno_pipe[0], &r, sizeof(r));
2034         if (n < 0)
2035                 return -errno;
2036         if (n == sizeof(r)) { /* an error code was sent to us */
2037                 if (r < 0)
2038                         return r;
2039                 return -EIO;
2040         }
2041         if (n != 0) /* on success we should have read 0 bytes */
2042                 return -EIO;
2043
2044         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2045         pid = 0;
2046         if (r < 0)
2047                 return r;
2048         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2049                 return -EIO;
2050
2051         return 0;
2052 }
2053
2054 static int setup_exec_directory(
2055                 const ExecContext *context,
2056                 const ExecParameters *params,
2057                 uid_t uid,
2058                 gid_t gid,
2059                 ExecDirectoryType type,
2060                 int *exit_status) {
2061
2062         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2063                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2064                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2065                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2066                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2067                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2068         };
2069         char **rt;
2070         int r;
2071
2072         assert(context);
2073         assert(params);
2074         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2075         assert(exit_status);
2076
2077         if (!params->prefix[type])
2078                 return 0;
2079
2080         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2081                 if (!uid_is_valid(uid))
2082                         uid = 0;
2083                 if (!gid_is_valid(gid))
2084                         gid = 0;
2085         }
2086
2087         STRV_FOREACH(rt, context->directories[type].paths) {
2088                 _cleanup_free_ char *p = NULL, *pp = NULL;
2089
2090                 p = path_join(params->prefix[type], *rt);
2091                 if (!p) {
2092                         r = -ENOMEM;
2093                         goto fail;
2094                 }
2095
2096                 r = mkdir_parents_label(p, 0755);
2097                 if (r < 0)
2098                         goto fail;
2099
2100                 if (context->dynamic_user &&
2101                     (!IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) ||
2102                      (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode != EXEC_PRESERVE_NO))) {
2103                         _cleanup_free_ char *private_root = NULL;
2104
2105                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2106                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2107                          * whose UID is later on reused. To lock this down we use the same trick used by container
2108                          * managers to prohibit host users to get access to files of the same UID in containers: we
2109                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2110                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2111                          * to make this directory permeable for the service itself.
2112                          *
2113                          * Specifically: for a service which wants a special directory "foo/" we first create a
2114                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2115                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2116                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2117                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2118                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2119                          * disabling the access boundary for the service and making sure it only gets access to the
2120                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2121                          *
2122                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2123                          * owned by the service itself.
2124                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2125                          * files or sockets with other services. */
2126
2127                         private_root = path_join(params->prefix[type], "private");
2128                         if (!private_root) {
2129                                 r = -ENOMEM;
2130                                 goto fail;
2131                         }
2132
2133                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2134                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2135                         if (r < 0)
2136                                 goto fail;
2137
2138                         pp = path_join(private_root, *rt);
2139                         if (!pp) {
2140                                 r = -ENOMEM;
2141                                 goto fail;
2142                         }
2143
2144                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2145                         r = mkdir_parents_label(pp, 0755);
2146                         if (r < 0)
2147                                 goto fail;
2148
2149                         if (is_dir(p, false) > 0 &&
2150                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2151
2152                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2153                                  * it over. Most likely the service has been upgraded from one that didn't use
2154                                  * DynamicUser=1, to one that does. */
2155
2156                                 if (rename(p, pp) < 0) {
2157                                         r = -errno;
2158                                         goto fail;
2159                                 }
2160                         } else {
2161                                 /* Otherwise, create the actual directory for the service */
2162
2163                                 r = mkdir_label(pp, context->directories[type].mode);
2164                                 if (r < 0 && r != -EEXIST)
2165                                         goto fail;
2166                         }
2167
2168                         /* And link it up from the original place */
2169                         r = symlink_idempotent(pp, p, true);
2170                         if (r < 0)
2171                                 goto fail;
2172
2173                 } else {
2174                         r = mkdir_label(p, context->directories[type].mode);
2175                         if (r < 0) {
2176                                 if (r != -EEXIST)
2177                                         goto fail;
2178
2179                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2180                                         struct stat st;
2181
2182                                         /* Don't change the owner/access mode of the configuration directory,
2183                                          * as in the common case it is not written to by a service, and shall
2184                                          * not be writable. */
2185
2186                                         if (stat(p, &st) < 0) {
2187                                                 r = -errno;
2188                                                 goto fail;
2189                                         }
2190
2191                                         /* Still complain if the access mode doesn't match */
2192                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2193                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2194                                                             "(File system: %o %sMode: %o)",
2195                                                             exec_directory_type_to_string(type), *rt,
2196                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2197
2198                                         continue;
2199                                 }
2200                         }
2201                 }
2202
2203                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2204                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2205                  * current UID/GID ownership.) */
2206                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2207                 if (r < 0)
2208                         goto fail;
2209
2210                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2211                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2212                  * assignments to exist.*/
2213                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2214                 if (r < 0)
2215                         goto fail;
2216         }
2217
2218         return 0;
2219
2220 fail:
2221         *exit_status = exit_status_table[type];
2222         return r;
2223 }
2224
2225 #if ENABLE_SMACK
2226 static int setup_smack(
2227                 const ExecContext *context,
2228                 const ExecCommand *command) {
2229
2230         int r;
2231
2232         assert(context);
2233         assert(command);
2234
2235         if (context->smack_process_label) {
2236                 r = mac_smack_apply_pid(0, context->smack_process_label);
2237                 if (r < 0)
2238                         return r;
2239         }
2240 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2241         else {
2242                 _cleanup_free_ char *exec_label = NULL;
2243
2244                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2245                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2246                         return r;
2247
2248                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2249                 if (r < 0)
2250                         return r;
2251         }
2252 #endif
2253
2254         return 0;
2255 }
2256 #endif
2257
2258 static int compile_bind_mounts(
2259                 const ExecContext *context,
2260                 const ExecParameters *params,
2261                 BindMount **ret_bind_mounts,
2262                 size_t *ret_n_bind_mounts,
2263                 char ***ret_empty_directories) {
2264
2265         _cleanup_strv_free_ char **empty_directories = NULL;
2266         BindMount *bind_mounts;
2267         size_t n, h = 0, i;
2268         ExecDirectoryType t;
2269         int r;
2270
2271         assert(context);
2272         assert(params);
2273         assert(ret_bind_mounts);
2274         assert(ret_n_bind_mounts);
2275         assert(ret_empty_directories);
2276
2277         n = context->n_bind_mounts;
2278         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2279                 if (!params->prefix[t])
2280                         continue;
2281
2282                 n += strv_length(context->directories[t].paths);
2283         }
2284
2285         if (n <= 0) {
2286                 *ret_bind_mounts = NULL;
2287                 *ret_n_bind_mounts = 0;
2288                 *ret_empty_directories = NULL;
2289                 return 0;
2290         }
2291
2292         bind_mounts = new(BindMount, n);
2293         if (!bind_mounts)
2294                 return -ENOMEM;
2295
2296         for (i = 0; i < context->n_bind_mounts; i++) {
2297                 BindMount *item = context->bind_mounts + i;
2298                 char *s, *d;
2299
2300                 s = strdup(item->source);
2301                 if (!s) {
2302                         r = -ENOMEM;
2303                         goto finish;
2304                 }
2305
2306                 d = strdup(item->destination);
2307                 if (!d) {
2308                         free(s);
2309                         r = -ENOMEM;
2310                         goto finish;
2311                 }
2312
2313                 bind_mounts[h++] = (BindMount) {
2314                         .source = s,
2315                         .destination = d,
2316                         .read_only = item->read_only,
2317                         .recursive = item->recursive,
2318                         .ignore_enoent = item->ignore_enoent,
2319                 };
2320         }
2321
2322         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2323                 char **suffix;
2324
2325                 if (!params->prefix[t])
2326                         continue;
2327
2328                 if (strv_isempty(context->directories[t].paths))
2329                         continue;
2330
2331                 if (context->dynamic_user &&
2332                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2333                     !(context->root_directory || context->root_image)) {
2334                         char *private_root;
2335
2336                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2337                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2338                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2339
2340                         private_root = strjoin(params->prefix[t], "/private");
2341                         if (!private_root) {
2342                                 r = -ENOMEM;
2343                                 goto finish;
2344                         }
2345
2346                         r = strv_consume(&empty_directories, private_root);
2347                         if (r < 0)
2348                                 goto finish;
2349                 }
2350
2351                 STRV_FOREACH(suffix, context->directories[t].paths) {
2352                         char *s, *d;
2353
2354                         if (context->dynamic_user &&
2355                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2356                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2357                         else
2358                                 s = strjoin(params->prefix[t], "/", *suffix);
2359                         if (!s) {
2360                                 r = -ENOMEM;
2361                                 goto finish;
2362                         }
2363
2364                         if (context->dynamic_user &&
2365                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2366                             (context->root_directory || context->root_image))
2367                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2368                                  * directory is not created on the root directory. So, let's bind-mount the directory
2369                                  * on the 'non-private' place. */
2370                                 d = strjoin(params->prefix[t], "/", *suffix);
2371                         else
2372                                 d = strdup(s);
2373                         if (!d) {
2374                                 free(s);
2375                                 r = -ENOMEM;
2376                                 goto finish;
2377                         }
2378
2379                         bind_mounts[h++] = (BindMount) {
2380                                 .source = s,
2381                                 .destination = d,
2382                                 .read_only = false,
2383                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2384                                 .recursive = true,
2385                                 .ignore_enoent = false,
2386                         };
2387                 }
2388         }
2389
2390         assert(h == n);
2391
2392         *ret_bind_mounts = bind_mounts;
2393         *ret_n_bind_mounts = n;
2394         *ret_empty_directories = TAKE_PTR(empty_directories);
2395
2396         return (int) n;
2397
2398 finish:
2399         bind_mount_free_many(bind_mounts, h);
2400         return r;
2401 }
2402
2403 static int apply_mount_namespace(
2404                 const Unit *u,
2405                 const ExecCommand *command,
2406                 const ExecContext *context,
2407                 const ExecParameters *params,
2408                 const ExecRuntime *runtime,
2409                 char **error_path) {
2410
2411         _cleanup_strv_free_ char **empty_directories = NULL;
2412         char *tmp = NULL, *var = NULL;
2413         const char *root_dir = NULL, *root_image = NULL;
2414         NamespaceInfo ns_info;
2415         bool needs_sandboxing;
2416         BindMount *bind_mounts = NULL;
2417         size_t n_bind_mounts = 0;
2418         int r;
2419
2420         assert(context);
2421
2422         /* The runtime struct only contains the parent of the private /tmp,
2423          * which is non-accessible to world users. Inside of it there's a /tmp
2424          * that is sticky, and that's the one we want to use here. */
2425
2426         if (context->private_tmp && runtime) {
2427                 if (runtime->tmp_dir)
2428                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2429                 if (runtime->var_tmp_dir)
2430                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2431         }
2432
2433         if (params->flags & EXEC_APPLY_CHROOT) {
2434                 root_image = context->root_image;
2435
2436                 if (!root_image)
2437                         root_dir = context->root_directory;
2438         }
2439
2440         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2441         if (r < 0)
2442                 return r;
2443
2444         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2445         if (needs_sandboxing)
2446                 ns_info = (NamespaceInfo) {
2447                         .ignore_protect_paths = false,
2448                         .private_dev = context->private_devices,
2449                         .protect_control_groups = context->protect_control_groups,
2450                         .protect_kernel_tunables = context->protect_kernel_tunables,
2451                         .protect_kernel_modules = context->protect_kernel_modules,
2452                         .protect_hostname = context->protect_hostname,
2453                         .mount_apivfs = context->mount_apivfs,
2454                         .private_mounts = context->private_mounts,
2455                 };
2456         else if (!context->dynamic_user && root_dir)
2457                 /*
2458                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2459                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2460                  * fail if we are enable to apply the sandbox inside the mount namespace.
2461                  */
2462                 ns_info = (NamespaceInfo) {
2463                         .ignore_protect_paths = true,
2464                 };
2465         else
2466                 ns_info = (NamespaceInfo) {};
2467
2468         if (context->mount_flags == MS_SHARED)
2469                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2470
2471         r = setup_namespace(root_dir, root_image,
2472                             &ns_info, context->read_write_paths,
2473                             needs_sandboxing ? context->read_only_paths : NULL,
2474                             needs_sandboxing ? context->inaccessible_paths : NULL,
2475                             empty_directories,
2476                             bind_mounts,
2477                             n_bind_mounts,
2478                             context->temporary_filesystems,
2479                             context->n_temporary_filesystems,
2480                             tmp,
2481                             var,
2482                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2483                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2484                             context->mount_flags,
2485                             DISSECT_IMAGE_DISCARD_ON_LOOP,
2486                             error_path);
2487
2488         bind_mount_free_many(bind_mounts, n_bind_mounts);
2489
2490         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2491          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2492          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2493          * completely different execution environment. */
2494         if (r == -ENOANO) {
2495                 if (n_bind_mounts == 0 &&
2496                     context->n_temporary_filesystems == 0 &&
2497                     !root_dir && !root_image &&
2498                     !context->dynamic_user) {
2499                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2500                         return 0;
2501                 }
2502
2503                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2504                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2505                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2506
2507                 return -EOPNOTSUPP;
2508         }
2509
2510         return r;
2511 }
2512
2513 static int apply_working_directory(
2514                 const ExecContext *context,
2515                 const ExecParameters *params,
2516                 const char *home,
2517                 const bool needs_mount_ns,
2518                 int *exit_status) {
2519
2520         const char *d, *wd;
2521
2522         assert(context);
2523         assert(exit_status);
2524
2525         if (context->working_directory_home) {
2526
2527                 if (!home) {
2528                         *exit_status = EXIT_CHDIR;
2529                         return -ENXIO;
2530                 }
2531
2532                 wd = home;
2533
2534         } else if (context->working_directory)
2535                 wd = context->working_directory;
2536         else
2537                 wd = "/";
2538
2539         if (params->flags & EXEC_APPLY_CHROOT) {
2540                 if (!needs_mount_ns && context->root_directory)
2541                         if (chroot(context->root_directory) < 0) {
2542                                 *exit_status = EXIT_CHROOT;
2543                                 return -errno;
2544                         }
2545
2546                 d = wd;
2547         } else
2548                 d = prefix_roota(context->root_directory, wd);
2549
2550         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2551                 *exit_status = EXIT_CHDIR;
2552                 return -errno;
2553         }
2554
2555         return 0;
2556 }
2557
2558 static int setup_keyring(
2559                 const Unit *u,
2560                 const ExecContext *context,
2561                 const ExecParameters *p,
2562                 uid_t uid, gid_t gid) {
2563
2564         key_serial_t keyring;
2565         int r = 0;
2566         uid_t saved_uid;
2567         gid_t saved_gid;
2568
2569         assert(u);
2570         assert(context);
2571         assert(p);
2572
2573         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2574          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2575          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2576          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2577          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2578          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2579
2580         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2581                 return 0;
2582
2583         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2584          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2585          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2586          * & group is just as nasty as acquiring a reference to the user keyring. */
2587
2588         saved_uid = getuid();
2589         saved_gid = getgid();
2590
2591         if (gid_is_valid(gid) && gid != saved_gid) {
2592                 if (setregid(gid, -1) < 0)
2593                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2594         }
2595
2596         if (uid_is_valid(uid) && uid != saved_uid) {
2597                 if (setreuid(uid, -1) < 0) {
2598                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2599                         goto out;
2600                 }
2601         }
2602
2603         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2604         if (keyring == -1) {
2605                 if (errno == ENOSYS)
2606                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2607                 else if (IN_SET(errno, EACCES, EPERM))
2608                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2609                 else if (errno == EDQUOT)
2610                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2611                 else
2612                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2613
2614                 goto out;
2615         }
2616
2617         /* When requested link the user keyring into the session keyring. */
2618         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2619
2620                 if (keyctl(KEYCTL_LINK,
2621                            KEY_SPEC_USER_KEYRING,
2622                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2623                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2624                         goto out;
2625                 }
2626         }
2627
2628         /* Restore uid/gid back */
2629         if (uid_is_valid(uid) && uid != saved_uid) {
2630                 if (setreuid(saved_uid, -1) < 0) {
2631                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2632                         goto out;
2633                 }
2634         }
2635
2636         if (gid_is_valid(gid) && gid != saved_gid) {
2637                 if (setregid(saved_gid, -1) < 0)
2638                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2639         }
2640
2641         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2642         if (!sd_id128_is_null(u->invocation_id)) {
2643                 key_serial_t key;
2644
2645                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2646                 if (key == -1)
2647                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2648                 else {
2649                         if (keyctl(KEYCTL_SETPERM, key,
2650                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2651                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2652                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2653                 }
2654         }
2655
2656 out:
2657         /* Revert back uid & gid for the the last time, and exit */
2658         /* no extra logging, as only the first already reported error matters */
2659         if (getuid() != saved_uid)
2660                 (void) setreuid(saved_uid, -1);
2661
2662         if (getgid() != saved_gid)
2663                 (void) setregid(saved_gid, -1);
2664
2665         return r;
2666 }
2667
/* Appends the valid (i.e. non-negative) file descriptors of 'pair' to 'array', first slot before second, and
 * advances *n by the number of entries added. A NULL 'pair' is accepted and ignored. No bounds checking is done
 * on 'array'; the caller has to provide room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t slot;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (slot = 0; slot < 2; slot++) {
                if (pair[slot] < 0)
                        continue;

                array[*n] = pair[slot];
                (*n)++;
        }
}
2680
2681 static int close_remaining_fds(
2682                 const ExecParameters *params,
2683                 const ExecRuntime *runtime,
2684                 const DynamicCreds *dcreds,
2685                 int user_lookup_fd,
2686                 int socket_fd,
2687                 int exec_fd,
2688                 int *fds, size_t n_fds) {
2689
2690         size_t n_dont_close = 0;
2691         int dont_close[n_fds + 12];
2692
2693         assert(params);
2694
2695         if (params->stdin_fd >= 0)
2696                 dont_close[n_dont_close++] = params->stdin_fd;
2697         if (params->stdout_fd >= 0)
2698                 dont_close[n_dont_close++] = params->stdout_fd;
2699         if (params->stderr_fd >= 0)
2700                 dont_close[n_dont_close++] = params->stderr_fd;
2701
2702         if (socket_fd >= 0)
2703                 dont_close[n_dont_close++] = socket_fd;
2704         if (exec_fd >= 0)
2705                 dont_close[n_dont_close++] = exec_fd;
2706         if (n_fds > 0) {
2707                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2708                 n_dont_close += n_fds;
2709         }
2710
2711         if (runtime)
2712                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2713
2714         if (dcreds) {
2715                 if (dcreds->user)
2716                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2717                 if (dcreds->group)
2718                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2719         }
2720
2721         if (user_lookup_fd >= 0)
2722                 dont_close[n_dont_close++] = user_lookup_fd;
2723
2724         return close_all_fds(dont_close, n_dont_close);
2725 }
2726
2727 static int send_user_lookup(
2728                 Unit *unit,
2729                 int user_lookup_fd,
2730                 uid_t uid,
2731                 gid_t gid) {
2732
2733         assert(unit);
2734
2735         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2736          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2737          * specified. */
2738
2739         if (user_lookup_fd < 0)
2740                 return 0;
2741
2742         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2743                 return 0;
2744
2745         if (writev(user_lookup_fd,
2746                (struct iovec[]) {
2747                            IOVEC_INIT(&uid, sizeof(uid)),
2748                            IOVEC_INIT(&gid, sizeof(gid)),
2749                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2750                 return -errno;
2751
2752         return 0;
2753 }
2754
2755 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2756         int r;
2757
2758         assert(c);
2759         assert(home);
2760         assert(buf);
2761
2762         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2763
2764         if (*home)
2765                 return 0;
2766
2767         if (!c->working_directory_home)
2768                 return 0;
2769
2770         r = get_home_dir(buf);
2771         if (r < 0)
2772                 return r;
2773
2774         *home = *buf;
2775         return 1;
2776 }
2777
2778 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2779         _cleanup_strv_free_ char ** list = NULL;
2780         ExecDirectoryType t;
2781         int r;
2782
2783         assert(c);
2784         assert(p);
2785         assert(ret);
2786
2787         assert(c->dynamic_user);
2788
2789         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2790          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2791          * directories. */
2792
2793         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2794                 char **i;
2795
2796                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2797                         continue;
2798
2799                 if (!p->prefix[t])
2800                         continue;
2801
2802                 STRV_FOREACH(i, c->directories[t].paths) {
2803                         char *e;
2804
2805                         if (t == EXEC_DIRECTORY_RUNTIME)
2806                                 e = strjoin(p->prefix[t], "/", *i);
2807                         else
2808                                 e = strjoin(p->prefix[t], "/private/", *i);
2809                         if (!e)
2810                                 return -ENOMEM;
2811
2812                         r = strv_consume(&list, e);
2813                         if (r < 0)
2814                                 return r;
2815                 }
2816         }
2817
2818         *ret = TAKE_PTR(list);
2819
2820         return 0;
2821 }
2822
2823 static char *exec_command_line(char **argv);
2824
2825 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2826         bool using_subcgroup;
2827         char *p;
2828
2829         assert(params);
2830         assert(ret);
2831
2832         if (!params->cgroup_path)
2833                 return -EINVAL;
2834
2835         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2836          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2837          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2838          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2839          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2840          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2841          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2842          * flag, which is only passed for the former statements, not for the latter. */
2843
2844         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2845         if (using_subcgroup)
2846                 p = strjoin(params->cgroup_path, "/.control");
2847         else
2848                 p = strdup(params->cgroup_path);
2849         if (!p)
2850                 return -ENOMEM;
2851
2852         *ret = p;
2853         return using_subcgroup;
2854 }
2855
2856 static int exec_child(
2857                 Unit *unit,
2858                 const ExecCommand *command,
2859                 const ExecContext *context,
2860                 const ExecParameters *params,
2861                 ExecRuntime *runtime,
2862                 DynamicCreds *dcreds,
2863                 int socket_fd,
2864                 int named_iofds[3],
2865                 int *fds,
2866                 size_t n_socket_fds,
2867                 size_t n_storage_fds,
2868                 char **files_env,
2869                 int user_lookup_fd,
2870                 int *exit_status) {
2871
2872         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2873         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2874         _cleanup_free_ gid_t *supplementary_gids = NULL;
2875         const char *username = NULL, *groupname = NULL;
2876         _cleanup_free_ char *home_buffer = NULL;
2877         const char *home = NULL, *shell = NULL;
2878         char **final_argv = NULL;
2879         dev_t journal_stream_dev = 0;
2880         ino_t journal_stream_ino = 0;
2881         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2882                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2883                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2884                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2885 #if HAVE_SELINUX
2886         _cleanup_free_ char *mac_selinux_context_net = NULL;
2887         bool use_selinux = false;
2888 #endif
2889 #if ENABLE_SMACK
2890         bool use_smack = false;
2891 #endif
2892 #if HAVE_APPARMOR
2893         bool use_apparmor = false;
2894 #endif
2895         uid_t uid = UID_INVALID;
2896         gid_t gid = GID_INVALID;
2897         size_t n_fds;
2898         ExecDirectoryType dt;
2899         int secure_bits;
2900
2901         assert(unit);
2902         assert(command);
2903         assert(context);
2904         assert(params);
2905         assert(exit_status);
2906
2907         rename_process_from_path(command->path);
2908
2909         /* We reset exactly these signals, since they are the
2910          * only ones we set to SIG_IGN in the main daemon. All
2911          * others we leave untouched because we set them to
2912          * SIG_DFL or a valid handler initially, both of which
2913          * will be demoted to SIG_DFL. */
2914         (void) default_signals(SIGNALS_CRASH_HANDLER,
2915                                SIGNALS_IGNORE, -1);
2916
2917         if (context->ignore_sigpipe)
2918                 (void) ignore_signals(SIGPIPE, -1);
2919
2920         r = reset_signal_mask();
2921         if (r < 0) {
2922                 *exit_status = EXIT_SIGNAL_MASK;
2923                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2924         }
2925
2926         if (params->idle_pipe)
2927                 do_idle_pipe_dance(params->idle_pipe);
2928
2929         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2930          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2931          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2932          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2933
2934         log_forget_fds();
2935         log_set_open_when_needed(true);
2936
2937         /* In case anything used libc syslog(), close this here, too */
2938         closelog();
2939
2940         n_fds = n_socket_fds + n_storage_fds;
2941         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2942         if (r < 0) {
2943                 *exit_status = EXIT_FDS;
2944                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2945         }
2946
2947         if (!context->same_pgrp)
2948                 if (setsid() < 0) {
2949                         *exit_status = EXIT_SETSID;
2950                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2951                 }
2952
2953         exec_context_tty_reset(context, params);
2954
2955         if (unit_shall_confirm_spawn(unit)) {
2956                 const char *vc = params->confirm_spawn;
2957                 _cleanup_free_ char *cmdline = NULL;
2958
2959                 cmdline = exec_command_line(command->argv);
2960                 if (!cmdline) {
2961                         *exit_status = EXIT_MEMORY;
2962                         return log_oom();
2963                 }
2964
2965                 r = ask_for_confirmation(vc, unit, cmdline);
2966                 if (r != CONFIRM_EXECUTE) {
2967                         if (r == CONFIRM_PRETEND_SUCCESS) {
2968                                 *exit_status = EXIT_SUCCESS;
2969                                 return 0;
2970                         }
2971                         *exit_status = EXIT_CONFIRM;
2972                         log_unit_error(unit, "Execution cancelled by the user");
2973                         return -ECANCELED;
2974                 }
2975         }
2976
2977         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2978          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2979          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2980          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2981          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2982         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2983             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2984                 *exit_status = EXIT_MEMORY;
2985                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2986         }
2987
2988         if (context->dynamic_user && dcreds) {
2989                 _cleanup_strv_free_ char **suggested_paths = NULL;
2990
2991                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2992                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2993                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2994                         *exit_status = EXIT_USER;
2995                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2996                 }
2997
2998                 r = compile_suggested_paths(context, params, &suggested_paths);
2999                 if (r < 0) {
3000                         *exit_status = EXIT_MEMORY;
3001                         return log_oom();
3002                 }
3003
3004                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3005                 if (r < 0) {
3006                         *exit_status = EXIT_USER;
3007                         if (r == -EILSEQ) {
3008                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3009                                 return -EOPNOTSUPP;
3010                         }
3011                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3012                 }
3013
3014                 if (!uid_is_valid(uid)) {
3015                         *exit_status = EXIT_USER;
3016                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3017                         return -ESRCH;
3018                 }
3019
3020                 if (!gid_is_valid(gid)) {
3021                         *exit_status = EXIT_USER;
3022                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3023                         return -ESRCH;
3024                 }
3025
3026                 if (dcreds->user)
3027                         username = dcreds->user->name;
3028
3029         } else {
3030                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3031                 if (r < 0) {
3032                         *exit_status = EXIT_USER;
3033                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3034                 }
3035
3036                 r = get_fixed_group(context, &groupname, &gid);
3037                 if (r < 0) {
3038                         *exit_status = EXIT_GROUP;
3039                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3040                 }
3041         }
3042
3043         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3044         r = get_supplementary_groups(context, username, groupname, gid,
3045                                      &supplementary_gids, &ngids);
3046         if (r < 0) {
3047                 *exit_status = EXIT_GROUP;
3048                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3049         }
3050
3051         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3052         if (r < 0) {
3053                 *exit_status = EXIT_USER;
3054                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3055         }
3056
3057         user_lookup_fd = safe_close(user_lookup_fd);
3058
3059         r = acquire_home(context, uid, &home, &home_buffer);
3060         if (r < 0) {
3061                 *exit_status = EXIT_CHDIR;
3062                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3063         }
3064
3065         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3066          * must sure to drop O_NONBLOCK */
3067         if (socket_fd >= 0)
3068                 (void) fd_nonblock(socket_fd, false);
3069
3070         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3071          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3072         if (params->cgroup_path) {
3073                 _cleanup_free_ char *p = NULL;
3074
3075                 r = exec_parameters_get_cgroup_path(params, &p);
3076                 if (r < 0) {
3077                         *exit_status = EXIT_CGROUP;
3078                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3079                 }
3080
3081                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3082                 if (r < 0) {
3083                         *exit_status = EXIT_CGROUP;
3084                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3085                 }
3086         }
3087
3088         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3089                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3090                 if (r < 0) {
3091                         *exit_status = EXIT_NETWORK;
3092                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3093                 }
3094         }
3095
3096         r = setup_input(context, params, socket_fd, named_iofds);
3097         if (r < 0) {
3098                 *exit_status = EXIT_STDIN;
3099                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3100         }
3101
3102         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3103         if (r < 0) {
3104                 *exit_status = EXIT_STDOUT;
3105                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3106         }
3107
3108         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3109         if (r < 0) {
3110                 *exit_status = EXIT_STDERR;
3111                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3112         }
3113
3114         if (context->oom_score_adjust_set) {
3115                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3116                  * prohibit write access to this file, and we shouldn't trip up over that. */
3117                 r = set_oom_score_adjust(context->oom_score_adjust);
3118                 if (IN_SET(r, -EPERM, -EACCES))
3119                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3120                 else if (r < 0) {
3121                         *exit_status = EXIT_OOM_ADJUST;
3122                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3123                 }
3124         }
3125
3126         if (context->nice_set)
3127                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3128                         *exit_status = EXIT_NICE;
3129                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3130                 }
3131
3132         if (context->cpu_sched_set) {
3133                 struct sched_param param = {
3134                         .sched_priority = context->cpu_sched_priority,
3135                 };
3136
3137                 r = sched_setscheduler(0,
3138                                        context->cpu_sched_policy |
3139                                        (context->cpu_sched_reset_on_fork ?
3140                                         SCHED_RESET_ON_FORK : 0),
3141                                        &param);
3142                 if (r < 0) {
3143                         *exit_status = EXIT_SETSCHEDULER;
3144                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3145                 }
3146         }
3147
3148         if (context->cpuset)
3149                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
3150                         *exit_status = EXIT_CPUAFFINITY;
3151                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3152                 }
3153
3154         if (context->ioprio_set)
3155                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3156                         *exit_status = EXIT_IOPRIO;
3157                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3158                 }
3159
3160         if (context->timer_slack_nsec != NSEC_INFINITY)
3161                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3162                         *exit_status = EXIT_TIMERSLACK;
3163                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3164                 }
3165
3166         if (context->personality != PERSONALITY_INVALID) {
3167                 r = safe_personality(context->personality);
3168                 if (r < 0) {
3169                         *exit_status = EXIT_PERSONALITY;
3170                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3171                 }
3172         }
3173
3174         if (context->utmp_id)
3175                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3176                                       context->tty_path,
3177                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3178                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3179                                       USER_PROCESS,
3180                                       username);
3181
3182         if (uid_is_valid(uid)) {
3183                 r = chown_terminal(STDIN_FILENO, uid);
3184                 if (r < 0) {
3185                         *exit_status = EXIT_STDIN;
3186                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3187                 }
3188         }
3189
3190         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3191          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3192          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3193          * touch a single hierarchy too. */
3194         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3195                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3196                 if (r < 0) {
3197                         *exit_status = EXIT_CGROUP;
3198                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3199                 }
3200         }
3201
3202         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3203                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3204                 if (r < 0)
3205                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3206         }
3207
3208         r = build_environment(
3209                         unit,
3210                         context,
3211                         params,
3212                         n_fds,
3213                         home,
3214                         username,
3215                         shell,
3216                         journal_stream_dev,
3217                         journal_stream_ino,
3218                         &our_env);
3219         if (r < 0) {
3220                 *exit_status = EXIT_MEMORY;
3221                 return log_oom();
3222         }
3223
3224         r = build_pass_environment(context, &pass_env);
3225         if (r < 0) {
3226                 *exit_status = EXIT_MEMORY;
3227                 return log_oom();
3228         }
3229
3230         accum_env = strv_env_merge(5,
3231                                    params->environment,
3232                                    our_env,
3233                                    pass_env,
3234                                    context->environment,
3235                                    files_env,
3236                                    NULL);
3237         if (!accum_env) {
3238                 *exit_status = EXIT_MEMORY;
3239                 return log_oom();
3240         }
3241         accum_env = strv_env_clean(accum_env);
3242
3243         (void) umask(context->umask);
3244
3245         r = setup_keyring(unit, context, params, uid, gid);
3246         if (r < 0) {
3247                 *exit_status = EXIT_KEYRING;
3248                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3249         }
3250
3251         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3252         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3253
3254         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3255         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3256
3257         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3258         if (needs_ambient_hack)
3259                 needs_setuid = false;
3260         else
3261                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3262
3263         if (needs_sandboxing) {
3264                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3265                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3266                  * impacting our own code paths. */
3267
3268 #if HAVE_SELINUX
3269                 use_selinux = mac_selinux_use();
3270 #endif
3271 #if ENABLE_SMACK
3272                 use_smack = mac_smack_use();
3273 #endif
3274 #if HAVE_APPARMOR
3275                 use_apparmor = mac_apparmor_use();
3276 #endif
3277         }
3278
3279         if (needs_sandboxing) {
3280                 int which_failed;
3281
3282                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3283                  * is set here. (See below.) */
3284
3285                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3286                 if (r < 0) {
3287                         *exit_status = EXIT_LIMITS;
3288                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3289                 }
3290         }
3291
3292         if (needs_setuid) {
3293
3294                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3295                  * wins here. (See above.) */
3296
3297                 if (context->pam_name && username) {
3298                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3299                         if (r < 0) {
3300                                 *exit_status = EXIT_PAM;
3301                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3302                         }
3303                 }
3304         }
3305
3306         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3307
3308                 if (ns_type_supported(NAMESPACE_NET)) {
3309                         r = setup_netns(runtime->netns_storage_socket);
3310                         if (r < 0) {
3311                                 *exit_status = EXIT_NETWORK;
3312                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3313                         }
3314                 } else if (context->network_namespace_path) {
3315                         *exit_status = EXIT_NETWORK;
3316                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3317                 } else
3318                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3319         }
3320
3321         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3322         if (needs_mount_namespace) {
3323                 _cleanup_free_ char *error_path = NULL;
3324
3325                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3326                 if (r < 0) {
3327                         *exit_status = EXIT_NAMESPACE;
3328                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3329                                                     error_path ? ": " : "", strempty(error_path));
3330                 }
3331         }
3332
3333         if (context->protect_hostname) {
3334                 if (ns_type_supported(NAMESPACE_UTS)) {
3335                         if (unshare(CLONE_NEWUTS) < 0) {
3336                                 *exit_status = EXIT_NAMESPACE;
3337                                 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3338                         }
3339                 } else
3340                         log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3341 #if HAVE_SECCOMP
3342                 r = seccomp_protect_hostname();
3343                 if (r < 0) {
3344                         *exit_status = EXIT_SECCOMP;
3345                         return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3346                 }
3347 #endif
3348         }
3349
3350         /* Drop groups as early as possible */
3351         if (needs_setuid) {
3352                 r = enforce_groups(gid, supplementary_gids, ngids);
3353                 if (r < 0) {
3354                         *exit_status = EXIT_GROUP;
3355                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3356                 }
3357         }
3358
3359         if (needs_sandboxing) {
3360 #if HAVE_SELINUX
3361                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3362                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3363                         if (r < 0) {
3364                                 *exit_status = EXIT_SELINUX_CONTEXT;
3365                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3366                         }
3367                 }
3368 #endif
3369
3370                 if (context->private_users) {
3371                         r = setup_private_users(uid, gid);
3372                         if (r < 0) {
3373                                 *exit_status = EXIT_USER;
3374                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3375                         }
3376                 }
3377         }
3378
3379         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3380          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3381          * however if we have it as we want to keep it open until the final execve(). */
3382
3383         if (params->exec_fd >= 0) {
3384                 exec_fd = params->exec_fd;
3385
3386                 if (exec_fd < 3 + (int) n_fds) {
3387                         int moved_fd;
3388
3389                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3390                          * process we are about to execute. */
3391
3392                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3393                         if (moved_fd < 0) {
3394                                 *exit_status = EXIT_FDS;
3395                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3396                         }
3397
3398                         safe_close(exec_fd);
3399                         exec_fd = moved_fd;
3400                 } else {
3401                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3402                         r = fd_cloexec(exec_fd, true);
3403                         if (r < 0) {
3404                                 *exit_status = EXIT_FDS;
3405                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3406                         }
3407                 }
3408
3409                 fds_with_exec_fd = newa(int, n_fds + 1);
3410                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3411                 fds_with_exec_fd[n_fds] = exec_fd;
3412                 n_fds_with_exec_fd = n_fds + 1;
3413         } else {
3414                 fds_with_exec_fd = fds;
3415                 n_fds_with_exec_fd = n_fds;
3416         }
3417
3418         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3419         if (r >= 0)
3420                 r = shift_fds(fds, n_fds);
3421         if (r >= 0)
3422                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3423         if (r < 0) {
3424                 *exit_status = EXIT_FDS;
3425                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3426         }
3427
3428         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3429          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3430          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3431          * came this far. */
3432
3433         secure_bits = context->secure_bits;
3434
3435         if (needs_sandboxing) {
3436                 uint64_t bset;
3437
3438                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3439                  * requested. (Note this is placed after the general resource limit initialization, see
3440                  * above, in order to take precedence.) */
3441                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3442                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3443                                 *exit_status = EXIT_LIMITS;
3444                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3445                         }
3446                 }
3447
3448 #if ENABLE_SMACK
3449                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3450                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3451                 if (use_smack) {
3452                         r = setup_smack(context, command);
3453                         if (r < 0) {
3454                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3455                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3456                         }
3457                 }
3458 #endif
3459
3460                 bset = context->capability_bounding_set;
3461                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3462                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3463                  * instead of us doing that */
3464                 if (needs_ambient_hack)
3465                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3466                                 (UINT64_C(1) << CAP_SETUID) |
3467                                 (UINT64_C(1) << CAP_SETGID);
3468
3469                 if (!cap_test_all(bset)) {
3470                         r = capability_bounding_set_drop(bset, false);
3471                         if (r < 0) {
3472                                 *exit_status = EXIT_CAPABILITIES;
3473                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3474                         }
3475                 }
3476
3477                 /* This is done before enforce_user, but ambient set
3478                  * does not survive over setresuid() if keep_caps is not set. */
3479                 if (!needs_ambient_hack &&
3480                     context->capability_ambient_set != 0) {
3481                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3482                         if (r < 0) {
3483                                 *exit_status = EXIT_CAPABILITIES;
3484                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3485                         }
3486                 }
3487         }
3488
3489         if (needs_setuid) {
3490                 if (uid_is_valid(uid)) {
3491                         r = enforce_user(context, uid);
3492                         if (r < 0) {
3493                                 *exit_status = EXIT_USER;
3494                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3495                         }
3496
3497                         if (!needs_ambient_hack &&
3498                             context->capability_ambient_set != 0) {
3499
3500                                 /* Fix the ambient capabilities after user change. */
3501                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3502                                 if (r < 0) {
3503                                         *exit_status = EXIT_CAPABILITIES;
3504                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3505                                 }
3506
3507                                 /* If we were asked to change user and ambient capabilities
3508                                  * were requested, we had to add keep-caps to the securebits
3509                                  * so that we would maintain the inherited capability set
3510                                  * through the setresuid(). Make sure that the bit is added
3511                                  * also to the context secure_bits so that we don't try to
3512                                  * drop the bit away next. */
3513
3514                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3515                         }
3516                 }
3517         }
3518
3519         /* Apply working directory here, because the working directory might be on NFS and only the user running
3520          * this service might have the correct privilege to change to the working directory */
3521         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3522         if (r < 0)
3523                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3524
3525         if (needs_sandboxing) {
3526                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3527                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3528                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3529                  * are restricted. */
3530
3531 #if HAVE_SELINUX
3532                 if (use_selinux) {
3533                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3534
3535                         if (exec_context) {
3536                                 r = setexeccon(exec_context);
3537                                 if (r < 0) {
3538                                         *exit_status = EXIT_SELINUX_CONTEXT;
3539                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3540                                 }
3541                         }
3542                 }
3543 #endif
3544
3545 #if HAVE_APPARMOR
3546                 if (use_apparmor && context->apparmor_profile) {
3547                         r = aa_change_onexec(context->apparmor_profile);
3548                         if (r < 0 && !context->apparmor_profile_ignore) {
3549                                 *exit_status = EXIT_APPARMOR_PROFILE;
3550                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3551                         }
3552                 }
3553 #endif
3554
3555                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3556                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3557                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3558                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3559                                 *exit_status = EXIT_SECUREBITS;
3560                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3561                         }
3562
3563                 if (context_has_no_new_privileges(context))
3564                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3565                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3566                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3567                         }
3568
3569 #if HAVE_SECCOMP
3570                 r = apply_address_families(unit, context);
3571                 if (r < 0) {
3572                         *exit_status = EXIT_ADDRESS_FAMILIES;
3573                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3574                 }
3575
3576                 r = apply_memory_deny_write_execute(unit, context);
3577                 if (r < 0) {
3578                         *exit_status = EXIT_SECCOMP;
3579                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3580                 }
3581
3582                 r = apply_restrict_realtime(unit, context);
3583                 if (r < 0) {
3584                         *exit_status = EXIT_SECCOMP;
3585                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3586                 }
3587
3588                 r = apply_restrict_suid_sgid(unit, context);
3589                 if (r < 0) {
3590                         *exit_status = EXIT_SECCOMP;
3591                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3592                 }
3593
3594                 r = apply_restrict_namespaces(unit, context);
3595                 if (r < 0) {
3596                         *exit_status = EXIT_SECCOMP;
3597                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3598                 }
3599
3600                 r = apply_protect_sysctl(unit, context);
3601                 if (r < 0) {
3602                         *exit_status = EXIT_SECCOMP;
3603                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3604                 }
3605
3606                 r = apply_protect_kernel_modules(unit, context);
3607                 if (r < 0) {
3608                         *exit_status = EXIT_SECCOMP;
3609                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3610                 }
3611
3612                 r = apply_private_devices(unit, context);
3613                 if (r < 0) {
3614                         *exit_status = EXIT_SECCOMP;
3615                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3616                 }
3617
3618                 r = apply_syscall_archs(unit, context);
3619                 if (r < 0) {
3620                         *exit_status = EXIT_SECCOMP;
3621                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3622                 }
3623
3624                 r = apply_lock_personality(unit, context);
3625                 if (r < 0) {
3626                         *exit_status = EXIT_SECCOMP;
3627                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3628                 }
3629
3630                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3631                  * by the filter as little as possible. */
3632                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3633                 if (r < 0) {
3634                         *exit_status = EXIT_SECCOMP;
3635                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3636                 }
3637 #endif
3638         }
3639
3640         if (!strv_isempty(context->unset_environment)) {
3641                 char **ee = NULL;
3642
3643                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3644                 if (!ee) {
3645                         *exit_status = EXIT_MEMORY;
3646                         return log_oom();
3647                 }
3648
3649                 strv_free_and_replace(accum_env, ee);
3650         }
3651
3652         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3653                 replaced_argv = replace_env_argv(command->argv, accum_env);
3654                 if (!replaced_argv) {
3655                         *exit_status = EXIT_MEMORY;
3656                         return log_oom();
3657                 }
3658                 final_argv = replaced_argv;
3659         } else
3660                 final_argv = command->argv;
3661
3662         if (DEBUG_LOGGING) {
3663                 _cleanup_free_ char *line;
3664
3665                 line = exec_command_line(final_argv);
3666                 if (line)
3667                         log_struct(LOG_DEBUG,
3668                                    "EXECUTABLE=%s", command->path,
3669                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3670                                    LOG_UNIT_ID(unit),
3671                                    LOG_UNIT_INVOCATION_ID(unit));
3672         }
3673
3674         if (exec_fd >= 0) {
3675                 uint8_t hot = 1;
3676
3677                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3678                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3679
3680                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3681                         *exit_status = EXIT_EXEC;
3682                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3683                 }
3684         }
3685
3686         execve(command->path, final_argv, accum_env);
3687         r = -errno;
3688
3689         if (exec_fd >= 0) {
3690                 uint8_t hot = 0;
3691
3692                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3693                  * that POLLHUP on it no longer means execve() succeeded. */
3694
3695                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3696                         *exit_status = EXIT_EXEC;
3697                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3698                 }
3699         }
3700
3701         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3702                 log_struct_errno(LOG_INFO, r,
3703                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3704                                  LOG_UNIT_ID(unit),
3705                                  LOG_UNIT_INVOCATION_ID(unit),
3706                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3707                                                   command->path),
3708                                  "EXECUTABLE=%s", command->path);
3709                 return 0;
3710         }
3711
3712         *exit_status = EXIT_EXEC;
3713         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3714 }
3715
/* Prototypes of the two static helpers that exec_spawn() below calls to load the environment files and the named file descriptors. */
3716 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3717 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3718
3719 int exec_spawn(Unit *unit,
3720                ExecCommand *command,
3721                const ExecContext *context,
3722                const ExecParameters *params,
3723                ExecRuntime *runtime,
3724                DynamicCreds *dcreds,
3725                pid_t *ret) {
             /* Forks off a child that runs exec_child() for 'command' and hands the child's PID back through
              * *ret. In the parent the function returns 0 after a successful fork(); if a preparation step or
              * fork() itself fails, it returns -EINVAL or the result of the logging helper used at that step.
              * The forked child never returns from this function: it always ends in _exit(). */
3726
3727         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3728         _cleanup_free_ char *subcgroup_path = NULL;
3729         _cleanup_strv_free_ char **files_env = NULL;
3730         size_t n_storage_fds = 0, n_socket_fds = 0;
3731         _cleanup_free_ char *line = NULL;
3732         pid_t pid;
3733
3734         assert(unit);
3735         assert(command);
3736         assert(context);
3737         assert(ret);
3738         assert(params);
             /* A NULL fd array is only acceptable if neither socket nor storage fds are announced. */
3739         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3740
             /* If any of stdin/stdout/stderr is to be connected to a socket, exactly one socket fd must have
              * been passed. It is handed to the child as socket_fd, while fds stays NULL and both fd counts
              * stay 0. Otherwise socket_fd is -1 and the caller's fd array and counts are passed on unchanged. */
3741         if (context->std_input == EXEC_INPUT_SOCKET ||
3742             context->std_output == EXEC_OUTPUT_SOCKET ||
3743             context->std_error == EXEC_OUTPUT_SOCKET) {
3744
3745                 if (params->n_socket_fds > 1) {
3746                         log_unit_error(unit, "Got more than one socket.");
3747                         return -EINVAL;
3748                 }
3749
3750                 if (params->n_socket_fds == 0) {
3751                         log_unit_error(unit, "Got no socket.");
3752                         return -EINVAL;
3753                 }
3754
3755                 socket_fd = params->fds[0];
3756         } else {
3757                 socket_fd = -1;
3758                 fds = params->fds;
3759                 n_socket_fds = params->n_socket_fds;
3760                 n_storage_fds = params->n_storage_fds;
3761         }
3762
             /* Look up the named file descriptors; named_iofds[] (preset to -1 above) is passed to exec_child()
              * further down. */
3763         r = exec_context_named_iofds(context, params, named_iofds);
3764         if (r < 0)
3765                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3766
             /* files_env is owned by this function (freed automatically on return) and passed to exec_child(). */
3767         r = exec_context_load_environment(unit, context, &files_env);
3768         if (r < 0)
3769                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3770
             /* The command line string is only used for the debug message below, but failing to build it is
              * still reported as out-of-memory and aborts the spawn. */
3771         line = exec_command_line(command->argv);
3772         if (!line)
3773                 return log_oom();
3774
3775         log_struct(LOG_DEBUG,
3776                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3777                    "EXECUTABLE=%s", command->path,
3778                    LOG_UNIT_ID(unit),
3779                    LOG_UNIT_INVOCATION_ID(unit));
3780
             /* If a child cgroup is in use it is created here, before fork(); the parent attaches the forked
              * process to subcgroup_path further down. */
3781         if (params->cgroup_path) {
3782                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3783                 if (r < 0)
3784                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3785                 if (r > 0) { /* We are using a child cgroup */
3786                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3787                         if (r < 0)
3788                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3789                 }
3790         }
3791
3792         pid = fork();
3793         if (pid < 0)
3794                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3795
3796         if (pid == 0) {
                     /* Child. exec_child() only returns if execve() did not replace the process image: with a
                      * negative value and the failed step stored in exit_status on error, or with 0 when a
                      * missing executable is to be ignored. Only the error case is logged here. */
3797                 int exit_status = EXIT_SUCCESS;
3798
3799                 r = exec_child(unit,
3800                                command,
3801                                context,
3802                                params,
3803                                runtime,
3804                                dcreds,
3805                                socket_fd,
3806                                named_iofds,
3807                                fds,
3808                                n_socket_fds,
3809                                n_storage_fds,
3810                                files_env,
3811                                unit->manager->user_lookup_fds[1],
3812                                &exit_status);
3813
3814                 if (r < 0)
3815                         log_struct_errno(LOG_ERR, r,
3816                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3817                                          LOG_UNIT_ID(unit),
3818                                          LOG_UNIT_INVOCATION_ID(unit),
3819                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3820                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3821                                                           command->path),
3822                                          "EXECUTABLE=%s", command->path);
3823
                     /* The child must never continue into the parent's code path below. */
3824                 _exit(exit_status);
3825         }
3826
             /* Parent only from here on. */
3827         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3828
3829         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3830          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3831          * process will be killed too). */
3832         if (subcgroup_path)
3833                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3834
3835         exec_status_start(&command->exec_status, pid);
3836
3837         *ret = pid;
3838         return 0;
3839 }
3840
3841 void exec_context_init(ExecContext *c) {
3842         ExecDirectoryType i;
3843
3844         assert(c);
3845
3846         c->umask = 0022;
3847         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3848         c->cpu_sched_policy = SCHED_OTHER;
3849         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3850         c->syslog_level_prefix = true;
3851         c->ignore_sigpipe = true;
3852         c->timer_slack_nsec = NSEC_INFINITY;
3853         c->personality = PERSONALITY_INVALID;
3854         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3855                 c->directories[i].mode = 0755;
3856         c->capability_bounding_set = CAP_ALL;
3857         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3858         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3859         c->log_level_max = -1;
3860 }
3861
3862 void exec_context_done(ExecContext *c) {
3863         ExecDirectoryType i;
3864         size_t l;
3865
3866         assert(c);
3867
3868         c->environment = strv_free(c->environment);
3869         c->environment_files = strv_free(c->environment_files);
3870         c->pass_environment = strv_free(c->pass_environment);
3871         c->unset_environment = strv_free(c->unset_environment);
3872
3873         rlimit_free_all(c->rlimit);
3874
3875         for (l = 0; l < 3; l++) {
3876                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3877                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3878         }
3879
3880         c->working_directory = mfree(c->working_directory);
3881         c->root_directory = mfree(c->root_directory);
3882         c->root_image = mfree(c->root_image);
3883         c->tty_path = mfree(c->tty_path);
3884         c->syslog_identifier = mfree(c->syslog_identifier);
3885         c->user = mfree(c->user);
3886         c->group = mfree(c->group);
3887
3888         c->supplementary_groups = strv_free(c->supplementary_groups);
3889
3890         c->pam_name = mfree(c->pam_name);
3891
3892         c->read_only_paths = strv_free(c->read_only_paths);
3893         c->read_write_paths = strv_free(c->read_write_paths);
3894         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3895
3896         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3897         c->bind_mounts = NULL;
3898         c->n_bind_mounts = 0;
3899         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3900         c->temporary_filesystems = NULL;
3901         c->n_temporary_filesystems = 0;
3902
3903         c->cpuset = cpu_set_mfree(c->cpuset);
3904
3905         c->utmp_id = mfree(c->utmp_id);
3906         c->selinux_context = mfree(c->selinux_context);
3907         c->apparmor_profile = mfree(c->apparmor_profile);
3908         c->smack_process_label = mfree(c->smack_process_label);
3909
3910         c->syscall_filter = hashmap_free(c->syscall_filter);
3911         c->syscall_archs = set_free(c->syscall_archs);
3912         c->address_families = set_free(c->address_families);
3913
3914         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3915                 c->directories[i].paths = strv_free(c->directories[i].paths);
3916
3917         c->log_level_max = -1;
3918
3919         exec_context_free_log_extra_fields(c);
3920
3921         c->log_rate_limit_interval_usec = 0;
3922         c->log_rate_limit_burst = 0;
3923
3924         c->stdin_data = mfree(c->stdin_data);
3925         c->stdin_data_size = 0;
3926
3927         c->network_namespace_path = mfree(c->network_namespace_path);
3928 }
3929
3930 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3931         char **i;
3932
3933         assert(c);
3934
3935         if (!runtime_prefix)
3936                 return 0;
3937
3938         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3939                 _cleanup_free_ char *p;
3940
3941                 p = path_join(runtime_prefix, *i);
3942                 if (!p)
3943                         return -ENOMEM;
3944
3945                 /* We execute this synchronously, since we need to be sure this is gone when we start the
3946                  * service next. */
3947                 (void) rm_rf(p, REMOVE_ROOT);
3948         }
3949
3950         return 0;
3951 }
3952
3953 static void exec_command_done(ExecCommand *c) {
3954         assert(c);
3955
3956         c->path = mfree(c->path);
3957         c->argv = strv_free(c->argv);
3958 }
3959
3960 void exec_command_done_array(ExecCommand *c, size_t n) {
3961         size_t i;
3962
3963         for (i = 0; i < n; i++)
3964                 exec_command_done(c+i);
3965 }
3966
3967 ExecCommand* exec_command_free_list(ExecCommand *c) {
3968         ExecCommand *i;
3969
3970         while ((i = c)) {
3971                 LIST_REMOVE(command, c, i);
3972                 exec_command_done(i);
3973                 free(i);
3974         }
3975
3976         return NULL;
3977 }
3978
3979 void exec_command_free_array(ExecCommand **c, size_t n) {
3980         size_t i;
3981
3982         for (i = 0; i < n; i++)
3983                 c[i] = exec_command_free_list(c[i]);
3984 }
3985
3986 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3987         size_t i;
3988
3989         for (i = 0; i < n; i++)
3990                 exec_status_reset(&c[i].exec_status);
3991 }
3992
3993 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3994         size_t i;
3995
3996         for (i = 0; i < n; i++) {
3997                 ExecCommand *z;
3998
3999                 LIST_FOREACH(command, z, c[i])
4000                         exec_status_reset(&z->exec_status);
4001         }
4002 }
4003
/* Context handed to invalid_env() as its userdata pointer while environment files are cleaned up. */
typedef struct InvalidEnvInfo {
        const Unit *unit;       /* unit on whose behalf the message is logged */
        const char *path;       /* path of the environment file the assignment was read from */
} InvalidEnvInfo;
4008
4009 static void invalid_env(const char *p, void *userdata) {
4010         InvalidEnvInfo *info = userdata;
4011
4012         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4013 }
4014
4015 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4016         assert(c);
4017
4018         switch (fd_index) {
4019
4020         case STDIN_FILENO:
4021                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4022                         return NULL;
4023
4024                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4025
4026         case STDOUT_FILENO:
4027                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4028                         return NULL;
4029
4030                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4031
4032         case STDERR_FILENO:
4033                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4034                         return NULL;
4035
4036                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4037
4038         default:
4039                 return NULL;
4040         }
4041 }
4042
4043 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]) {
4044         size_t i, targets;
4045         const char* stdio_fdname[3];
4046         size_t n_fds;
4047
4048         assert(c);
4049         assert(p);
4050
4051         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4052                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4053                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4054
4055         for (i = 0; i < 3; i++)
4056                 stdio_fdname[i] = exec_context_fdname(c, i);
4057
4058         n_fds = p->n_storage_fds + p->n_socket_fds;
4059
4060         for (i = 0; i < n_fds  && targets > 0; i++)
4061                 if (named_iofds[STDIN_FILENO] < 0 &&
4062                     c->std_input == EXEC_INPUT_NAMED_FD &&
4063                     stdio_fdname[STDIN_FILENO] &&
4064                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4065
4066                         named_iofds[STDIN_FILENO] = p->fds[i];
4067                         targets--;
4068
4069                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4070                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4071                            stdio_fdname[STDOUT_FILENO] &&
4072                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4073
4074                         named_iofds[STDOUT_FILENO] = p->fds[i];
4075                         targets--;
4076
4077                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4078                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4079                            stdio_fdname[STDERR_FILENO] &&
4080                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4081
4082                         named_iofds[STDERR_FILENO] = p->fds[i];
4083                         targets--;
4084                 }
4085
4086         return targets == 0 ? 0 : -ENOENT;
4087 }
4088
4089 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4090         char **i, **r = NULL;
4091
4092         assert(c);
4093         assert(l);
4094
4095         STRV_FOREACH(i, c->environment_files) {
4096                 char *fn;
4097                 int k;
4098                 unsigned n;
4099                 bool ignore = false;
4100                 char **p;
4101                 _cleanup_globfree_ glob_t pglob = {};
4102
4103                 fn = *i;
4104
4105                 if (fn[0] == '-') {
4106                         ignore = true;
4107                         fn++;
4108                 }
4109
4110                 if (!path_is_absolute(fn)) {
4111                         if (ignore)
4112                                 continue;
4113
4114                         strv_free(r);
4115                         return -EINVAL;
4116                 }
4117
4118                 /* Filename supports globbing, take all matching files */
4119                 k = safe_glob(fn, 0, &pglob);
4120                 if (k < 0) {
4121                         if (ignore)
4122                                 continue;
4123
4124                         strv_free(r);
4125                         return k;
4126                 }
4127
4128                 /* When we don't match anything, -ENOENT should be returned */
4129                 assert(pglob.gl_pathc > 0);
4130
4131                 for (n = 0; n < pglob.gl_pathc; n++) {
4132                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4133                         if (k < 0) {
4134                                 if (ignore)
4135                                         continue;
4136
4137                                 strv_free(r);
4138                                 return k;
4139                         }
4140                         /* Log invalid environment variables with filename */
4141                         if (p) {
4142                                 InvalidEnvInfo info = {
4143                                         .unit = unit,
4144                                         .path = pglob.gl_pathv[n]
4145                                 };
4146
4147                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4148                         }
4149
4150                         if (!r)
4151                                 r = p;
4152                         else {
4153                                 char **m;
4154
4155                                 m = strv_env_merge(2, r, p);
4156                                 strv_free(r);
4157                                 strv_free(p);
4158                                 if (!m)
4159                                         return -ENOMEM;
4160
4161                                 r = m;
4162                         }
4163                 }
4164         }
4165
4166         *l = r;
4167
4168         return 0;
4169 }
4170
4171 static bool tty_may_match_dev_console(const char *tty) {
4172         _cleanup_free_ char *resolved = NULL;
4173
4174         if (!tty)
4175                 return true;
4176
4177         tty = skip_dev_prefix(tty);
4178
4179         /* trivial identity? */
4180         if (streq(tty, "console"))
4181                 return true;
4182
4183         if (resolve_dev_console(&resolved) < 0)
4184                 return true; /* if we could not resolve, assume it may */
4185
4186         /* "tty0" means the active VC, so it may be the same sometimes */
4187         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4188 }
4189
4190 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4191         assert(ec);
4192
4193         return ec->tty_reset ||
4194                 ec->tty_vhangup ||
4195                 ec->tty_vt_disallocate ||
4196                 is_terminal_input(ec->std_input) ||
4197                 is_terminal_output(ec->std_output) ||
4198                 is_terminal_output(ec->std_error);
4199 }
4200
4201 bool exec_context_may_touch_console(const ExecContext *ec) {
4202
4203         return exec_context_may_touch_tty(ec) &&
4204                tty_may_match_dev_console(exec_context_tty_path(ec));
4205 }
4206
4207 static void strv_fprintf(FILE *f, char **l) {
4208         char **g;
4209
4210         assert(f);
4211
4212         STRV_FOREACH(g, l)
4213                 fprintf(f, " %s", *g);
4214 }
4215
4216 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4217         ExecDirectoryType dt;
4218         char **e, **d;
4219         unsigned i;
4220         int r;
4221
4222         assert(c);
4223         assert(f);
4224
4225         prefix = strempty(prefix);
4226
4227         fprintf(f,
4228                 "%sUMask: %04o\n"
4229                 "%sWorkingDirectory: %s\n"
4230                 "%sRootDirectory: %s\n"
4231                 "%sNonBlocking: %s\n"
4232                 "%sPrivateTmp: %s\n"
4233                 "%sPrivateDevices: %s\n"
4234                 "%sProtectKernelTunables: %s\n"
4235                 "%sProtectKernelModules: %s\n"
4236                 "%sProtectControlGroups: %s\n"
4237                 "%sPrivateNetwork: %s\n"
4238                 "%sPrivateUsers: %s\n"
4239                 "%sProtectHome: %s\n"
4240                 "%sProtectSystem: %s\n"
4241                 "%sMountAPIVFS: %s\n"
4242                 "%sIgnoreSIGPIPE: %s\n"
4243                 "%sMemoryDenyWriteExecute: %s\n"
4244                 "%sRestrictRealtime: %s\n"
4245                 "%sRestrictSUIDSGID: %s\n"
4246                 "%sKeyringMode: %s\n"
4247                 "%sProtectHostname: %s\n",
4248                 prefix, c->umask,
4249                 prefix, c->working_directory ? c->working_directory : "/",
4250                 prefix, c->root_directory ? c->root_directory : "/",
4251                 prefix, yes_no(c->non_blocking),
4252                 prefix, yes_no(c->private_tmp),
4253                 prefix, yes_no(c->private_devices),
4254                 prefix, yes_no(c->protect_kernel_tunables),
4255                 prefix, yes_no(c->protect_kernel_modules),
4256                 prefix, yes_no(c->protect_control_groups),
4257                 prefix, yes_no(c->private_network),
4258                 prefix, yes_no(c->private_users),
4259                 prefix, protect_home_to_string(c->protect_home),
4260                 prefix, protect_system_to_string(c->protect_system),
4261                 prefix, yes_no(c->mount_apivfs),
4262                 prefix, yes_no(c->ignore_sigpipe),
4263                 prefix, yes_no(c->memory_deny_write_execute),
4264                 prefix, yes_no(c->restrict_realtime),
4265                 prefix, yes_no(c->restrict_suid_sgid),
4266                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4267                 prefix, yes_no(c->protect_hostname));
4268
4269         if (c->root_image)
4270                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4271
4272         STRV_FOREACH(e, c->environment)
4273                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4274
4275         STRV_FOREACH(e, c->environment_files)
4276                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4277
4278         STRV_FOREACH(e, c->pass_environment)
4279                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4280
4281         STRV_FOREACH(e, c->unset_environment)
4282                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4283
4284         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4285
4286         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4287                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4288
4289                 STRV_FOREACH(d, c->directories[dt].paths)
4290                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4291         }
4292
4293         if (c->nice_set)
4294                 fprintf(f,
4295                         "%sNice: %i\n",
4296                         prefix, c->nice);
4297
4298         if (c->oom_score_adjust_set)
4299                 fprintf(f,
4300                         "%sOOMScoreAdjust: %i\n",
4301                         prefix, c->oom_score_adjust);
4302
4303         for (i = 0; i < RLIM_NLIMITS; i++)
4304                 if (c->rlimit[i]) {
4305                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4306                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4307                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4308                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4309                 }
4310
4311         if (c->ioprio_set) {
4312                 _cleanup_free_ char *class_str = NULL;
4313
4314                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4315                 if (r >= 0)
4316                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4317
4318                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4319         }
4320
4321         if (c->cpu_sched_set) {
4322                 _cleanup_free_ char *policy_str = NULL;
4323
4324                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4325                 if (r >= 0)
4326                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4327
4328                 fprintf(f,
4329                         "%sCPUSchedulingPriority: %i\n"
4330                         "%sCPUSchedulingResetOnFork: %s\n",
4331                         prefix, c->cpu_sched_priority,
4332                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4333         }
4334
4335         if (c->cpuset) {
4336                 fprintf(f, "%sCPUAffinity:", prefix);
4337                 for (i = 0; i < c->cpuset_ncpus; i++)
4338                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4339                                 fprintf(f, " %u", i);
4340                 fputs("\n", f);
4341         }
4342
4343         if (c->timer_slack_nsec != NSEC_INFINITY)
4344                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4345
4346         fprintf(f,
4347                 "%sStandardInput: %s\n"
4348                 "%sStandardOutput: %s\n"
4349                 "%sStandardError: %s\n",
4350                 prefix, exec_input_to_string(c->std_input),
4351                 prefix, exec_output_to_string(c->std_output),
4352                 prefix, exec_output_to_string(c->std_error));
4353
4354         if (c->std_input == EXEC_INPUT_NAMED_FD)
4355                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4356         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4357                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4358         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4359                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4360
4361         if (c->std_input == EXEC_INPUT_FILE)
4362                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4363         if (c->std_output == EXEC_OUTPUT_FILE)
4364                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4365         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4366                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4367         if (c->std_error == EXEC_OUTPUT_FILE)
4368                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4369         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4370                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4371
4372         if (c->tty_path)
4373                 fprintf(f,
4374                         "%sTTYPath: %s\n"
4375                         "%sTTYReset: %s\n"
4376                         "%sTTYVHangup: %s\n"
4377                         "%sTTYVTDisallocate: %s\n",
4378                         prefix, c->tty_path,
4379                         prefix, yes_no(c->tty_reset),
4380                         prefix, yes_no(c->tty_vhangup),
4381                         prefix, yes_no(c->tty_vt_disallocate));
4382
4383         if (IN_SET(c->std_output,
4384                    EXEC_OUTPUT_SYSLOG,
4385                    EXEC_OUTPUT_KMSG,
4386                    EXEC_OUTPUT_JOURNAL,
4387                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4388                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4389                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4390             IN_SET(c->std_error,
4391                    EXEC_OUTPUT_SYSLOG,
4392                    EXEC_OUTPUT_KMSG,
4393                    EXEC_OUTPUT_JOURNAL,
4394                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4395                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4396                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4397
4398                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4399
4400                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4401                 if (r >= 0)
4402                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4403
4404                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4405                 if (r >= 0)
4406                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4407         }
4408
4409         if (c->log_level_max >= 0) {
4410                 _cleanup_free_ char *t = NULL;
4411
4412                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4413
4414                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4415         }
4416
4417         if (c->log_rate_limit_interval_usec > 0) {
4418                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4419
4420                 fprintf(f,
4421                         "%sLogRateLimitIntervalSec: %s\n",
4422                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4423         }
4424
4425         if (c->log_rate_limit_burst > 0)
4426                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4427
4428         if (c->n_log_extra_fields > 0) {
4429                 size_t j;
4430
4431                 for (j = 0; j < c->n_log_extra_fields; j++) {
4432                         fprintf(f, "%sLogExtraFields: ", prefix);
4433                         fwrite(c->log_extra_fields[j].iov_base,
4434                                1, c->log_extra_fields[j].iov_len,
4435                                f);
4436                         fputc('\n', f);
4437                 }
4438         }
4439
4440         if (c->secure_bits) {
4441                 _cleanup_free_ char *str = NULL;
4442
4443                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4444                 if (r >= 0)
4445                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4446         }
4447
4448         if (c->capability_bounding_set != CAP_ALL) {
4449                 _cleanup_free_ char *str = NULL;
4450
4451                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4452                 if (r >= 0)
4453                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4454         }
4455
4456         if (c->capability_ambient_set != 0) {
4457                 _cleanup_free_ char *str = NULL;
4458
4459                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4460                 if (r >= 0)
4461                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4462         }
4463
4464         if (c->user)
4465                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4466         if (c->group)
4467                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4468
4469         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4470
4471         if (!strv_isempty(c->supplementary_groups)) {
4472                 fprintf(f, "%sSupplementaryGroups:", prefix);
4473                 strv_fprintf(f, c->supplementary_groups);
4474                 fputs("\n", f);
4475         }
4476
4477         if (c->pam_name)
4478                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4479
4480         if (!strv_isempty(c->read_write_paths)) {
4481                 fprintf(f, "%sReadWritePaths:", prefix);
4482                 strv_fprintf(f, c->read_write_paths);
4483                 fputs("\n", f);
4484         }
4485
4486         if (!strv_isempty(c->read_only_paths)) {
4487                 fprintf(f, "%sReadOnlyPaths:", prefix);
4488                 strv_fprintf(f, c->read_only_paths);
4489                 fputs("\n", f);
4490         }
4491
4492         if (!strv_isempty(c->inaccessible_paths)) {
4493                 fprintf(f, "%sInaccessiblePaths:", prefix);
4494                 strv_fprintf(f, c->inaccessible_paths);
4495                 fputs("\n", f);
4496         }
4497
4498         if (c->n_bind_mounts > 0)
4499                 for (i = 0; i < c->n_bind_mounts; i++)
4500                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4501                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4502                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4503                                 c->bind_mounts[i].source,
4504                                 c->bind_mounts[i].destination,
4505                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4506
4507         if (c->n_temporary_filesystems > 0)
4508                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4509                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4510
4511                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4512                                 t->path,
4513                                 isempty(t->options) ? "" : ":",
4514                                 strempty(t->options));
4515                 }
4516
4517         if (c->utmp_id)
4518                 fprintf(f,
4519                         "%sUtmpIdentifier: %s\n",
4520                         prefix, c->utmp_id);
4521
4522         if (c->selinux_context)
4523                 fprintf(f,
4524                         "%sSELinuxContext: %s%s\n",
4525                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4526
4527         if (c->apparmor_profile)
4528                 fprintf(f,
4529                         "%sAppArmorProfile: %s%s\n",
4530                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4531
4532         if (c->smack_process_label)
4533                 fprintf(f,
4534                         "%sSmackProcessLabel: %s%s\n",
4535                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4536
4537         if (c->personality != PERSONALITY_INVALID)
4538                 fprintf(f,
4539                         "%sPersonality: %s\n",
4540                         prefix, strna(personality_to_string(c->personality)));
4541
4542         fprintf(f,
4543                 "%sLockPersonality: %s\n",
4544                 prefix, yes_no(c->lock_personality));
4545
4546         if (c->syscall_filter) {
4547 #if HAVE_SECCOMP
4548                 Iterator j;
4549                 void *id, *val;
4550                 bool first = true;
4551 #endif
4552
4553                 fprintf(f,
4554                         "%sSystemCallFilter: ",
4555                         prefix);
4556
4557                 if (!c->syscall_whitelist)
4558                         fputc('~', f);
4559
4560 #if HAVE_SECCOMP
4561                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4562                         _cleanup_free_ char *name = NULL;
4563                         const char *errno_name = NULL;
4564                         int num = PTR_TO_INT(val);
4565
4566                         if (first)
4567                                 first = false;
4568                         else
4569                                 fputc(' ', f);
4570
4571                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4572                         fputs(strna(name), f);
4573
4574                         if (num >= 0) {
4575                                 errno_name = errno_to_name(num);
4576                                 if (errno_name)
4577                                         fprintf(f, ":%s", errno_name);
4578                                 else
4579                                         fprintf(f, ":%d", num);
4580                         }
4581                 }
4582 #endif
4583
4584                 fputc('\n', f);
4585         }
4586
4587         if (c->syscall_archs) {
4588 #if HAVE_SECCOMP
4589                 Iterator j;
4590                 void *id;
4591 #endif
4592
4593                 fprintf(f,
4594                         "%sSystemCallArchitectures:",
4595                         prefix);
4596
4597 #if HAVE_SECCOMP
4598                 SET_FOREACH(id, c->syscall_archs, j)
4599                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4600 #endif
4601                 fputc('\n', f);
4602         }
4603
4604         if (exec_context_restrict_namespaces_set(c)) {
4605                 _cleanup_free_ char *s = NULL;
4606
4607                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4608                 if (r >= 0)
4609                         fprintf(f, "%sRestrictNamespaces: %s\n",
4610                                 prefix, s);
4611         }
4612
4613         if (c->network_namespace_path)
4614                 fprintf(f,
4615                         "%sNetworkNamespacePath: %s\n",
4616                         prefix, c->network_namespace_path);
4617
4618         if (c->syscall_errno > 0) {
4619                 const char *errno_name;
4620
4621                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4622
4623                 errno_name = errno_to_name(c->syscall_errno);
4624                 if (errno_name)
4625                         fprintf(f, "%s\n", errno_name);
4626                 else
4627                         fprintf(f, "%d\n", c->syscall_errno);
4628         }
4629 }
4630
4631 bool exec_context_maintains_privileges(const ExecContext *c) {
4632         assert(c);
4633
4634         /* Returns true if the process forked off would run under
4635          * an unchanged UID or as root. */
4636
4637         if (!c->user)
4638                 return true;
4639
4640         if (streq(c->user, "root") || streq(c->user, "0"))
4641                 return true;
4642
4643         return false;
4644 }
4645
4646 int exec_context_get_effective_ioprio(const ExecContext *c) {
4647         int p;
4648
4649         assert(c);
4650
4651         if (c->ioprio_set)
4652                 return c->ioprio;
4653
4654         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4655         if (p < 0)
4656                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4657
4658         return p;
4659 }
4660
4661 void exec_context_free_log_extra_fields(ExecContext *c) {
4662         size_t l;
4663
4664         assert(c);
4665
4666         for (l = 0; l < c->n_log_extra_fields; l++)
4667                 free(c->log_extra_fields[l].iov_base);
4668         c->log_extra_fields = mfree(c->log_extra_fields);
4669         c->n_log_extra_fields = 0;
4670 }
4671
4672 void exec_context_revert_tty(ExecContext *c) {
4673         int r;
4674
4675         assert(c);
4676
4677         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4678         exec_context_tty_reset(c, NULL);
4679
4680         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4681          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4682          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4683
4684         if (exec_context_may_touch_tty(c)) {
4685                 const char *path;
4686
4687                 path = exec_context_tty_path(c);
4688                 if (path) {
4689                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4690                         if (r < 0 && r != -ENOENT)
4691                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4692                 }
4693         }
4694 }
4695
4696 void exec_status_start(ExecStatus *s, pid_t pid) {
4697         assert(s);
4698
4699         *s = (ExecStatus) {
4700                 .pid = pid,
4701         };
4702
4703         dual_timestamp_get(&s->start_timestamp);
4704 }
4705
4706 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4707         assert(s);
4708
4709         if (s->pid != pid) {
4710                 *s = (ExecStatus) {
4711                         .pid = pid,
4712                 };
4713         }
4714
4715         dual_timestamp_get(&s->exit_timestamp);
4716
4717         s->code = code;
4718         s->status = status;
4719
4720         if (context && context->utmp_id)
4721                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4722 }
4723
4724 void exec_status_reset(ExecStatus *s) {
4725         assert(s);
4726
4727         *s = (ExecStatus) {};
4728 }
4729
4730 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4731         char buf[FORMAT_TIMESTAMP_MAX];
4732
4733         assert(s);
4734         assert(f);
4735
4736         if (s->pid <= 0)
4737                 return;
4738
4739         prefix = strempty(prefix);
4740
4741         fprintf(f,
4742                 "%sPID: "PID_FMT"\n",
4743                 prefix, s->pid);
4744
4745         if (dual_timestamp_is_set(&s->start_timestamp))
4746                 fprintf(f,
4747                         "%sStart Timestamp: %s\n",
4748                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4749
4750         if (dual_timestamp_is_set(&s->exit_timestamp))
4751                 fprintf(f,
4752                         "%sExit Timestamp: %s\n"
4753                         "%sExit Code: %s\n"
4754                         "%sExit Status: %i\n",
4755                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4756                         prefix, sigchld_code_to_string(s->code),
4757                         prefix, s->status);
4758 }
4759
4760 static char *exec_command_line(char **argv) {
4761         size_t k;
4762         char *n, *p, **a;
4763         bool first = true;
4764
4765         assert(argv);
4766
4767         k = 1;
4768         STRV_FOREACH(a, argv)
4769                 k += strlen(*a)+3;
4770
4771         n = new(char, k);
4772         if (!n)
4773                 return NULL;
4774
4775         p = n;
4776         STRV_FOREACH(a, argv) {
4777
4778                 if (!first)
4779                         *(p++) = ' ';
4780                 else
4781                         first = false;
4782
4783                 if (strpbrk(*a, WHITESPACE)) {
4784                         *(p++) = '\'';
4785                         p = stpcpy(p, *a);
4786                         *(p++) = '\'';
4787                 } else
4788                         p = stpcpy(p, *a);
4789
4790         }
4791
4792         *p = 0;
4793
4794         /* FIXME: this doesn't really handle arguments that have
4795          * spaces and ticks in them */
4796
4797         return n;
4798 }
4799
4800 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4801         _cleanup_free_ char *cmd = NULL;
4802         const char *prefix2;
4803
4804         assert(c);
4805         assert(f);
4806
4807         prefix = strempty(prefix);
4808         prefix2 = strjoina(prefix, "\t");
4809
4810         cmd = exec_command_line(c->argv);
4811         fprintf(f,
4812                 "%sCommand Line: %s\n",
4813                 prefix, cmd ? cmd : strerror(ENOMEM));
4814
4815         exec_status_dump(&c->exec_status, f, prefix2);
4816 }
4817
4818 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4819         assert(f);
4820
4821         prefix = strempty(prefix);
4822
4823         LIST_FOREACH(command, c, c)
4824                 exec_command_dump(c, f, prefix);
4825 }
4826
4827 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4828         ExecCommand *end;
4829
4830         assert(l);
4831         assert(e);
4832
4833         if (*l) {
4834                 /* It's kind of important, that we keep the order here */
4835                 LIST_FIND_TAIL(command, *l, end);
4836                 LIST_INSERT_AFTER(command, *l, end, e);
4837         } else
4838               *l = e;
4839 }
4840
4841 int exec_command_set(ExecCommand *c, const char *path, ...) {
4842         va_list ap;
4843         char **l, *p;
4844
4845         assert(c);
4846         assert(path);
4847
4848         va_start(ap, path);
4849         l = strv_new_ap(path, ap);
4850         va_end(ap);
4851
4852         if (!l)
4853                 return -ENOMEM;
4854
4855         p = strdup(path);
4856         if (!p) {
4857                 strv_free(l);
4858                 return -ENOMEM;
4859         }
4860
4861         free_and_replace(c->path, p);
4862
4863         return strv_free_and_replace(c->argv, l);
4864 }
4865
4866 int exec_command_append(ExecCommand *c, const char *path, ...) {
4867         _cleanup_strv_free_ char **l = NULL;
4868         va_list ap;
4869         int r;
4870
4871         assert(c);
4872         assert(path);
4873
4874         va_start(ap, path);
4875         l = strv_new_ap(path, ap);
4876         va_end(ap);
4877
4878         if (!l)
4879                 return -ENOMEM;
4880
4881         r = strv_extend_strv(&c->argv, l, false);
4882         if (r < 0)
4883                 return r;
4884
4885         return 0;
4886 }
4887
4888 static void *remove_tmpdir_thread(void *p) {
4889         _cleanup_free_ char *path = p;
4890
4891         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4892         return NULL;
4893 }
4894
4895 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4896         int r;
4897
4898         if (!rt)
4899                 return NULL;
4900
4901         if (rt->manager)
4902                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4903
4904         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4905         if (destroy && rt->tmp_dir) {
4906                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4907
4908                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4909                 if (r < 0) {
4910                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4911                         free(rt->tmp_dir);
4912                 }
4913
4914                 rt->tmp_dir = NULL;
4915         }
4916
4917         if (destroy && rt->var_tmp_dir) {
4918                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4919
4920                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4921                 if (r < 0) {
4922                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4923                         free(rt->var_tmp_dir);
4924                 }
4925
4926                 rt->var_tmp_dir = NULL;
4927         }
4928
4929         rt->id = mfree(rt->id);
4930         rt->tmp_dir = mfree(rt->tmp_dir);
4931         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4932         safe_close_pair(rt->netns_storage_socket);
4933         return mfree(rt);
4934 }
4935
4936 static void exec_runtime_freep(ExecRuntime **rt) {
4937         (void) exec_runtime_free(*rt, false);
4938 }
4939
4940 static int exec_runtime_allocate(ExecRuntime **ret) {
4941         ExecRuntime *n;
4942
4943         assert(ret);
4944
4945         n = new(ExecRuntime, 1);
4946         if (!n)
4947                 return -ENOMEM;
4948
4949         *n = (ExecRuntime) {
4950                 .netns_storage_socket = { -1, -1 },
4951         };
4952
4953         *ret = n;
4954         return 0;
4955 }
4956
4957 static int exec_runtime_add(
4958                 Manager *m,
4959                 const char *id,
4960                 const char *tmp_dir,
4961                 const char *var_tmp_dir,
4962                 const int netns_storage_socket[2],
4963                 ExecRuntime **ret) {
4964
4965         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4966         int r;
4967
4968         assert(m);
4969         assert(id);
4970
4971         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4972         if (r < 0)
4973                 return r;
4974
4975         r = exec_runtime_allocate(&rt);
4976         if (r < 0)
4977                 return r;
4978
4979         rt->id = strdup(id);
4980         if (!rt->id)
4981                 return -ENOMEM;
4982
4983         if (tmp_dir) {
4984                 rt->tmp_dir = strdup(tmp_dir);
4985                 if (!rt->tmp_dir)
4986                         return -ENOMEM;
4987
4988                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4989                 assert(var_tmp_dir);
4990                 rt->var_tmp_dir = strdup(var_tmp_dir);
4991                 if (!rt->var_tmp_dir)
4992                         return -ENOMEM;
4993         }
4994
4995         if (netns_storage_socket) {
4996                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4997                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4998         }
4999
5000         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5001         if (r < 0)
5002                 return r;
5003
5004         rt->manager = m;
5005
5006         if (ret)
5007                 *ret = rt;
5008
5009         /* do not remove created ExecRuntime object when the operation succeeds. */
5010         rt = NULL;
5011         return 0;
5012 }
5013
5014 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5015         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5016         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5017         int r;
5018
5019         assert(m);
5020         assert(c);
5021         assert(id);
5022
5023         /* It is not necessary to create ExecRuntime object. */
5024         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5025                 return 0;
5026
5027         if (c->private_tmp) {
5028                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5029                 if (r < 0)
5030                         return r;
5031         }
5032
5033         if (c->private_network || c->network_namespace_path) {
5034                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5035                         return -errno;
5036         }
5037
5038         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5039         if (r < 0)
5040                 return r;
5041
5042         /* Avoid cleanup */
5043         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5044         return 1;
5045 }
5046
5047 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5048         ExecRuntime *rt;
5049         int r;
5050
5051         assert(m);
5052         assert(id);
5053         assert(ret);
5054
5055         rt = hashmap_get(m->exec_runtime_by_id, id);
5056         if (rt)
5057                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5058                 goto ref;
5059
5060         if (!create)
5061                 return 0;
5062
5063         /* If not found, then create a new object. */
5064         r = exec_runtime_make(m, c, id, &rt);
5065         if (r <= 0)
5066                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5067                 return r;
5068
5069 ref:
5070         /* increment reference counter. */
5071         rt->n_ref++;
5072         *ret = rt;
5073         return 1;
5074 }
5075
5076 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5077         if (!rt)
5078                 return NULL;
5079
5080         assert(rt->n_ref > 0);
5081
5082         rt->n_ref--;
5083         if (rt->n_ref > 0)
5084                 return NULL;
5085
5086         return exec_runtime_free(rt, destroy);
5087 }
5088
5089 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5090         ExecRuntime *rt;
5091         Iterator i;
5092
5093         assert(m);
5094         assert(f);
5095         assert(fds);
5096
5097         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5098                 fprintf(f, "exec-runtime=%s", rt->id);
5099
5100                 if (rt->tmp_dir)
5101                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5102
5103                 if (rt->var_tmp_dir)
5104                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5105
5106                 if (rt->netns_storage_socket[0] >= 0) {
5107                         int copy;
5108
5109                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5110                         if (copy < 0)
5111                                 return copy;
5112
5113                         fprintf(f, " netns-socket-0=%i", copy);
5114                 }
5115
5116                 if (rt->netns_storage_socket[1] >= 0) {
5117                         int copy;
5118
5119                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5120                         if (copy < 0)
5121                                 return copy;
5122
5123                         fprintf(f, " netns-socket-1=%i", copy);
5124                 }
5125
5126                 fputc('\n', f);
5127         }
5128
5129         return 0;
5130 }
5131
5132 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5133         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5134         ExecRuntime *rt;
5135         int r;
5136
5137         /* This is for the migration from old (v237 or earlier) deserialization text.
5138          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5139          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5140          * so or not from the serialized text, then we always creates a new object owned by this. */
5141
5142         assert(u);
5143         assert(key);
5144         assert(value);
5145
5146         /* Manager manages ExecRuntime objects by the unit id.
5147          * So, we omit the serialized text when the unit does not have id (yet?)... */
5148         if (isempty(u->id)) {
5149                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5150                 return 0;
5151         }
5152
5153         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5154         if (r < 0) {
5155                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5156                 return 0;
5157         }
5158
5159         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5160         if (!rt) {
5161                 r = exec_runtime_allocate(&rt_create);
5162                 if (r < 0)
5163                         return log_oom();
5164
5165                 rt_create->id = strdup(u->id);
5166                 if (!rt_create->id)
5167                         return log_oom();
5168
5169                 rt = rt_create;
5170         }
5171
5172         if (streq(key, "tmp-dir")) {
5173                 char *copy;
5174
5175                 copy = strdup(value);
5176                 if (!copy)
5177                         return log_oom();
5178
5179                 free_and_replace(rt->tmp_dir, copy);
5180
5181         } else if (streq(key, "var-tmp-dir")) {
5182                 char *copy;
5183
5184                 copy = strdup(value);
5185                 if (!copy)
5186                         return log_oom();
5187
5188                 free_and_replace(rt->var_tmp_dir, copy);
5189
5190         } else if (streq(key, "netns-socket-0")) {
5191                 int fd;
5192
5193                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5194                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5195                         return 0;
5196                 }
5197
5198                 safe_close(rt->netns_storage_socket[0]);
5199                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5200
5201         } else if (streq(key, "netns-socket-1")) {
5202                 int fd;
5203
5204                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5205                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5206                         return 0;
5207                 }
5208
5209                 safe_close(rt->netns_storage_socket[1]);
5210                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5211         } else
5212                 return 0;
5213
5214         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5215         if (rt_create) {
5216                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5217                 if (r < 0) {
5218                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5219                         return 0;
5220                 }
5221
5222                 rt_create->manager = u->manager;
5223
5224                 /* Avoid cleanup */
5225                 rt_create = NULL;
5226         }
5227
5228         return 1;
5229 }
5230
5231 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5232         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5233         int r, fd0 = -1, fd1 = -1;
5234         const char *p, *v = value;
5235         size_t n;
5236
5237         assert(m);
5238         assert(value);
5239         assert(fds);
5240
5241         n = strcspn(v, " ");
5242         id = strndupa(v, n);
5243         if (v[n] != ' ')
5244                 goto finalize;
5245         p = v + n + 1;
5246
5247         v = startswith(p, "tmp-dir=");
5248         if (v) {
5249                 n = strcspn(v, " ");
5250                 tmp_dir = strndupa(v, n);
5251                 if (v[n] != ' ')
5252                         goto finalize;
5253                 p = v + n + 1;
5254         }
5255
5256         v = startswith(p, "var-tmp-dir=");
5257         if (v) {
5258                 n = strcspn(v, " ");
5259                 var_tmp_dir = strndupa(v, n);
5260                 if (v[n] != ' ')
5261                         goto finalize;
5262                 p = v + n + 1;
5263         }
5264
5265         v = startswith(p, "netns-socket-0=");
5266         if (v) {
5267                 char *buf;
5268
5269                 n = strcspn(v, " ");
5270                 buf = strndupa(v, n);
5271                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5272                         log_debug("Unable to process exec-runtime netns fd specification.");
5273                         return;
5274                 }
5275                 fd0 = fdset_remove(fds, fd0);
5276                 if (v[n] != ' ')
5277                         goto finalize;
5278                 p = v + n + 1;
5279         }
5280
5281         v = startswith(p, "netns-socket-1=");
5282         if (v) {
5283                 char *buf;
5284
5285                 n = strcspn(v, " ");
5286                 buf = strndupa(v, n);
5287                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5288                         log_debug("Unable to process exec-runtime netns fd specification.");
5289                         return;
5290                 }
5291                 fd1 = fdset_remove(fds, fd1);
5292         }
5293
5294 finalize:
5295
5296         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5297         if (r < 0)
5298                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5299 }
5300
5301 void exec_runtime_vacuum(Manager *m) {
5302         ExecRuntime *rt;
5303         Iterator i;
5304
5305         assert(m);
5306
5307         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5308
5309         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5310                 if (rt->n_ref > 0)
5311                         continue;
5312
5313                 (void) exec_runtime_free(rt, false);
5314         }
5315 }
5316
5317 void exec_params_clear(ExecParameters *p) {
5318         if (!p)
5319                 return;
5320
5321         strv_free(p->environment);
5322 }
5323
5324 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5325         [EXEC_INPUT_NULL] = "null",
5326         [EXEC_INPUT_TTY] = "tty",
5327         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5328         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5329         [EXEC_INPUT_SOCKET] = "socket",
5330         [EXEC_INPUT_NAMED_FD] = "fd",
5331         [EXEC_INPUT_DATA] = "data",
5332         [EXEC_INPUT_FILE] = "file",
5333 };
5334
5335 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5336
5337 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5338         [EXEC_OUTPUT_INHERIT] = "inherit",
5339         [EXEC_OUTPUT_NULL] = "null",
5340         [EXEC_OUTPUT_TTY] = "tty",
5341         [EXEC_OUTPUT_SYSLOG] = "syslog",
5342         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5343         [EXEC_OUTPUT_KMSG] = "kmsg",
5344         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5345         [EXEC_OUTPUT_JOURNAL] = "journal",
5346         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5347         [EXEC_OUTPUT_SOCKET] = "socket",
5348         [EXEC_OUTPUT_NAMED_FD] = "fd",
5349         [EXEC_OUTPUT_FILE] = "file",
5350         [EXEC_OUTPUT_FILE_APPEND] = "append",
5351 };
5352
5353 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5354
5355 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5356         [EXEC_UTMP_INIT] = "init",
5357         [EXEC_UTMP_LOGIN] = "login",
5358         [EXEC_UTMP_USER] = "user",
5359 };
5360
5361 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5362
5363 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5364         [EXEC_PRESERVE_NO] = "no",
5365         [EXEC_PRESERVE_YES] = "yes",
5366         [EXEC_PRESERVE_RESTART] = "restart",
5367 };
5368
5369 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5370
5371 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5372         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5373         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5374         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5375         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5376         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5377 };
5378
5379 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5380
5381 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5382         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5383         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5384         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5385         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5386         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5387 };
5388
5389 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5390
5391 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5392         [EXEC_KEYRING_INHERIT] = "inherit",
5393         [EXEC_KEYRING_PRIVATE] = "private",
5394         [EXEC_KEYRING_SHARED] = "shared",
5395 };
5396
5397 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);