src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "execute.h"
  57 #include "exit-status.h"
  58 #include "fd-util.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "missing.h"
  69 #include "mkdir.h"
  70 #include "namespace.h"
  71 #include "parse-util.h"
  72 #include "path-util.h"
  73 #include "process-util.h"
  74 #include "rlimit-util.h"
  75 #include "rm-rf.h"
  76 #if HAVE_SECCOMP
  77 #include "seccomp-util.h"
  78 #endif
  79 #include "securebits.h"
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "util.h"
  96 #include "utmp-wtmp.h"
  97
/* Idle timeouts, expressed in microseconds (5s and 1s respectively). Their users are not part of
 * this section of the file; presumably they bound how long we wait on the idle pipe -- TODO confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to (and then verifies on) a TTY: read/write for the owner,
 * write-only for the group. This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) that connect_logger_as() requests via fd_inc_sndbuf() for the journal
 * stream socket. */
#define SNDBUF_SIZE (8*1024*1024)
 105
 106 static int shift_fds(int fds[], size_t n_fds) {
 107         int start, restart_from;
 108
 109         if (n_fds <= 0)
 110                 return 0;
 111
 112         /* Modifies the fds array! (sorts it) */
 113
 114         assert(fds);
 115
 116         start = 0;
 117         for (;;) {
 118                 int i;
 119
 120                 restart_from = -1;
 121
 122                 for (i = start; i < (int) n_fds; i++) {
 123                         int nfd;
 124
 125                         /* Already at right index? */
 126                         if (fds[i] == i+3)
 127                                 continue;
 128
 129                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 130                         if (nfd < 0)
 131                                 return -errno;
 132
 133                         safe_close(fds[i]);
 134                         fds[i] = nfd;
 135
 136                         /* Hmm, the fd we wanted isn't free? Then
 137                          * let's remember that and try again from here */
 138                         if (nfd != i+3 && restart_from < 0)
 139                                 restart_from = i;
 140                 }
 141
 142                 if (restart_from < 0)
 143                         break;
 144
 145                 start = restart_from;
 146         }
 147
 148         return 0;
 149 }
 150
 151 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 152         size_t i, n_fds;
 153         int r;
 154
 155         n_fds = n_socket_fds + n_storage_fds;
 156         if (n_fds <= 0)
 157                 return 0;
 158
 159         assert(fds);
 160
 161         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 162          * O_NONBLOCK only applies to socket activation though. */
 163
 164         for (i = 0; i < n_fds; i++) {
 165
 166                 if (i < n_socket_fds) {
 167                         r = fd_nonblock(fds[i], nonblock);
 168                         if (r < 0)
 169                                 return r;
 170                 }
 171
 172                 /* We unconditionally drop FD_CLOEXEC from the fds,
 173                  * since after all we want to pass these fds to our
 174                  * children */
 175
 176                 r = fd_cloexec(fds[i], false);
 177                 if (r < 0)
 178                         return r;
 179         }
 180
 181         return 0;
 182 }
 183
 184 static const char *exec_context_tty_path(const ExecContext *context) {
 185         assert(context);
 186
 187         if (context->stdio_as_fds)
 188                 return NULL;
 189
 190         if (context->tty_path)
 191                 return context->tty_path;
 192
 193         return "/dev/console";
 194 }
 195
 196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 197         const char *path;
 198
 199         assert(context);
 200
 201         path = exec_context_tty_path(context);
 202
 203         if (context->tty_vhangup) {
 204                 if (p && p->stdin_fd >= 0)
 205                         (void) terminal_vhangup_fd(p->stdin_fd);
 206                 else if (path)
 207                         (void) terminal_vhangup(path);
 208         }
 209
 210         if (context->tty_reset) {
 211                 if (p && p->stdin_fd >= 0)
 212                         (void) reset_terminal_fd(p->stdin_fd, true);
 213                 else if (path)
 214                         (void) reset_terminal(path);
 215         }
 216
 217         if (context->tty_vt_disallocate && path)
 218                 (void) vt_disallocate(path);
 219 }
 220
 221 static bool is_terminal_input(ExecInput i) {
 222         return IN_SET(i,
 223                       EXEC_INPUT_TTY,
 224                       EXEC_INPUT_TTY_FORCE,
 225                       EXEC_INPUT_TTY_FAIL);
 226 }
 227
 228 static bool is_terminal_output(ExecOutput o) {
 229         return IN_SET(o,
 230                       EXEC_OUTPUT_TTY,
 231                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 233                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 234 }
 235
 236 static bool is_syslog_output(ExecOutput o) {
 237         return IN_SET(o,
 238                       EXEC_OUTPUT_SYSLOG,
 239                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 240 }
 241
 242 static bool is_kmsg_output(ExecOutput o) {
 243         return IN_SET(o,
 244                       EXEC_OUTPUT_KMSG,
 245                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 246 }
 247
 248 static bool exec_context_needs_term(const ExecContext *c) {
 249         assert(c);
 250
 251         /* Return true if the execution context suggests we should set $TERM to something useful. */
 252
 253         if (is_terminal_input(c->std_input))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_output))
 257                 return true;
 258
 259         if (is_terminal_output(c->std_error))
 260                 return true;
 261
 262         return !!c->tty_path;
 263 }
 264
 265 static int open_null_as(int flags, int nfd) {
 266         int fd;
 267
 268         assert(nfd >= 0);
 269
 270         fd = open("/dev/null", flags|O_NOCTTY);
 271         if (fd < 0)
 272                 return -errno;
 273
 274         return move_fd(fd, nfd, false);
 275 }
 276
 277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 278         static const union sockaddr_union sa = {
 279                 .un.sun_family = AF_UNIX,
 280                 .un.sun_path = "/run/systemd/journal/stdout",
 281         };
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         int r;
 285
 286         if (gid_is_valid(gid)) {
 287                 oldgid = getgid();
 288
 289                 if (setegid(gid) < 0)
 290                         return -errno;
 291         }
 292
 293         if (uid_is_valid(uid)) {
 294                 olduid = getuid();
 295
 296                 if (seteuid(uid) < 0) {
 297                         r = -errno;
 298                         goto restore_gid;
 299                 }
 300         }
 301
 302         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 303
 304         /* If we fail to restore the uid or gid, things will likely
 305            fail later on. This should only happen if an LSM interferes. */
 306
 307         if (uid_is_valid(uid))
 308                 (void) seteuid(olduid);
 309
 310  restore_gid:
 311         if (gid_is_valid(gid))
 312                 (void) setegid(oldgid);
 313
 314         return r;
 315 }
 316
 317 static int connect_logger_as(
 318                 const Unit *unit,
 319                 const ExecContext *context,
 320                 const ExecParameters *params,
 321                 ExecOutput output,
 322                 const char *ident,
 323                 int nfd,
 324                 uid_t uid,
 325                 gid_t gid) {
 326
 327         _cleanup_close_ int fd = -1;
 328         int r;
 329
 330         assert(context);
 331         assert(params);
 332         assert(output < _EXEC_OUTPUT_MAX);
 333         assert(ident);
 334         assert(nfd >= 0);
 335
 336         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 337         if (fd < 0)
 338                 return -errno;
 339
 340         r = connect_journal_socket(fd, uid, gid);
 341         if (r < 0)
 342                 return r;
 343
 344         if (shutdown(fd, SHUT_RD) < 0)
 345                 return -errno;
 346
 347         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 348
 349         if (dprintf(fd,
 350                 "%s\n"
 351                 "%s\n"
 352                 "%i\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n"
 356                 "%i\n",
 357                 context->syslog_identifier ?: ident,
 358                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 359                 context->syslog_priority,
 360                 !!context->syslog_level_prefix,
 361                 is_syslog_output(output),
 362                 is_kmsg_output(output),
 363                 is_terminal_output(output)) < 0)
 364                 return -errno;
 365
 366         return move_fd(TAKE_FD(fd), nfd, false);
 367 }
 368
 369 static int open_terminal_as(const char *path, int flags, int nfd) {
 370         int fd;
 371
 372         assert(path);
 373         assert(nfd >= 0);
 374
 375         fd = open_terminal(path, flags | O_NOCTTY);
 376         if (fd < 0)
 377                 return fd;
 378
 379         return move_fd(fd, nfd, false);
 380 }
 381
 382 static int acquire_path(const char *path, int flags, mode_t mode) {
 383         union sockaddr_union sa = {};
 384         _cleanup_close_ int fd = -1;
 385         int r, salen;
 386
 387         assert(path);
 388
 389         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 390                 flags |= O_CREAT;
 391
 392         fd = open(path, flags|O_NOCTTY, mode);
 393         if (fd >= 0)
 394                 return TAKE_FD(fd);
 395
 396         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 397                 return -errno;
 398         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 399                 return -ENXIO;
 400
 401         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 402
 403         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 404         if (fd < 0)
 405                 return -errno;
 406
 407         salen = sockaddr_un_set_path(&sa.un, path);
 408         if (salen < 0)
 409                 return salen;
 410
 411         if (connect(fd, &sa.sa, salen) < 0)
 412                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 413                                                            * indication that his wasn't an AF_UNIX socket after all */
 414
 415         if ((flags & O_ACCMODE) == O_RDONLY)
 416                 r = shutdown(fd, SHUT_WR);
 417         else if ((flags & O_ACCMODE) == O_WRONLY)
 418                 r = shutdown(fd, SHUT_RD);
 419         else
 420                 return TAKE_FD(fd);
 421         if (r < 0)
 422                 return -errno;
 423
 424         return TAKE_FD(fd);
 425 }
 426
 427 static int fixup_input(
 428                 const ExecContext *context,
 429                 int socket_fd,
 430                 bool apply_tty_stdin) {
 431
 432         ExecInput std_input;
 433
 434         assert(context);
 435
 436         std_input = context->std_input;
 437
 438         if (is_terminal_input(std_input) && !apply_tty_stdin)
 439                 return EXEC_INPUT_NULL;
 440
 441         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 442                 return EXEC_INPUT_NULL;
 443
 444         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 445                 return EXEC_INPUT_NULL;
 446
 447         return std_input;
 448 }
 449
 450 static int fixup_output(ExecOutput std_output, int socket_fd) {
 451
 452         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 453                 return EXEC_OUTPUT_INHERIT;
 454
 455         return std_output;
 456 }
 457
 458 static int setup_input(
 459                 const ExecContext *context,
 460                 const ExecParameters *params,
 461                 int socket_fd,
 462                 int named_iofds[3]) {
 463
 464         ExecInput i;
 465
 466         assert(context);
 467         assert(params);
 468
 469         if (params->stdin_fd >= 0) {
 470                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 471                         return -errno;
 472
 473                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 474                 if (isatty(STDIN_FILENO)) {
 475                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 476                         (void) reset_terminal_fd(STDIN_FILENO, true);
 477                 }
 478
 479                 return STDIN_FILENO;
 480         }
 481
 482         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 483
 484         switch (i) {
 485
 486         case EXEC_INPUT_NULL:
 487                 return open_null_as(O_RDONLY, STDIN_FILENO);
 488
 489         case EXEC_INPUT_TTY:
 490         case EXEC_INPUT_TTY_FORCE:
 491         case EXEC_INPUT_TTY_FAIL: {
 492                 int fd;
 493
 494                 fd = acquire_terminal(exec_context_tty_path(context),
 495                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 496                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 497                                                                   ACQUIRE_TERMINAL_WAIT,
 498                                       USEC_INFINITY);
 499                 if (fd < 0)
 500                         return fd;
 501
 502                 return move_fd(fd, STDIN_FILENO, false);
 503         }
 504
 505         case EXEC_INPUT_SOCKET:
 506                 assert(socket_fd >= 0);
 507
 508                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 509
 510         case EXEC_INPUT_NAMED_FD:
 511                 assert(named_iofds[STDIN_FILENO] >= 0);
 512
 513                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 514                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 515
 516         case EXEC_INPUT_DATA: {
 517                 int fd;
 518
 519                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 520                 if (fd < 0)
 521                         return fd;
 522
 523                 return move_fd(fd, STDIN_FILENO, false);
 524         }
 525
 526         case EXEC_INPUT_FILE: {
 527                 bool rw;
 528                 int fd;
 529
 530                 assert(context->stdio_file[STDIN_FILENO]);
 531
 532                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 533                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 534
 535                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 536                 if (fd < 0)
 537                         return fd;
 538
 539                 return move_fd(fd, STDIN_FILENO, false);
 540         }
 541
 542         default:
 543                 assert_not_reached("Unknown input type");
 544         }
 545 }
 546
 547 static bool can_inherit_stderr_from_stdout(
 548                 const ExecContext *context,
 549                 ExecOutput o,
 550                 ExecOutput e) {
 551
 552         assert(context);
 553
 554         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 555          * stderr fd */
 556
 557         if (e == EXEC_OUTPUT_INHERIT)
 558                 return true;
 559         if (e != o)
 560                 return false;
 561
 562         if (e == EXEC_OUTPUT_NAMED_FD)
 563                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 564
 565         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 566                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 567
 568         return true;
 569 }
 570
/* Sets up stdout or stderr (selected via 'fileno') according to the execution context and
 * parameters. This expects that stdin has been set up already, and, when invoked for stderr, that
 * stdout has been set up too, since several paths below simply duplicate those fds.
 *
 * An stdout/stderr fd passed in via the parameters takes precedence over the context's settings.
 *
 * If the fd gets connected to the journal, the device and inode number of the stream are stored in
 * *journal_stream_dev and *journal_stream_ino (see the comment further down for the exact rule).
 *
 * Returns a negative errno-style value on failure. On success the dup2() based paths and the
 * "leave as is" paths return 'fileno', the others return whatever the helper used returns. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds override everything else */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Determine the effective stdin and stdout types, the same way setup_input() does for stdin */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if (can_inherit_stderr_from_stdout(context, o, e))
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on, the switch statement below operates on the stderr type */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share its fd instead of opening the tty a second time */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn, and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                /* Failure to turn off O_NONBLOCK is ignored */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE:
        case EXEC_OUTPUT_FILE_APPEND: {
                bool rw;
                int fd, flags;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to use the very same file, share the stdin fd rather than
                 * opening the file a second time. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                flags = O_WRONLY;
                if (o == EXEC_OUTPUT_FILE_APPEND)
                        flags |= O_APPEND;

                fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, 0);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 730
 731 static int chown_terminal(int fd, uid_t uid) {
 732         struct stat st;
 733
 734         assert(fd >= 0);
 735
 736         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 737         if (isatty(fd) < 1)
 738                 return 0;
 739
 740         /* This might fail. What matters are the results. */
 741         (void) fchown(fd, uid, -1);
 742         (void) fchmod(fd, TTY_MODE);
 743
 744         if (fstat(fd, &st) < 0)
 745                 return -errno;
 746
 747         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 748                 return -EPERM;
 749
 750         return 0;
 751 }
 752
 753 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 754         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 755         int r;
 756
 757         assert(_saved_stdin);
 758         assert(_saved_stdout);
 759
 760         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 761         if (saved_stdin < 0)
 762                 return -errno;
 763
 764         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 765         if (saved_stdout < 0)
 766                 return -errno;
 767
 768         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 769         if (fd < 0)
 770                 return fd;
 771
 772         r = chown_terminal(fd, getuid());
 773         if (r < 0)
 774                 return r;
 775
 776         r = reset_terminal_fd(fd, true);
 777         if (r < 0)
 778                 return r;
 779
 780         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 781         fd = -1;
 782         if (r < 0)
 783                 return r;
 784
 785         *_saved_stdin = saved_stdin;
 786         *_saved_stdout = saved_stdout;
 787
 788         saved_stdin = saved_stdout = -1;
 789
 790         return 0;
 791 }
 792
 793 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 794         assert(err < 0);
 795
 796         if (err == -ETIMEDOUT)
 797                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 798         else {
 799                 errno = -err;
 800                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 801         }
 802 }
 803
 804 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 805         _cleanup_close_ int fd = -1;
 806
 807         assert(vc);
 808
 809         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 810         if (fd < 0)
 811                 return;
 812
 813         write_confirm_error_fd(err, fd, u);
 814 }
 815
 816 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 817         int r = 0;
 818
 819         assert(saved_stdin);
 820         assert(saved_stdout);
 821
 822         release_terminal();
 823
 824         if (*saved_stdin >= 0)
 825                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 826                         r = -errno;
 827
 828         if (*saved_stdout >= 0)
 829                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 830                         r = -errno;
 831
 832         *saved_stdin = safe_close(*saved_stdin);
 833         *saved_stdout = safe_close(*saved_stdout);
 834
 835         return r;
 836 }
 837
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* user chose 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* user chose 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE = 1,            /* execute the command; also used whenever asking fails */
};
 843
 844 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 845         int saved_stdout = -1, saved_stdin = -1, r;
 846         _cleanup_free_ char *e = NULL;
 847         char c;
 848
 849         /* For any internal errors, assume a positive response. */
 850         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 851         if (r < 0) {
 852                 write_confirm_error(r, vc, u);
 853                 return CONFIRM_EXECUTE;
 854         }
 855
 856         /* confirm_spawn might have been disabled while we were sleeping. */
 857         if (manager_is_confirm_spawn_disabled(u->manager)) {
 858                 r = 1;
 859                 goto restore_stdio;
 860         }
 861
 862         e = ellipsize(cmdline, 60, 100);
 863         if (!e) {
 864                 log_oom();
 865                 r = CONFIRM_EXECUTE;
 866                 goto restore_stdio;
 867         }
 868
 869         for (;;) {
 870                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 871                 if (r < 0) {
 872                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 873                         r = CONFIRM_EXECUTE;
 874                         goto restore_stdio;
 875                 }
 876
 877                 switch (c) {
 878                 case 'c':
 879                         printf("Resuming normal execution.\n");
 880                         manager_disable_confirm_spawn();
 881                         r = 1;
 882                         break;
 883                 case 'D':
 884                         unit_dump(u, stdout, "  ");
 885                         continue; /* ask again */
 886                 case 'f':
 887                         printf("Failing execution.\n");
 888                         r = CONFIRM_PRETEND_FAILURE;
 889                         break;
 890                 case 'h':
 891                         printf("  c - continue, proceed without asking anymore\n"
 892                                "  D - dump, show the state of the unit\n"
 893                                "  f - fail, don't execute the command and pretend it failed\n"
 894                                "  h - help\n"
 895                                "  i - info, show a short summary of the unit\n"
 896                                "  j - jobs, show jobs that are in progress\n"
 897                                "  s - skip, don't execute the command and pretend it succeeded\n"
 898                                "  y - yes, execute the command\n");
 899                         continue; /* ask again */
 900                 case 'i':
 901                         printf("  Description: %s\n"
 902                                "  Unit:        %s\n"
 903                                "  Command:     %s\n",
 904                                u->id, u->description, cmdline);
 905                         continue; /* ask again */
 906                 case 'j':
 907                         manager_dump_jobs(u->manager, stdout, "  ");
 908                         continue; /* ask again */
 909                 case 'n':
 910                         /* 'n' was removed in favor of 'f'. */
 911                         printf("Didn't understand 'n', did you mean 'f'?\n");
 912                         continue; /* ask again */
 913                 case 's':
 914                         printf("Skipping execution.\n");
 915                         r = CONFIRM_PRETEND_SUCCESS;
 916                         break;
 917                 case 'y':
 918                         r = CONFIRM_EXECUTE;
 919                         break;
 920                 default:
 921                         assert_not_reached("Unhandled choice");
 922                 }
 923                 break;
 924         }
 925
 926 restore_stdio:
 927         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 928         return r;
 929 }
 930
 931 static int get_fixed_user(const ExecContext *c, const char **user,
 932                           uid_t *uid, gid_t *gid,
 933                           const char **home, const char **shell) {
 934         int r;
 935         const char *name;
 936
 937         assert(c);
 938
 939         if (!c->user)
 940                 return 0;
 941
 942         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 943          * (i.e. are "/" or "/bin/nologin"). */
 944
 945         name = c->user;
 946         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 947         if (r < 0)
 948                 return r;
 949
 950         *user = name;
 951         return 0;
 952 }
 953
 954 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 955         int r;
 956         const char *name;
 957
 958         assert(c);
 959
 960         if (!c->group)
 961                 return 0;
 962
 963         name = c->group;
 964         r = get_group_creds(&name, gid, 0);
 965         if (r < 0)
 966                 return r;
 967
 968         *group = name;
 969         return 0;
 970 }
 971
 972 static int get_supplementary_groups(const ExecContext *c, const char *user,
 973                                     const char *group, gid_t gid,
 974                                     gid_t **supplementary_gids, int *ngids) {
 975         char **i;
 976         int r, k = 0;
 977         int ngroups_max;
 978         bool keep_groups = false;
 979         gid_t *groups = NULL;
 980         _cleanup_free_ gid_t *l_gids = NULL;
 981
 982         assert(c);
 983
 984         /*
 985          * If user is given, then lookup GID and supplementary groups list.
 986          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 987          * here and as early as possible so we keep the list of supplementary
 988          * groups of the caller.
 989          */
 990         if (user && gid_is_valid(gid) && gid != 0) {
 991                 /* First step, initialize groups from /etc/groups */
 992                 if (initgroups(user, gid) < 0)
 993                         return -errno;
 994
 995                 keep_groups = true;
 996         }
 997
 998         if (strv_isempty(c->supplementary_groups))
 999                 return 0;
1000
1001         /*
1002          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1003          * be positive, otherwise fail.
1004          */
1005         errno = 0;
1006         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1007         if (ngroups_max <= 0) {
1008                 if (errno > 0)
1009                         return -errno;
1010                 else
1011                         return -EOPNOTSUPP; /* For all other values */
1012         }
1013
1014         l_gids = new(gid_t, ngroups_max);
1015         if (!l_gids)
1016                 return -ENOMEM;
1017
1018         if (keep_groups) {
1019                 /*
1020                  * Lookup the list of groups that the user belongs to, we
1021                  * avoid NSS lookups here too for gid=0.
1022                  */
1023                 k = ngroups_max;
1024                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1025                         return -EINVAL;
1026         } else
1027                 k = 0;
1028
1029         STRV_FOREACH(i, c->supplementary_groups) {
1030                 const char *g;
1031
1032                 if (k >= ngroups_max)
1033                         return -E2BIG;
1034
1035                 g = *i;
1036                 r = get_group_creds(&g, l_gids+k, 0);
1037                 if (r < 0)
1038                         return r;
1039
1040                 k++;
1041         }
1042
1043         /*
1044          * Sets ngids to zero to drop all supplementary groups, happens
1045          * when we are under root and SupplementaryGroups= is empty.
1046          */
1047         if (k == 0) {
1048                 *ngids = 0;
1049                 return 0;
1050         }
1051
1052         /* Otherwise get the final list of supplementary groups */
1053         groups = memdup(l_gids, sizeof(gid_t) * k);
1054         if (!groups)
1055                 return -ENOMEM;
1056
1057         *supplementary_gids = groups;
1058         *ngids = k;
1059
1060         groups = NULL;
1061
1062         return 0;
1063 }
1064
/* Applies group credentials to the calling process: first the supplementary group list (only if ngids is
 * positive, via maybe_setgroups()), then the real/effective/saved GID (only if gid is valid, via setresgid()).
 * Returns 0 on success, the negative error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1083
1084 static int enforce_user(const ExecContext *context, uid_t uid) {
1085         assert(context);
1086
1087         if (!uid_is_valid(uid))
1088                 return 0;
1089
1090         /* Sets (but doesn't look up) the uid and make sure we keep the
1091          * capabilities while doing so. */
1092
1093         if (context->capability_ambient_set != 0) {
1094
1095                 /* First step: If we need to keep capabilities but
1096                  * drop privileges we need to make sure we keep our
1097                  * caps, while we drop privileges. */
1098                 if (uid != 0) {
1099                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1100
1101                         if (prctl(PR_GET_SECUREBITS) != sb)
1102                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1103                                         return -errno;
1104                 }
1105         }
1106
1107         /* Second step: actually set the uids */
1108         if (setresuid(uid, uid, uid) < 0)
1109                 return -errno;
1110
1111         /* At this point we should have all necessary capabilities but
1112            are otherwise a normal user. However, the caps might got
1113            corrupted due to the setresuid() so we need clean them up
1114            later. This is done outside of this call. */
1115
1116         return 0;
1117 }
1118
1119 #if HAVE_PAM
1120
1121 static int null_conv(
1122                 int num_msg,
1123                 const struct pam_message **msg,
1124                 struct pam_response **resp,
1125                 void *appdata_ptr) {
1126
1127         /* We don't support conversations */
1128
1129         return PAM_CONV_ERR;
1130 }
1131
1132 #endif
1133
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that closes it again.
 *
 * name    - PAM service name, passed to pam_start()
 * user    - user name, passed to pam_start()
 * uid/gid - credentials the helper process drops to before waiting
 * tty     - TTY to register as PAM_TTY; if NULL, the TTY behind stdin is used when there is one
 * env     - environment block; every entry is pushed into PAM with pam_putenv(), and on success the block is
 *           replaced by the list returned from pam_getenvlist()
 * fds     - n_fds file descriptors that are closed in the helper process
 *
 * On success the result of strv_free_and_replace() is returned. On failure an error is logged and a negative
 * error is returned: -EPERM for PAM-level failures, otherwise the errno-style error of the failing call. When
 * built without PAM support this function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence check the return
                                 * value itself, so that "sig" is only looked at when it was filled in. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* A PAM-level failure is reported with PAM's own error string; anything else is an errno-style
         * error that is already stored in r. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1349
/* Renames the current process to "(<name>)", where <name> is the last path component of "path", cut down to
 * its final 8 characters if it is longer. If the last component is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *tail;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: two parentheses plus at most 8 characters of the name. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name, it is usually the more interesting part since the beginning
                 * might just be "systemd-". */
                tail += n - 8;
                n = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, tail, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = 0;

        rename_process(process_name);
}
1380
1381 static bool context_has_address_families(const ExecContext *c) {
1382         assert(c);
1383
1384         return c->address_families_whitelist ||
1385                 !set_isempty(c->address_families);
1386 }
1387
1388 static bool context_has_syscall_filters(const ExecContext *c) {
1389         assert(c);
1390
1391         return c->syscall_whitelist ||
1392                 !hashmap_isempty(c->syscall_filter);
1393 }
1394
/* Decides whether the no-new-privileges flag is needed for this context. It is needed if it was requested
 * explicitly (c->no_new_privileges). Otherwise it is not needed if we have CAP_SYS_ADMIN in the effective set,
 * and it is needed if we lack it and any of the settings checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* if we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c) ||
            c->memory_deny_write_execute ||
            c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c) ||
            !set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1416
1417 #if HAVE_SECCOMP
1418
1419 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1420
1421         if (is_seccomp_available())
1422                 return false;
1423
1424         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1425         return true;
1426 }
1427
1428 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1429         uint32_t negative_action, default_action, action;
1430         int r;
1431
1432         assert(u);
1433         assert(c);
1434
1435         if (!context_has_syscall_filters(c))
1436                 return 0;
1437
1438         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1439                 return 0;
1440
1441         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1442
1443         if (c->syscall_whitelist) {
1444                 default_action = negative_action;
1445                 action = SCMP_ACT_ALLOW;
1446         } else {
1447                 default_action = SCMP_ACT_ALLOW;
1448                 action = negative_action;
1449         }
1450
1451         if (needs_ambient_hack) {
1452                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1453                 if (r < 0)
1454                         return r;
1455         }
1456
1457         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1458 }
1459
1460 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1461         assert(u);
1462         assert(c);
1463
1464         if (set_isempty(c->syscall_archs))
1465                 return 0;
1466
1467         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1468                 return 0;
1469
1470         return seccomp_restrict_archs(c->syscall_archs);
1471 }
1472
1473 static int apply_address_families(const Unit* u, const ExecContext *c) {
1474         assert(u);
1475         assert(c);
1476
1477         if (!context_has_address_families(c))
1478                 return 0;
1479
1480         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1481                 return 0;
1482
1483         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1484 }
1485
1486 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1487         assert(u);
1488         assert(c);
1489
1490         if (!c->memory_deny_write_execute)
1491                 return 0;
1492
1493         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1494                 return 0;
1495
1496         return seccomp_memory_deny_write_execute();
1497 }
1498
1499 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1500         assert(u);
1501         assert(c);
1502
1503         if (!c->restrict_realtime)
1504                 return 0;
1505
1506         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1507                 return 0;
1508
1509         return seccomp_restrict_realtime();
1510 }
1511
1512 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1513         assert(u);
1514         assert(c);
1515
1516         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1517          * let's protect even those systems where this is left on in the kernel. */
1518
1519         if (!c->protect_kernel_tunables)
1520                 return 0;
1521
1522         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1523                 return 0;
1524
1525         return seccomp_protect_sysctl();
1526 }
1527
1528 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1529         assert(u);
1530         assert(c);
1531
1532         /* Turn off module syscalls on ProtectKernelModules=yes */
1533
1534         if (!c->protect_kernel_modules)
1535                 return 0;
1536
1537         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1538                 return 0;
1539
1540         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1541 }
1542
1543 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1544         assert(u);
1545         assert(c);
1546
1547         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1548
1549         if (!c->private_devices)
1550                 return 0;
1551
1552         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1553                 return 0;
1554
1555         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1556 }
1557
1558 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1559         assert(u);
1560         assert(c);
1561
1562         if (!exec_context_restrict_namespaces_set(c))
1563                 return 0;
1564
1565         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1566                 return 0;
1567
1568         return seccomp_restrict_namespaces(c->restrict_namespaces);
1569 }
1570
1571 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1572         unsigned long personality;
1573         int r;
1574
1575         assert(u);
1576         assert(c);
1577
1578         if (!c->lock_personality)
1579                 return 0;
1580
1581         if (skip_seccomp_unavailable(u, "LockPersonality="))
1582                 return 0;
1583
1584         personality = c->personality;
1585
1586         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1587         if (personality == PERSONALITY_INVALID) {
1588
1589                 r = opinionated_personality(&personality);
1590                 if (r < 0)
1591                         return r;
1592         }
1593
1594         return seccomp_lock_personality(personality);
1595 }
1596
1597 #endif
1598
1599 static void do_idle_pipe_dance(int idle_pipe[4]) {
1600         assert(idle_pipe);
1601
1602         idle_pipe[1] = safe_close(idle_pipe[1]);
1603         idle_pipe[2] = safe_close(idle_pipe[2]);
1604
1605         if (idle_pipe[0] >= 0) {
1606                 int r;
1607
1608                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1609
1610                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1611                         ssize_t n;
1612
1613                         /* Signal systemd that we are bored and want to continue. */
1614                         n = write(idle_pipe[3], "x", 1);
1615                         if (n > 0)
1616                                 /* Wait for systemd to react to the signal above. */
1617                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1618                 }
1619
1620                 idle_pipe[0] = safe_close(idle_pipe[0]);
1621
1622         }
1623
1624         idle_pipe[3] = safe_close(idle_pipe[3]);
1625 }
1626
1627 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1628
1629 static int build_environment(
1630                 const Unit *u,
1631                 const ExecContext *c,
1632                 const ExecParameters *p,
1633                 size_t n_fds,
1634                 const char *home,
1635                 const char *username,
1636                 const char *shell,
1637                 dev_t journal_stream_dev,
1638                 ino_t journal_stream_ino,
1639                 char ***ret) {
1640
1641         _cleanup_strv_free_ char **our_env = NULL;
1642         ExecDirectoryType t;
1643         size_t n_env = 0;
1644         char *x;
1645
1646         assert(u);
1647         assert(c);
1648         assert(p);
1649         assert(ret);
1650
1651         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1652         if (!our_env)
1653                 return -ENOMEM;
1654
1655         if (n_fds > 0) {
1656                 _cleanup_free_ char *joined = NULL;
1657
1658                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1659                         return -ENOMEM;
1660                 our_env[n_env++] = x;
1661
1662                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1663                         return -ENOMEM;
1664                 our_env[n_env++] = x;
1665
1666                 joined = strv_join(p->fd_names, ":");
1667                 if (!joined)
1668                         return -ENOMEM;
1669
1670                 x = strjoin("LISTEN_FDNAMES=", joined);
1671                 if (!x)
1672                         return -ENOMEM;
1673                 our_env[n_env++] = x;
1674         }
1675
1676         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1677                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1678                         return -ENOMEM;
1679                 our_env[n_env++] = x;
1680
1681                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1682                         return -ENOMEM;
1683                 our_env[n_env++] = x;
1684         }
1685
1686         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1687          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1688          * check the database directly. */
1689         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1690                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1691                 if (!x)
1692                         return -ENOMEM;
1693                 our_env[n_env++] = x;
1694         }
1695
1696         if (home) {
1697                 x = strappend("HOME=", home);
1698                 if (!x)
1699                         return -ENOMEM;
1700                 our_env[n_env++] = x;
1701         }
1702
1703         if (username) {
1704                 x = strappend("LOGNAME=", username);
1705                 if (!x)
1706                         return -ENOMEM;
1707                 our_env[n_env++] = x;
1708
1709                 x = strappend("USER=", username);
1710                 if (!x)
1711                         return -ENOMEM;
1712                 our_env[n_env++] = x;
1713         }
1714
1715         if (shell) {
1716                 x = strappend("SHELL=", shell);
1717                 if (!x)
1718                         return -ENOMEM;
1719                 our_env[n_env++] = x;
1720         }
1721
1722         if (!sd_id128_is_null(u->invocation_id)) {
1723                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1724                         return -ENOMEM;
1725
1726                 our_env[n_env++] = x;
1727         }
1728
1729         if (exec_context_needs_term(c)) {
1730                 const char *tty_path, *term = NULL;
1731
1732                 tty_path = exec_context_tty_path(c);
1733
1734                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1735                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1736                  * passes to PID 1 ends up all the way in the console login shown. */
1737
1738                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1739                         term = getenv("TERM");
1740                 if (!term)
1741                         term = default_term_for_tty(tty_path);
1742
1743                 x = strappend("TERM=", term);
1744                 if (!x)
1745                         return -ENOMEM;
1746                 our_env[n_env++] = x;
1747         }
1748
1749         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1750                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1751                         return -ENOMEM;
1752
1753                 our_env[n_env++] = x;
1754         }
1755
1756         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1757                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1758                 const char *n;
1759
1760                 if (!p->prefix[t])
1761                         continue;
1762
1763                 if (strv_isempty(c->directories[t].paths))
1764                         continue;
1765
1766                 n = exec_directory_env_name_to_string(t);
1767                 if (!n)
1768                         continue;
1769
1770                 pre = strjoin(p->prefix[t], "/");
1771                 if (!pre)
1772                         return -ENOMEM;
1773
1774                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1775                 if (!joined)
1776                         return -ENOMEM;
1777
1778                 x = strjoin(n, "=", joined);
1779                 if (!x)
1780                         return -ENOMEM;
1781
1782                 our_env[n_env++] = x;
1783         }
1784
1785         our_env[n_env++] = NULL;
1786         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1787
1788         *ret = TAKE_PTR(our_env);
1789
1790         return 0;
1791 }
1792
1793 static int build_pass_environment(const ExecContext *c, char ***ret) {
1794         _cleanup_strv_free_ char **pass_env = NULL;
1795         size_t n_env = 0, n_bufsize = 0;
1796         char **i;
1797
1798         STRV_FOREACH(i, c->pass_environment) {
1799                 _cleanup_free_ char *x = NULL;
1800                 char *v;
1801
1802                 v = getenv(*i);
1803                 if (!v)
1804                         continue;
1805                 x = strjoin(*i, "=", v);
1806                 if (!x)
1807                         return -ENOMEM;
1808
1809                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1810                         return -ENOMEM;
1811
1812                 pass_env[n_env++] = TAKE_PTR(x);
1813                 pass_env[n_env] = NULL;
1814         }
1815
1816         *ret = TAKE_PTR(pass_env);
1817
1818         return 0;
1819 }
1820
1821 static bool exec_needs_mount_namespace(
1822                 const ExecContext *context,
1823                 const ExecParameters *params,
1824                 const ExecRuntime *runtime) {
1825
1826         assert(context);
1827         assert(params);
1828
1829         if (context->root_image)
1830                 return true;
1831
1832         if (!strv_isempty(context->read_write_paths) ||
1833             !strv_isempty(context->read_only_paths) ||
1834             !strv_isempty(context->inaccessible_paths))
1835                 return true;
1836
1837         if (context->n_bind_mounts > 0)
1838                 return true;
1839
1840         if (context->n_temporary_filesystems > 0)
1841                 return true;
1842
1843         if (context->mount_flags != 0)
1844                 return true;
1845
1846         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1847                 return true;
1848
1849         if (context->private_devices ||
1850             context->private_mounts ||
1851             context->protect_system != PROTECT_SYSTEM_NO ||
1852             context->protect_home != PROTECT_HOME_NO ||
1853             context->protect_kernel_tunables ||
1854             context->protect_kernel_modules ||
1855             context->protect_control_groups)
1856                 return true;
1857
1858         if (context->root_directory) {
1859                 ExecDirectoryType t;
1860
1861                 if (context->mount_apivfs)
1862                         return true;
1863
1864                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1865                         if (!params->prefix[t])
1866                                 continue;
1867
1868                         if (!strv_isempty(context->directories[t].paths))
1869                                 return true;
1870                 }
1871         }
1872
1873         if (context->dynamic_user &&
1874             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1875              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1876              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1877                 return true;
1878
1879         return false;
1880 }
1881
1882 static int setup_private_users(uid_t uid, gid_t gid) {
1883         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1884         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1885         _cleanup_close_ int unshare_ready_fd = -1;
1886         _cleanup_(sigkill_waitp) pid_t pid = 0;
1887         uint64_t c = 1;
1888         ssize_t n;
1889         int r;
1890
1891         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1892          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1893          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1894          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1895          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1896          * continues execution normally. */
1897
1898         if (uid != 0 && uid_is_valid(uid)) {
1899                 r = asprintf(&uid_map,
1900                              "0 0 1\n"                      /* Map root → root */
1901                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1902                              uid, uid);
1903                 if (r < 0)
1904                         return -ENOMEM;
1905         } else {
1906                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1907                 if (!uid_map)
1908                         return -ENOMEM;
1909         }
1910
1911         if (gid != 0 && gid_is_valid(gid)) {
1912                 r = asprintf(&gid_map,
1913                              "0 0 1\n"                      /* Map root → root */
1914                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1915                              gid, gid);
1916                 if (r < 0)
1917                         return -ENOMEM;
1918         } else {
1919                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1920                 if (!gid_map)
1921                         return -ENOMEM;
1922         }
1923
1924         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1925          * namespace. */
1926         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1927         if (unshare_ready_fd < 0)
1928                 return -errno;
1929
1930         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1931          * failed. */
1932         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1933                 return -errno;
1934
1935         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1936         if (r < 0)
1937                 return r;
1938         if (r == 0) {
1939                 _cleanup_close_ int fd = -1;
1940                 const char *a;
1941                 pid_t ppid;
1942
1943                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1944                  * here, after the parent opened its own user namespace. */
1945
1946                 ppid = getppid();
1947                 errno_pipe[0] = safe_close(errno_pipe[0]);
1948
1949                 /* Wait until the parent unshared the user namespace */
1950                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1951                         r = -errno;
1952                         goto child_fail;
1953                 }
1954
1955                 /* Disable the setgroups() system call in the child user namespace, for good. */
1956                 a = procfs_file_alloca(ppid, "setgroups");
1957                 fd = open(a, O_WRONLY|O_CLOEXEC);
1958                 if (fd < 0) {
1959                         if (errno != ENOENT) {
1960                                 r = -errno;
1961                                 goto child_fail;
1962                         }
1963
1964                         /* If the file is missing the kernel is too old, let's continue anyway. */
1965                 } else {
1966                         if (write(fd, "deny\n", 5) < 0) {
1967                                 r = -errno;
1968                                 goto child_fail;
1969                         }
1970
1971                         fd = safe_close(fd);
1972                 }
1973
1974                 /* First write the GID map */
1975                 a = procfs_file_alloca(ppid, "gid_map");
1976                 fd = open(a, O_WRONLY|O_CLOEXEC);
1977                 if (fd < 0) {
1978                         r = -errno;
1979                         goto child_fail;
1980                 }
1981                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1982                         r = -errno;
1983                         goto child_fail;
1984                 }
1985                 fd = safe_close(fd);
1986
1987                 /* The write the UID map */
1988                 a = procfs_file_alloca(ppid, "uid_map");
1989                 fd = open(a, O_WRONLY|O_CLOEXEC);
1990                 if (fd < 0) {
1991                         r = -errno;
1992                         goto child_fail;
1993                 }
1994                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1995                         r = -errno;
1996                         goto child_fail;
1997                 }
1998
1999                 _exit(EXIT_SUCCESS);
2000
2001         child_fail:
2002                 (void) write(errno_pipe[1], &r, sizeof(r));
2003                 _exit(EXIT_FAILURE);
2004         }
2005
2006         errno_pipe[1] = safe_close(errno_pipe[1]);
2007
2008         if (unshare(CLONE_NEWUSER) < 0)
2009                 return -errno;
2010
2011         /* Let the child know that the namespace is ready now */
2012         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2013                 return -errno;
2014
2015         /* Try to read an error code from the child */
2016         n = read(errno_pipe[0], &r, sizeof(r));
2017         if (n < 0)
2018                 return -errno;
2019         if (n == sizeof(r)) { /* an error code was sent to us */
2020                 if (r < 0)
2021                         return r;
2022                 return -EIO;
2023         }
2024         if (n != 0) /* on success we should have read 0 bytes */
2025                 return -EIO;
2026
2027         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2028         pid = 0;
2029         if (r < 0)
2030                 return r;
2031         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2032                 return -EIO;
2033
2034         return 0;
2035 }
2036
2037 static int setup_exec_directory(
2038                 const ExecContext *context,
2039                 const ExecParameters *params,
2040                 uid_t uid,
2041                 gid_t gid,
2042                 ExecDirectoryType type,
2043                 int *exit_status) {
2044
2045         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2046                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2047                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2048                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2049                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2050                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2051         };
2052         char **rt;
2053         int r;
2054
2055         assert(context);
2056         assert(params);
2057         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2058         assert(exit_status);
2059
2060         if (!params->prefix[type])
2061                 return 0;
2062
2063         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2064                 if (!uid_is_valid(uid))
2065                         uid = 0;
2066                 if (!gid_is_valid(gid))
2067                         gid = 0;
2068         }
2069
2070         STRV_FOREACH(rt, context->directories[type].paths) {
2071                 _cleanup_free_ char *p = NULL, *pp = NULL;
2072
2073                 p = strjoin(params->prefix[type], "/", *rt);
2074                 if (!p) {
2075                         r = -ENOMEM;
2076                         goto fail;
2077                 }
2078
2079                 r = mkdir_parents_label(p, 0755);
2080                 if (r < 0)
2081                         goto fail;
2082
2083                 if (context->dynamic_user &&
2084                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2085                         _cleanup_free_ char *private_root = NULL;
2086
2087                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2088                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2089                          * whose UID is later on reused. To lock this down we use the same trick used by container
2090                          * managers to prohibit host users to get access to files of the same UID in containers: we
2091                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2092                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2093                          * to make this directory permeable for the service itself.
2094                          *
2095                          * Specifically: for a service which wants a special directory "foo/" we first create a
2096                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2097                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2098                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2099                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2100                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2101                          * disabling the access boundary for the service and making sure it only gets access to the
2102                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2103                          *
2104                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2105                          * owned by the service itself.
2106                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2107                          * files or sockets with other services. */
2108
2109                         private_root = strjoin(params->prefix[type], "/private");
2110                         if (!private_root) {
2111                                 r = -ENOMEM;
2112                                 goto fail;
2113                         }
2114
2115                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2116                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2117                         if (r < 0)
2118                                 goto fail;
2119
2120                         pp = strjoin(private_root, "/", *rt);
2121                         if (!pp) {
2122                                 r = -ENOMEM;
2123                                 goto fail;
2124                         }
2125
2126                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2127                         r = mkdir_parents_label(pp, 0755);
2128                         if (r < 0)
2129                                 goto fail;
2130
2131                         if (is_dir(p, false) > 0 &&
2132                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2133
2134                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2135                                  * it over. Most likely the service has been upgraded from one that didn't use
2136                                  * DynamicUser=1, to one that does. */
2137
2138                                 if (rename(p, pp) < 0) {
2139                                         r = -errno;
2140                                         goto fail;
2141                                 }
2142                         } else {
2143                                 /* Otherwise, create the actual directory for the service */
2144
2145                                 r = mkdir_label(pp, context->directories[type].mode);
2146                                 if (r < 0 && r != -EEXIST)
2147                                         goto fail;
2148                         }
2149
2150                         /* And link it up from the original place */
2151                         r = symlink_idempotent(pp, p, true);
2152                         if (r < 0)
2153                                 goto fail;
2154
2155                         /* Lock down the access mode */
2156                         if (chmod(pp, context->directories[type].mode) < 0) {
2157                                 r = -errno;
2158                                 goto fail;
2159                         }
2160                 } else {
2161                         r = mkdir_label(p, context->directories[type].mode);
2162                         if (r < 0 && r != -EEXIST)
2163                                 goto fail;
2164                         if (r == -EEXIST && !context->dynamic_user)
2165                                 continue;
2166                 }
2167
2168                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2169                  * a service, and shall not be writable. */
2170                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2171                         continue;
2172
2173                 /* Then, change the ownership of the whole tree, if necessary */
2174                 r = path_chown_recursive(pp ?: p, uid, gid);
2175                 if (r < 0)
2176                         goto fail;
2177         }
2178
2179         return 0;
2180
2181 fail:
2182         *exit_status = exit_status_table[type];
2183         return r;
2184 }
2185
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 * If the context configures a process label that one is used. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from the
 * command's binary is used, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A binary without a label, or a file system without label support is not an error */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2218
2219 static int compile_bind_mounts(
2220                 const ExecContext *context,
2221                 const ExecParameters *params,
2222                 BindMount **ret_bind_mounts,
2223                 size_t *ret_n_bind_mounts,
2224                 char ***ret_empty_directories) {
2225
2226         _cleanup_strv_free_ char **empty_directories = NULL;
2227         BindMount *bind_mounts;
2228         size_t n, h = 0, i;
2229         ExecDirectoryType t;
2230         int r;
2231
2232         assert(context);
2233         assert(params);
2234         assert(ret_bind_mounts);
2235         assert(ret_n_bind_mounts);
2236         assert(ret_empty_directories);
2237
2238         n = context->n_bind_mounts;
2239         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2240                 if (!params->prefix[t])
2241                         continue;
2242
2243                 n += strv_length(context->directories[t].paths);
2244         }
2245
2246         if (n <= 0) {
2247                 *ret_bind_mounts = NULL;
2248                 *ret_n_bind_mounts = 0;
2249                 *ret_empty_directories = NULL;
2250                 return 0;
2251         }
2252
2253         bind_mounts = new(BindMount, n);
2254         if (!bind_mounts)
2255                 return -ENOMEM;
2256
2257         for (i = 0; i < context->n_bind_mounts; i++) {
2258                 BindMount *item = context->bind_mounts + i;
2259                 char *s, *d;
2260
2261                 s = strdup(item->source);
2262                 if (!s) {
2263                         r = -ENOMEM;
2264                         goto finish;
2265                 }
2266
2267                 d = strdup(item->destination);
2268                 if (!d) {
2269                         free(s);
2270                         r = -ENOMEM;
2271                         goto finish;
2272                 }
2273
2274                 bind_mounts[h++] = (BindMount) {
2275                         .source = s,
2276                         .destination = d,
2277                         .read_only = item->read_only,
2278                         .recursive = item->recursive,
2279                         .ignore_enoent = item->ignore_enoent,
2280                 };
2281         }
2282
2283         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2284                 char **suffix;
2285
2286                 if (!params->prefix[t])
2287                         continue;
2288
2289                 if (strv_isempty(context->directories[t].paths))
2290                         continue;
2291
2292                 if (context->dynamic_user &&
2293                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2294                     !(context->root_directory || context->root_image)) {
2295                         char *private_root;
2296
2297                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2298                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2299                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2300
2301                         private_root = strjoin(params->prefix[t], "/private");
2302                         if (!private_root) {
2303                                 r = -ENOMEM;
2304                                 goto finish;
2305                         }
2306
2307                         r = strv_consume(&empty_directories, private_root);
2308                         if (r < 0)
2309                                 goto finish;
2310                 }
2311
2312                 STRV_FOREACH(suffix, context->directories[t].paths) {
2313                         char *s, *d;
2314
2315                         if (context->dynamic_user &&
2316                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2317                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2318                         else
2319                                 s = strjoin(params->prefix[t], "/", *suffix);
2320                         if (!s) {
2321                                 r = -ENOMEM;
2322                                 goto finish;
2323                         }
2324
2325                         if (context->dynamic_user &&
2326                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2327                             (context->root_directory || context->root_image))
2328                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2329                                  * directory is not created on the root directory. So, let's bind-mount the directory
2330                                  * on the 'non-private' place. */
2331                                 d = strjoin(params->prefix[t], "/", *suffix);
2332                         else
2333                                 d = strdup(s);
2334                         if (!d) {
2335                                 free(s);
2336                                 r = -ENOMEM;
2337                                 goto finish;
2338                         }
2339
2340                         bind_mounts[h++] = (BindMount) {
2341                                 .source = s,
2342                                 .destination = d,
2343                                 .read_only = false,
2344                                 .recursive = true,
2345                                 .ignore_enoent = false,
2346                         };
2347                 }
2348         }
2349
2350         assert(h == n);
2351
2352         *ret_bind_mounts = bind_mounts;
2353         *ret_n_bind_mounts = n;
2354         *ret_empty_directories = TAKE_PTR(empty_directories);
2355
2356         return (int) n;
2357
2358 finish:
2359         bind_mount_free_many(bind_mounts, h);
2360         return r;
2361 }
2362
2363 static int apply_mount_namespace(
2364                 const Unit *u,
2365                 const ExecCommand *command,
2366                 const ExecContext *context,
2367                 const ExecParameters *params,
2368                 const ExecRuntime *runtime) {
2369
2370         _cleanup_strv_free_ char **empty_directories = NULL;
2371         char *tmp = NULL, *var = NULL;
2372         const char *root_dir = NULL, *root_image = NULL;
2373         NamespaceInfo ns_info;
2374         bool needs_sandboxing;
2375         BindMount *bind_mounts = NULL;
2376         size_t n_bind_mounts = 0;
2377         int r;
2378
2379         assert(context);
2380
2381         /* The runtime struct only contains the parent of the private /tmp,
2382          * which is non-accessible to world users. Inside of it there's a /tmp
2383          * that is sticky, and that's the one we want to use here. */
2384
2385         if (context->private_tmp && runtime) {
2386                 if (runtime->tmp_dir)
2387                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2388                 if (runtime->var_tmp_dir)
2389                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2390         }
2391
2392         if (params->flags & EXEC_APPLY_CHROOT) {
2393                 root_image = context->root_image;
2394
2395                 if (!root_image)
2396                         root_dir = context->root_directory;
2397         }
2398
2399         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2400         if (r < 0)
2401                 return r;
2402
2403         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2404         if (needs_sandboxing)
2405                 ns_info = (NamespaceInfo) {
2406                         .ignore_protect_paths = false,
2407                         .private_dev = context->private_devices,
2408                         .protect_control_groups = context->protect_control_groups,
2409                         .protect_kernel_tunables = context->protect_kernel_tunables,
2410                         .protect_kernel_modules = context->protect_kernel_modules,
2411                         .mount_apivfs = context->mount_apivfs,
2412                         .private_mounts = context->private_mounts,
2413                 };
2414         else if (!context->dynamic_user && root_dir)
2415                 /*
2416                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2417                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2418                  * fail if we are enable to apply the sandbox inside the mount namespace.
2419                  */
2420                 ns_info = (NamespaceInfo) {
2421                         .ignore_protect_paths = true,
2422                 };
2423         else
2424                 ns_info = (NamespaceInfo) {};
2425
2426         r = setup_namespace(root_dir, root_image,
2427                             &ns_info, context->read_write_paths,
2428                             needs_sandboxing ? context->read_only_paths : NULL,
2429                             needs_sandboxing ? context->inaccessible_paths : NULL,
2430                             empty_directories,
2431                             bind_mounts,
2432                             n_bind_mounts,
2433                             context->temporary_filesystems,
2434                             context->n_temporary_filesystems,
2435                             tmp,
2436                             var,
2437                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2438                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2439                             context->mount_flags,
2440                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2441
2442         bind_mount_free_many(bind_mounts, n_bind_mounts);
2443
2444         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2445          * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
2446          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2447          * completely different execution environment. */
2448         if (r == -ENOANO) {
2449                 if (n_bind_mounts == 0 &&
2450                     context->n_temporary_filesystems == 0 &&
2451                     !root_dir && !root_image &&
2452                     !context->dynamic_user) {
2453                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2454                         return 0;
2455                 }
2456
2457                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2458                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2459                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2460
2461                 return -EOPNOTSUPP;
2462         }
2463
2464         return r;
2465 }
2466
2467 static int apply_working_directory(
2468                 const ExecContext *context,
2469                 const ExecParameters *params,
2470                 const char *home,
2471                 const bool needs_mount_ns,
2472                 int *exit_status) {
2473
2474         const char *d, *wd;
2475
2476         assert(context);
2477         assert(exit_status);
2478
2479         if (context->working_directory_home) {
2480
2481                 if (!home) {
2482                         *exit_status = EXIT_CHDIR;
2483                         return -ENXIO;
2484                 }
2485
2486                 wd = home;
2487
2488         } else if (context->working_directory)
2489                 wd = context->working_directory;
2490         else
2491                 wd = "/";
2492
2493         if (params->flags & EXEC_APPLY_CHROOT) {
2494                 if (!needs_mount_ns && context->root_directory)
2495                         if (chroot(context->root_directory) < 0) {
2496                                 *exit_status = EXIT_CHROOT;
2497                                 return -errno;
2498                         }
2499
2500                 d = wd;
2501         } else
2502                 d = prefix_roota(context->root_directory, wd);
2503
2504         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2505                 *exit_status = EXIT_CHDIR;
2506                 return -errno;
2507         }
2508
2509         return 0;
2510 }
2511
2512 static int setup_keyring(
2513                 const Unit *u,
2514                 const ExecContext *context,
2515                 const ExecParameters *p,
2516                 uid_t uid, gid_t gid) {
2517
2518         key_serial_t keyring;
2519         int r = 0;
2520         uid_t saved_uid;
2521         gid_t saved_gid;
2522
2523         assert(u);
2524         assert(context);
2525         assert(p);
2526
2527         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2528          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2529          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2530          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2531          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2532          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2533
2534         if (!(p->flags & EXEC_NEW_KEYRING))
2535                 return 0;
2536
2537         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2538                 return 0;
2539
2540         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2541          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2542          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2543          * & group is just as nasty as acquiring a reference to the user keyring. */
2544
2545         saved_uid = getuid();
2546         saved_gid = getgid();
2547
2548         if (gid_is_valid(gid) && gid != saved_gid) {
2549                 if (setregid(gid, -1) < 0)
2550                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2551         }
2552
2553         if (uid_is_valid(uid) && uid != saved_uid) {
2554                 if (setreuid(uid, -1) < 0) {
2555                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2556                         goto out;
2557                 }
2558         }
2559
2560         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2561         if (keyring == -1) {
2562                 if (errno == ENOSYS)
2563                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2564                 else if (IN_SET(errno, EACCES, EPERM))
2565                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2566                 else if (errno == EDQUOT)
2567                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2568                 else
2569                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2570
2571                 goto out;
2572         }
2573
2574         /* When requested link the user keyring into the session keyring. */
2575         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2576
2577                 if (keyctl(KEYCTL_LINK,
2578                            KEY_SPEC_USER_KEYRING,
2579                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2580                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2581                         goto out;
2582                 }
2583         }
2584
2585         /* Restore uid/gid back */
2586         if (uid_is_valid(uid) && uid != saved_uid) {
2587                 if (setreuid(saved_uid, -1) < 0) {
2588                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2589                         goto out;
2590                 }
2591         }
2592
2593         if (gid_is_valid(gid) && gid != saved_gid) {
2594                 if (setregid(saved_gid, -1) < 0)
2595                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2596         }
2597
2598         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2599         if (!sd_id128_is_null(u->invocation_id)) {
2600                 key_serial_t key;
2601
2602                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2603                 if (key == -1)
2604                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2605                 else {
2606                         if (keyctl(KEYCTL_SETPERM, key,
2607                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2608                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2609                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2610                 }
2611         }
2612
2613 out:
2614         /* Revert back uid & gid for the the last time, and exit */
2615         /* no extra logging, as only the first already reported error matters */
2616         if (getuid() != saved_uid)
2617                 (void) setreuid(saved_uid, -1);
2618
2619         if (getgid() != saved_gid)
2620                 (void) setregid(saved_gid, -1);
2621
2622         return r;
2623 }
2624
/* Appends the valid (i.e. non-negative) descriptors of the two-element "pair" to array[], in
 * order, bumping *n once per descriptor stored. A NULL pair is accepted and ignored. The caller
 * has to make sure array[] has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t slot;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (slot = 0; slot < 2; slot++)
                if (pair[slot] >= 0)
                        array[(*n)++] = pair[slot];
}
2637
2638 static int close_remaining_fds(
2639                 const ExecParameters *params,
2640                 const ExecRuntime *runtime,
2641                 const DynamicCreds *dcreds,
2642                 int user_lookup_fd,
2643                 int socket_fd,
2644                 int exec_fd,
2645                 int *fds, size_t n_fds) {
2646
2647         size_t n_dont_close = 0;
2648         int dont_close[n_fds + 12];
2649
2650         assert(params);
2651
2652         if (params->stdin_fd >= 0)
2653                 dont_close[n_dont_close++] = params->stdin_fd;
2654         if (params->stdout_fd >= 0)
2655                 dont_close[n_dont_close++] = params->stdout_fd;
2656         if (params->stderr_fd >= 0)
2657                 dont_close[n_dont_close++] = params->stderr_fd;
2658
2659         if (socket_fd >= 0)
2660                 dont_close[n_dont_close++] = socket_fd;
2661         if (exec_fd >= 0)
2662                 dont_close[n_dont_close++] = exec_fd;
2663         if (n_fds > 0) {
2664                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2665                 n_dont_close += n_fds;
2666         }
2667
2668         if (runtime)
2669                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2670
2671         if (dcreds) {
2672                 if (dcreds->user)
2673                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2674                 if (dcreds->group)
2675                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2676         }
2677
2678         if (user_lookup_fd >= 0)
2679                 dont_close[n_dont_close++] = user_lookup_fd;
2680
2681         return close_all_fds(dont_close, n_dont_close);
2682 }
2683
2684 static int send_user_lookup(
2685                 Unit *unit,
2686                 int user_lookup_fd,
2687                 uid_t uid,
2688                 gid_t gid) {
2689
2690         assert(unit);
2691
2692         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2693          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2694          * specified. */
2695
2696         if (user_lookup_fd < 0)
2697                 return 0;
2698
2699         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2700                 return 0;
2701
2702         if (writev(user_lookup_fd,
2703                (struct iovec[]) {
2704                            IOVEC_INIT(&uid, sizeof(uid)),
2705                            IOVEC_INIT(&gid, sizeof(gid)),
2706                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2707                 return -errno;
2708
2709         return 0;
2710 }
2711
2712 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2713         int r;
2714
2715         assert(c);
2716         assert(home);
2717         assert(buf);
2718
2719         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2720
2721         if (*home)
2722                 return 0;
2723
2724         if (!c->working_directory_home)
2725                 return 0;
2726
2727         if (uid == 0) {
2728                 /* Hardcode /root as home directory for UID 0 */
2729                 *home = "/root";
2730                 return 1;
2731         }
2732
2733         r = get_home_dir(buf);
2734         if (r < 0)
2735                 return r;
2736
2737         *home = *buf;
2738         return 1;
2739 }
2740
2741 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2742         _cleanup_strv_free_ char ** list = NULL;
2743         ExecDirectoryType t;
2744         int r;
2745
2746         assert(c);
2747         assert(p);
2748         assert(ret);
2749
2750         assert(c->dynamic_user);
2751
2752         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2753          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2754          * directories. */
2755
2756         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2757                 char **i;
2758
2759                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2760                         continue;
2761
2762                 if (!p->prefix[t])
2763                         continue;
2764
2765                 STRV_FOREACH(i, c->directories[t].paths) {
2766                         char *e;
2767
2768                         if (t == EXEC_DIRECTORY_RUNTIME)
2769                                 e = strjoin(p->prefix[t], "/", *i);
2770                         else
2771                                 e = strjoin(p->prefix[t], "/private/", *i);
2772                         if (!e)
2773                                 return -ENOMEM;
2774
2775                         r = strv_consume(&list, e);
2776                         if (r < 0)
2777                                 return r;
2778                 }
2779         }
2780
2781         *ret = TAKE_PTR(list);
2782
2783         return 0;
2784 }
2785
/* Forward declaration: exec_child() below uses this to build the command line string it shows
 * when asking for spawn confirmation. */
static char *exec_command_line(char **argv);
2787
2788 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2789         bool using_subcgroup;
2790         char *p;
2791
2792         assert(params);
2793         assert(ret);
2794
2795         if (!params->cgroup_path)
2796                 return -EINVAL;
2797
2798         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2799          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2800          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2801          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2802          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2803          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2804          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2805          * flag, which is only passed for the former statements, not for the latter. */
2806
2807         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2808         if (using_subcgroup)
2809                 p = strjoin(params->cgroup_path, "/.control");
2810         else
2811                 p = strdup(params->cgroup_path);
2812         if (!p)
2813                 return -ENOMEM;
2814
2815         *ret = p;
2816         return using_subcgroup;
2817 }
2818
2819 static int exec_child(
2820                 Unit *unit,
2821                 const ExecCommand *command,
2822                 const ExecContext *context,
2823                 const ExecParameters *params,
2824                 ExecRuntime *runtime,
2825                 DynamicCreds *dcreds,
2826                 int socket_fd,
2827                 int named_iofds[3],
2828                 int *fds,
2829                 size_t n_socket_fds,
2830                 size_t n_storage_fds,
2831                 char **files_env,
2832                 int user_lookup_fd,
2833                 int *exit_status) {
2834
2835         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2836         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2837         _cleanup_free_ gid_t *supplementary_gids = NULL;
2838         const char *username = NULL, *groupname = NULL;
2839         _cleanup_free_ char *home_buffer = NULL;
2840         const char *home = NULL, *shell = NULL;
2841         dev_t journal_stream_dev = 0;
2842         ino_t journal_stream_ino = 0;
2843         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2844                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2845                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2846                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2847 #if HAVE_SELINUX
2848         _cleanup_free_ char *mac_selinux_context_net = NULL;
2849         bool use_selinux = false;
2850 #endif
2851 #if ENABLE_SMACK
2852         bool use_smack = false;
2853 #endif
2854 #if HAVE_APPARMOR
2855         bool use_apparmor = false;
2856 #endif
2857         uid_t uid = UID_INVALID;
2858         gid_t gid = GID_INVALID;
2859         size_t n_fds;
2860         ExecDirectoryType dt;
2861         int secure_bits;
2862
2863         assert(unit);
2864         assert(command);
2865         assert(context);
2866         assert(params);
2867         assert(exit_status);
2868
2869         rename_process_from_path(command->path);
2870
2871         /* We reset exactly these signals, since they are the
2872          * only ones we set to SIG_IGN in the main daemon. All
2873          * others we leave untouched because we set them to
2874          * SIG_DFL or a valid handler initially, both of which
2875          * will be demoted to SIG_DFL. */
2876         (void) default_signals(SIGNALS_CRASH_HANDLER,
2877                                SIGNALS_IGNORE, -1);
2878
2879         if (context->ignore_sigpipe)
2880                 (void) ignore_signals(SIGPIPE, -1);
2881
2882         r = reset_signal_mask();
2883         if (r < 0) {
2884                 *exit_status = EXIT_SIGNAL_MASK;
2885                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2886         }
2887
2888         if (params->idle_pipe)
2889                 do_idle_pipe_dance(params->idle_pipe);
2890
2891         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2892          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2893          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2894          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2895
2896         log_forget_fds();
2897         log_set_open_when_needed(true);
2898
2899         /* In case anything used libc syslog(), close this here, too */
2900         closelog();
2901
2902         n_fds = n_socket_fds + n_storage_fds;
2903         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2904         if (r < 0) {
2905                 *exit_status = EXIT_FDS;
2906                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2907         }
2908
2909         if (!context->same_pgrp)
2910                 if (setsid() < 0) {
2911                         *exit_status = EXIT_SETSID;
2912                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2913                 }
2914
2915         exec_context_tty_reset(context, params);
2916
2917         if (unit_shall_confirm_spawn(unit)) {
2918                 const char *vc = params->confirm_spawn;
2919                 _cleanup_free_ char *cmdline = NULL;
2920
2921                 cmdline = exec_command_line(command->argv);
2922                 if (!cmdline) {
2923                         *exit_status = EXIT_MEMORY;
2924                         return log_oom();
2925                 }
2926
2927                 r = ask_for_confirmation(vc, unit, cmdline);
2928                 if (r != CONFIRM_EXECUTE) {
2929                         if (r == CONFIRM_PRETEND_SUCCESS) {
2930                                 *exit_status = EXIT_SUCCESS;
2931                                 return 0;
2932                         }
2933                         *exit_status = EXIT_CONFIRM;
2934                         log_unit_error(unit, "Execution cancelled by the user");
2935                         return -ECANCELED;
2936                 }
2937         }
2938
2939         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2940          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2941          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2942          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2943          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2944         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2945             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2946                 *exit_status = EXIT_MEMORY;
2947                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2948         }
2949
2950         if (context->dynamic_user && dcreds) {
2951                 _cleanup_strv_free_ char **suggested_paths = NULL;
2952
2953                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2954                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2955                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2956                         *exit_status = EXIT_USER;
2957                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2958                 }
2959
2960                 r = compile_suggested_paths(context, params, &suggested_paths);
2961                 if (r < 0) {
2962                         *exit_status = EXIT_MEMORY;
2963                         return log_oom();
2964                 }
2965
2966                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2967                 if (r < 0) {
2968                         *exit_status = EXIT_USER;
2969                         if (r == -EILSEQ) {
2970                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2971                                 return -EOPNOTSUPP;
2972                         }
2973                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2974                 }
2975
2976                 if (!uid_is_valid(uid)) {
2977                         *exit_status = EXIT_USER;
2978                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2979                         return -ESRCH;
2980                 }
2981
2982                 if (!gid_is_valid(gid)) {
2983                         *exit_status = EXIT_USER;
2984                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2985                         return -ESRCH;
2986                 }
2987
2988                 if (dcreds->user)
2989                         username = dcreds->user->name;
2990
2991         } else {
2992                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2993                 if (r < 0) {
2994                         *exit_status = EXIT_USER;
2995                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2996                 }
2997
2998                 r = get_fixed_group(context, &groupname, &gid);
2999                 if (r < 0) {
3000                         *exit_status = EXIT_GROUP;
3001                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3002                 }
3003         }
3004
3005         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3006         r = get_supplementary_groups(context, username, groupname, gid,
3007                                      &supplementary_gids, &ngids);
3008         if (r < 0) {
3009                 *exit_status = EXIT_GROUP;
3010                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3011         }
3012
3013         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3014         if (r < 0) {
3015                 *exit_status = EXIT_USER;
3016                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3017         }
3018
3019         user_lookup_fd = safe_close(user_lookup_fd);
3020
3021         r = acquire_home(context, uid, &home, &home_buffer);
3022         if (r < 0) {
3023                 *exit_status = EXIT_CHDIR;
3024                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3025         }
3026
        /* If a socket is connected to STDIN/STDOUT/STDERR, we
         * must be sure to drop O_NONBLOCK */
3029         if (socket_fd >= 0)
3030                 (void) fd_nonblock(socket_fd, false);
3031
3032         r = setup_input(context, params, socket_fd, named_iofds);
3033         if (r < 0) {
3034                 *exit_status = EXIT_STDIN;
3035                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3036         }
3037
3038         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3039         if (r < 0) {
3040                 *exit_status = EXIT_STDOUT;
3041                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3042         }
3043
3044         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3045         if (r < 0) {
3046                 *exit_status = EXIT_STDERR;
3047                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3048         }
3049
3050         if (params->cgroup_path) {
3051                 _cleanup_free_ char *p = NULL;
3052
3053                 r = exec_parameters_get_cgroup_path(params, &p);
3054                 if (r < 0) {
3055                         *exit_status = EXIT_CGROUP;
3056                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3057                 }
3058
3059                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3060                 if (r < 0) {
3061                         *exit_status = EXIT_CGROUP;
3062                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3063                 }
3064         }
3065
3066         if (context->oom_score_adjust_set) {
3067                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3068                  * prohibit write access to this file, and we shouldn't trip up over that. */
3069                 r = set_oom_score_adjust(context->oom_score_adjust);
3070                 if (IN_SET(r, -EPERM, -EACCES))
3071                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3072                 else if (r < 0) {
3073                         *exit_status = EXIT_OOM_ADJUST;
3074                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3075                 }
3076         }
3077
3078         if (context->nice_set)
3079                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3080                         *exit_status = EXIT_NICE;
3081                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3082                 }
3083
3084         if (context->cpu_sched_set) {
3085                 struct sched_param param = {
3086                         .sched_priority = context->cpu_sched_priority,
3087                 };
3088
3089                 r = sched_setscheduler(0,
3090                                        context->cpu_sched_policy |
3091                                        (context->cpu_sched_reset_on_fork ?
3092                                         SCHED_RESET_ON_FORK : 0),
3093                                        &param);
3094                 if (r < 0) {
3095                         *exit_status = EXIT_SETSCHEDULER;
3096                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3097                 }
3098         }
3099
3100         if (context->cpuset)
3101                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
3102                         *exit_status = EXIT_CPUAFFINITY;
3103                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3104                 }
3105
3106         if (context->ioprio_set)
3107                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3108                         *exit_status = EXIT_IOPRIO;
3109                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3110                 }
3111
3112         if (context->timer_slack_nsec != NSEC_INFINITY)
3113                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3114                         *exit_status = EXIT_TIMERSLACK;
3115                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3116                 }
3117
3118         if (context->personality != PERSONALITY_INVALID) {
3119                 r = safe_personality(context->personality);
3120                 if (r < 0) {
3121                         *exit_status = EXIT_PERSONALITY;
3122                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3123                 }
3124         }
3125
3126         if (context->utmp_id)
3127                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3128                                       context->tty_path,
3129                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3130                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3131                                       USER_PROCESS,
3132                                       username);
3133
3134         if (context->user) {
3135                 r = chown_terminal(STDIN_FILENO, uid);
3136                 if (r < 0) {
3137                         *exit_status = EXIT_STDIN;
3138                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3139                 }
3140         }
3141
3142         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3143          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3144          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3145          * touch a single hierarchy too. */
3146         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3147                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3148                 if (r < 0) {
3149                         *exit_status = EXIT_CGROUP;
3150                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3151                 }
3152         }
3153
3154         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3155                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3156                 if (r < 0)
3157                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3158         }
3159
3160         r = build_environment(
3161                         unit,
3162                         context,
3163                         params,
3164                         n_fds,
3165                         home,
3166                         username,
3167                         shell,
3168                         journal_stream_dev,
3169                         journal_stream_ino,
3170                         &our_env);
3171         if (r < 0) {
3172                 *exit_status = EXIT_MEMORY;
3173                 return log_oom();
3174         }
3175
3176         r = build_pass_environment(context, &pass_env);
3177         if (r < 0) {
3178                 *exit_status = EXIT_MEMORY;
3179                 return log_oom();
3180         }
3181
3182         accum_env = strv_env_merge(5,
3183                                    params->environment,
3184                                    our_env,
3185                                    pass_env,
3186                                    context->environment,
3187                                    files_env,
3188                                    NULL);
3189         if (!accum_env) {
3190                 *exit_status = EXIT_MEMORY;
3191                 return log_oom();
3192         }
3193         accum_env = strv_env_clean(accum_env);
3194
3195         (void) umask(context->umask);
3196
3197         r = setup_keyring(unit, context, params, uid, gid);
3198         if (r < 0) {
3199                 *exit_status = EXIT_KEYRING;
3200                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3201         }
3202
3203         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3204         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3205
3206         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3207         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3208
3209         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3210         if (needs_ambient_hack)
3211                 needs_setuid = false;
3212         else
3213                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3214
3215         if (needs_sandboxing) {
3216                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3217                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3218                  * impacting our own code paths. */
3219
3220 #if HAVE_SELINUX
3221                 use_selinux = mac_selinux_use();
3222 #endif
3223 #if ENABLE_SMACK
3224                 use_smack = mac_smack_use();
3225 #endif
3226 #if HAVE_APPARMOR
3227                 use_apparmor = mac_apparmor_use();
3228 #endif
3229         }
3230
3231         if (needs_setuid) {
3232                 if (context->pam_name && username) {
3233                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3234                         if (r < 0) {
3235                                 *exit_status = EXIT_PAM;
3236                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3237                         }
3238                 }
3239         }
3240
3241         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3242                 if (ns_type_supported(NAMESPACE_NET)) {
3243                         r = setup_netns(runtime->netns_storage_socket);
3244                         if (r < 0) {
3245                                 *exit_status = EXIT_NETWORK;
3246                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3247                         }
3248                 } else
3249                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3250         }
3251
3252         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3253         if (needs_mount_namespace) {
3254                 r = apply_mount_namespace(unit, command, context, params, runtime);
3255                 if (r < 0) {
3256                         *exit_status = EXIT_NAMESPACE;
3257                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3258                 }
3259         }
3260
3261         /* Drop groups as early as possible */
3262         if (needs_setuid) {
3263                 r = enforce_groups(gid, supplementary_gids, ngids);
3264                 if (r < 0) {
3265                         *exit_status = EXIT_GROUP;
3266                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3267                 }
3268         }
3269
3270         if (needs_sandboxing) {
3271 #if HAVE_SELINUX
3272                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3273                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3274                         if (r < 0) {
3275                                 *exit_status = EXIT_SELINUX_CONTEXT;
3276                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3277                         }
3278                 }
3279 #endif
3280
3281                 if (context->private_users) {
3282                         r = setup_private_users(uid, gid);
3283                         if (r < 0) {
3284                                 *exit_status = EXIT_USER;
3285                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3286                         }
3287                 }
3288         }
3289
3290         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3291          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3292          * however if we have it as we want to keep it open until the final execve(). */
3293
3294         if (params->exec_fd >= 0) {
3295                 exec_fd = params->exec_fd;
3296
3297                 if (exec_fd < 3 + (int) n_fds) {
3298                         int moved_fd;
3299
3300                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3301                          * process we are about to execute. */
3302
3303                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3304                         if (moved_fd < 0) {
3305                                 *exit_status = EXIT_FDS;
3306                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3307                         }
3308
3309                         safe_close(exec_fd);
3310                         exec_fd = moved_fd;
3311                 } else {
3312                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3313                         r = fd_cloexec(exec_fd, true);
3314                         if (r < 0) {
3315                                 *exit_status = EXIT_FDS;
3316                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3317                         }
3318                 }
3319
3320                 fds_with_exec_fd = newa(int, n_fds + 1);
3321                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3322                 fds_with_exec_fd[n_fds] = exec_fd;
3323                 n_fds_with_exec_fd = n_fds + 1;
3324         } else {
3325                 fds_with_exec_fd = fds;
3326                 n_fds_with_exec_fd = n_fds;
3327         }
3328
3329         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3330         if (r >= 0)
3331                 r = shift_fds(fds, n_fds);
3332         if (r >= 0)
3333                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3334         if (r < 0) {
3335                 *exit_status = EXIT_FDS;
3336                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3337         }
3338
3339         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3340          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3341          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3342          * came this far. */
3343
3344         secure_bits = context->secure_bits;
3345
3346         if (needs_sandboxing) {
3347                 uint64_t bset;
3348                 int which_failed;
3349
3350                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3351                 if (r < 0) {
3352                         *exit_status = EXIT_LIMITS;
3353                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3354                 }
3355
3356                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3357                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3358                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3359                                 *exit_status = EXIT_LIMITS;
3360                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3361                         }
3362                 }
3363
3364 #if ENABLE_SMACK
3365                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3366                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3367                 if (use_smack) {
3368                         r = setup_smack(context, command);
3369                         if (r < 0) {
3370                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3371                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3372                         }
3373                 }
3374 #endif
3375
3376                 bset = context->capability_bounding_set;
3377                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3378                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3379                  * instead of us doing that */
3380                 if (needs_ambient_hack)
3381                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3382                                 (UINT64_C(1) << CAP_SETUID) |
3383                                 (UINT64_C(1) << CAP_SETGID);
3384
3385                 if (!cap_test_all(bset)) {
3386                         r = capability_bounding_set_drop(bset, false);
3387                         if (r < 0) {
3388                                 *exit_status = EXIT_CAPABILITIES;
3389                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3390                         }
3391                 }
3392
3393                 /* This is done before enforce_user, but ambient set
3394                  * does not survive over setresuid() if keep_caps is not set. */
3395                 if (!needs_ambient_hack &&
3396                     context->capability_ambient_set != 0) {
3397                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3398                         if (r < 0) {
3399                                 *exit_status = EXIT_CAPABILITIES;
3400                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3401                         }
3402                 }
3403         }
3404
3405         if (needs_setuid) {
3406                 if (context->user) {
3407                         r = enforce_user(context, uid);
3408                         if (r < 0) {
3409                                 *exit_status = EXIT_USER;
3410                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3411                         }
3412
3413                         if (!needs_ambient_hack &&
3414                             context->capability_ambient_set != 0) {
3415
3416                                 /* Fix the ambient capabilities after user change. */
3417                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3418                                 if (r < 0) {
3419                                         *exit_status = EXIT_CAPABILITIES;
3420                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3421                                 }
3422
3423                                 /* If we were asked to change user and ambient capabilities
3424                                  * were requested, we had to add keep-caps to the securebits
3425                                  * so that we would maintain the inherited capability set
3426                                  * through the setresuid(). Make sure that the bit is added
3427                                  * also to the context secure_bits so that we don't try to
3428                                  * drop the bit away next. */
3429
3430                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3431                         }
3432                 }
3433         }
3434
3435         /* Apply working directory here, because the working directory might be on NFS and only the user running
3436          * this service might have the correct privilege to change to the working directory */
3437         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3438         if (r < 0)
3439                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3440
3441         if (needs_sandboxing) {
3442                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3443                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3444                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3445                  * are restricted. */
3446
3447 #if HAVE_SELINUX
3448                 if (use_selinux) {
3449                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3450
3451                         if (exec_context) {
3452                                 r = setexeccon(exec_context);
3453                                 if (r < 0) {
3454                                         *exit_status = EXIT_SELINUX_CONTEXT;
3455                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3456                                 }
3457                         }
3458                 }
3459 #endif
3460
3461 #if HAVE_APPARMOR
3462                 if (use_apparmor && context->apparmor_profile) {
3463                         r = aa_change_onexec(context->apparmor_profile);
3464                         if (r < 0 && !context->apparmor_profile_ignore) {
3465                                 *exit_status = EXIT_APPARMOR_PROFILE;
3466                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3467                         }
3468                 }
3469 #endif
3470
3471                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3472                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3473                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3474                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3475                                 *exit_status = EXIT_SECUREBITS;
3476                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3477                         }
3478
3479                 if (context_has_no_new_privileges(context))
3480                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3481                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3482                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3483                         }
3484
3485 #if HAVE_SECCOMP
3486                 r = apply_address_families(unit, context);
3487                 if (r < 0) {
3488                         *exit_status = EXIT_ADDRESS_FAMILIES;
3489                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3490                 }
3491
3492                 r = apply_memory_deny_write_execute(unit, context);
3493                 if (r < 0) {
3494                         *exit_status = EXIT_SECCOMP;
3495                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3496                 }
3497
3498                 r = apply_restrict_realtime(unit, context);
3499                 if (r < 0) {
3500                         *exit_status = EXIT_SECCOMP;
3501                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3502                 }
3503
3504                 r = apply_restrict_namespaces(unit, context);
3505                 if (r < 0) {
3506                         *exit_status = EXIT_SECCOMP;
3507                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3508                 }
3509
3510                 r = apply_protect_sysctl(unit, context);
3511                 if (r < 0) {
3512                         *exit_status = EXIT_SECCOMP;
3513                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3514                 }
3515
3516                 r = apply_protect_kernel_modules(unit, context);
3517                 if (r < 0) {
3518                         *exit_status = EXIT_SECCOMP;
3519                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3520                 }
3521
3522                 r = apply_private_devices(unit, context);
3523                 if (r < 0) {
3524                         *exit_status = EXIT_SECCOMP;
3525                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3526                 }
3527
3528                 r = apply_syscall_archs(unit, context);
3529                 if (r < 0) {
3530                         *exit_status = EXIT_SECCOMP;
3531                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3532                 }
3533
3534                 r = apply_lock_personality(unit, context);
3535                 if (r < 0) {
3536                         *exit_status = EXIT_SECCOMP;
3537                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3538                 }
3539
3540                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3541                  * by the filter as little as possible. */
3542                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3543                 if (r < 0) {
3544                         *exit_status = EXIT_SECCOMP;
3545                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3546                 }
3547 #endif
3548         }
3549
3550         if (!strv_isempty(context->unset_environment)) {
3551                 char **ee = NULL;
3552
3553                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3554                 if (!ee) {
3555                         *exit_status = EXIT_MEMORY;
3556                         return log_oom();
3557                 }
3558
3559                 strv_free_and_replace(accum_env, ee);
3560         }
3561
3562         final_argv = replace_env_argv(command->argv, accum_env);
3563         if (!final_argv) {
3564                 *exit_status = EXIT_MEMORY;
3565                 return log_oom();
3566         }
3567
3568         if (DEBUG_LOGGING) {
3569                 _cleanup_free_ char *line;
3570
3571                 line = exec_command_line(final_argv);
3572                 if (line)
3573                         log_struct(LOG_DEBUG,
3574                                    "EXECUTABLE=%s", command->path,
3575                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3576                                    LOG_UNIT_ID(unit),
3577                                    LOG_UNIT_INVOCATION_ID(unit));
3578         }
3579
3580         if (exec_fd >= 0) {
3581                 uint8_t hot = 1;
3582
3583                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3584                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3585
3586                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3587                         *exit_status = EXIT_EXEC;
3588                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3589                 }
3590         }
3591
3592         execve(command->path, final_argv, accum_env);
3593         r = -errno;
3594
3595         if (exec_fd >= 0) {
3596                 uint8_t hot = 0;
3597
3598                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3599                  * that POLLHUP on it no longer means execve() succeeded. */
3600
3601                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3602                         *exit_status = EXIT_EXEC;
3603                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3604                 }
3605         }
3606
3607         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3608                 log_struct_errno(LOG_INFO, r,
3609                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3610                                  LOG_UNIT_ID(unit),
3611                                  LOG_UNIT_INVOCATION_ID(unit),
3612                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3613                                                   command->path),
3614                                  "EXECUTABLE=%s", command->path);
3615                 return 0;
3616         }
3617
3618         *exit_status = EXIT_EXEC;
3619         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3620 }
3621
/* Forward declarations of two file-local helpers that exec_spawn() below calls before forking: one fills in
 * a string vector of environment assignments, the other fills in the three named stdio fd slots. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3624
3625 int exec_spawn(Unit *unit,
3626                ExecCommand *command,
3627                const ExecContext *context,
3628                const ExecParameters *params,
3629                ExecRuntime *runtime,
3630                DynamicCreds *dcreds,
3631                pid_t *ret) {
3632
3633         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3634         _cleanup_free_ char *subcgroup_path = NULL;
3635         _cleanup_strv_free_ char **files_env = NULL;
3636         size_t n_storage_fds = 0, n_socket_fds = 0;
3637         _cleanup_free_ char *line = NULL;
3638         pid_t pid;
3639
3640         assert(unit);
3641         assert(command);
3642         assert(context);
3643         assert(ret);
3644         assert(params);
3645         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3646
3647         if (context->std_input == EXEC_INPUT_SOCKET ||
3648             context->std_output == EXEC_OUTPUT_SOCKET ||
3649             context->std_error == EXEC_OUTPUT_SOCKET) {
3650
3651                 if (params->n_socket_fds > 1) {
3652                         log_unit_error(unit, "Got more than one socket.");
3653                         return -EINVAL;
3654                 }
3655
3656                 if (params->n_socket_fds == 0) {
3657                         log_unit_error(unit, "Got no socket.");
3658                         return -EINVAL;
3659                 }
3660
3661                 socket_fd = params->fds[0];
3662         } else {
3663                 socket_fd = -1;
3664                 fds = params->fds;
3665                 n_socket_fds = params->n_socket_fds;
3666                 n_storage_fds = params->n_storage_fds;
3667         }
3668
3669         r = exec_context_named_iofds(context, params, named_iofds);
3670         if (r < 0)
3671                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3672
3673         r = exec_context_load_environment(unit, context, &files_env);
3674         if (r < 0)
3675                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3676
3677         line = exec_command_line(command->argv);
3678         if (!line)
3679                 return log_oom();
3680
3681         log_struct(LOG_DEBUG,
3682                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3683                    "EXECUTABLE=%s", command->path,
3684                    LOG_UNIT_ID(unit),
3685                    LOG_UNIT_INVOCATION_ID(unit));
3686
3687         if (params->cgroup_path) {
3688                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3689                 if (r < 0)
3690                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3691                 if (r > 0) { /* We are using a child cgroup */
3692                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3693                         if (r < 0)
3694                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3695                 }
3696         }
3697
3698         pid = fork();
3699         if (pid < 0)
3700                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3701
3702         if (pid == 0) {
3703                 int exit_status = EXIT_SUCCESS;
3704
3705                 r = exec_child(unit,
3706                                command,
3707                                context,
3708                                params,
3709                                runtime,
3710                                dcreds,
3711                                socket_fd,
3712                                named_iofds,
3713                                fds,
3714                                n_socket_fds,
3715                                n_storage_fds,
3716                                files_env,
3717                                unit->manager->user_lookup_fds[1],
3718                                &exit_status);
3719
3720                 if (r < 0)
3721                         log_struct_errno(LOG_ERR, r,
3722                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3723                                          LOG_UNIT_ID(unit),
3724                                          LOG_UNIT_INVOCATION_ID(unit),
3725                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3726                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3727                                                           command->path),
3728                                          "EXECUTABLE=%s", command->path);
3729
3730                 _exit(exit_status);
3731         }
3732
3733         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3734
3735         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3736          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3737          * process will be killed too). */
3738         if (subcgroup_path)
3739                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3740
3741         exec_status_start(&command->exec_status, pid);
3742
3743         *ret = pid;
3744         return 0;
3745 }
3746
3747 void exec_context_init(ExecContext *c) {
3748         ExecDirectoryType i;
3749
3750         assert(c);
3751
3752         c->umask = 0022;
3753         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3754         c->cpu_sched_policy = SCHED_OTHER;
3755         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3756         c->syslog_level_prefix = true;
3757         c->ignore_sigpipe = true;
3758         c->timer_slack_nsec = NSEC_INFINITY;
3759         c->personality = PERSONALITY_INVALID;
3760         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3761                 c->directories[i].mode = 0755;
3762         c->capability_bounding_set = CAP_ALL;
3763         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3764         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3765         c->log_level_max = -1;
3766 }
3767
3768 void exec_context_done(ExecContext *c) {
3769         ExecDirectoryType i;
3770         size_t l;
3771
3772         assert(c);
3773
3774         c->environment = strv_free(c->environment);
3775         c->environment_files = strv_free(c->environment_files);
3776         c->pass_environment = strv_free(c->pass_environment);
3777         c->unset_environment = strv_free(c->unset_environment);
3778
3779         rlimit_free_all(c->rlimit);
3780
3781         for (l = 0; l < 3; l++) {
3782                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3783                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3784         }
3785
3786         c->working_directory = mfree(c->working_directory);
3787         c->root_directory = mfree(c->root_directory);
3788         c->root_image = mfree(c->root_image);
3789         c->tty_path = mfree(c->tty_path);
3790         c->syslog_identifier = mfree(c->syslog_identifier);
3791         c->user = mfree(c->user);
3792         c->group = mfree(c->group);
3793
3794         c->supplementary_groups = strv_free(c->supplementary_groups);
3795
3796         c->pam_name = mfree(c->pam_name);
3797
3798         c->read_only_paths = strv_free(c->read_only_paths);
3799         c->read_write_paths = strv_free(c->read_write_paths);
3800         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3801
3802         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3803         c->bind_mounts = NULL;
3804         c->n_bind_mounts = 0;
3805         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3806         c->temporary_filesystems = NULL;
3807         c->n_temporary_filesystems = 0;
3808
3809         c->cpuset = cpu_set_mfree(c->cpuset);
3810
3811         c->utmp_id = mfree(c->utmp_id);
3812         c->selinux_context = mfree(c->selinux_context);
3813         c->apparmor_profile = mfree(c->apparmor_profile);
3814         c->smack_process_label = mfree(c->smack_process_label);
3815
3816         c->syscall_filter = hashmap_free(c->syscall_filter);
3817         c->syscall_archs = set_free(c->syscall_archs);
3818         c->address_families = set_free(c->address_families);
3819
3820         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3821                 c->directories[i].paths = strv_free(c->directories[i].paths);
3822
3823         c->log_level_max = -1;
3824
3825         exec_context_free_log_extra_fields(c);
3826
3827         c->log_rate_limit_interval_usec = 0;
3828         c->log_rate_limit_burst = 0;
3829
3830         c->stdin_data = mfree(c->stdin_data);
3831         c->stdin_data_size = 0;
3832 }
3833
3834 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3835         char **i;
3836
3837         assert(c);
3838
3839         if (!runtime_prefix)
3840                 return 0;
3841
3842         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3843                 _cleanup_free_ char *p;
3844
3845                 p = strjoin(runtime_prefix, "/", *i);
3846                 if (!p)
3847                         return -ENOMEM;
3848
3849                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3850                  * next. */
3851                 (void) rm_rf(p, REMOVE_ROOT);
3852         }
3853
3854         return 0;
3855 }
3856
3857 static void exec_command_done(ExecCommand *c) {
3858         assert(c);
3859
3860         c->path = mfree(c->path);
3861         c->argv = strv_free(c->argv);
3862 }
3863
3864 void exec_command_done_array(ExecCommand *c, size_t n) {
3865         size_t i;
3866
3867         for (i = 0; i < n; i++)
3868                 exec_command_done(c+i);
3869 }
3870
3871 ExecCommand* exec_command_free_list(ExecCommand *c) {
3872         ExecCommand *i;
3873
3874         while ((i = c)) {
3875                 LIST_REMOVE(command, c, i);
3876                 exec_command_done(i);
3877                 free(i);
3878         }
3879
3880         return NULL;
3881 }
3882
3883 void exec_command_free_array(ExecCommand **c, size_t n) {
3884         size_t i;
3885
3886         for (i = 0; i < n; i++)
3887                 c[i] = exec_command_free_list(c[i]);
3888 }
3889
3890 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3891         size_t i;
3892
3893         for (i = 0; i < n; i++)
3894                 exec_status_reset(&c[i].exec_status);
3895 }
3896
3897 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3898         size_t i;
3899
3900         for (i = 0; i < n; i++) {
3901                 ExecCommand *z;
3902
3903                 LIST_FOREACH(command, z, c[i])
3904                         exec_status_reset(&z->exec_status);
3905         }
3906 }
3907
/* Context handed to invalid_env() as userdata: says on whose behalf, and from which file, an
 * environment assignment was rejected. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file being processed; appended to the log message */
} InvalidEnvInfo;
3912
3913 static void invalid_env(const char *p, void *userdata) {
3914         InvalidEnvInfo *info = userdata;
3915
3916         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3917 }
3918
3919 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3920         assert(c);
3921
3922         switch (fd_index) {
3923
3924         case STDIN_FILENO:
3925                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3926                         return NULL;
3927
3928                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3929
3930         case STDOUT_FILENO:
3931                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3932                         return NULL;
3933
3934                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3935
3936         case STDERR_FILENO:
3937                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3938                         return NULL;
3939
3940                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3941
3942         default:
3943                 return NULL;
3944         }
3945 }
3946
3947 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3948         size_t i, targets;
3949         const char* stdio_fdname[3];
3950         size_t n_fds;
3951
3952         assert(c);
3953         assert(p);
3954
3955         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3956                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3957                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3958
3959         for (i = 0; i < 3; i++)
3960                 stdio_fdname[i] = exec_context_fdname(c, i);
3961
3962         n_fds = p->n_storage_fds + p->n_socket_fds;
3963
3964         for (i = 0; i < n_fds  && targets > 0; i++)
3965                 if (named_iofds[STDIN_FILENO] < 0 &&
3966                     c->std_input == EXEC_INPUT_NAMED_FD &&
3967                     stdio_fdname[STDIN_FILENO] &&
3968                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3969
3970                         named_iofds[STDIN_FILENO] = p->fds[i];
3971                         targets--;
3972
3973                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3974                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3975                            stdio_fdname[STDOUT_FILENO] &&
3976                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3977
3978                         named_iofds[STDOUT_FILENO] = p->fds[i];
3979                         targets--;
3980
3981                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3982                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3983                            stdio_fdname[STDERR_FILENO] &&
3984                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3985
3986                         named_iofds[STDERR_FILENO] = p->fds[i];
3987                         targets--;
3988                 }
3989
3990         return targets == 0 ? 0 : -ENOENT;
3991 }
3992
3993 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3994         char **i, **r = NULL;
3995
3996         assert(c);
3997         assert(l);
3998
3999         STRV_FOREACH(i, c->environment_files) {
4000                 char *fn;
4001                 int k;
4002                 unsigned n;
4003                 bool ignore = false;
4004                 char **p;
4005                 _cleanup_globfree_ glob_t pglob = {};
4006
4007                 fn = *i;
4008
4009                 if (fn[0] == '-') {
4010                         ignore = true;
4011                         fn++;
4012                 }
4013
4014                 if (!path_is_absolute(fn)) {
4015                         if (ignore)
4016                                 continue;
4017
4018                         strv_free(r);
4019                         return -EINVAL;
4020                 }
4021
4022                 /* Filename supports globbing, take all matching files */
4023                 k = safe_glob(fn, 0, &pglob);
4024                 if (k < 0) {
4025                         if (ignore)
4026                                 continue;
4027
4028                         strv_free(r);
4029                         return k;
4030                 }
4031
4032                 /* When we don't match anything, -ENOENT should be returned */
4033                 assert(pglob.gl_pathc > 0);
4034
4035                 for (n = 0; n < pglob.gl_pathc; n++) {
4036                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4037                         if (k < 0) {
4038                                 if (ignore)
4039                                         continue;
4040
4041                                 strv_free(r);
4042                                 return k;
4043                         }
4044                         /* Log invalid environment variables with filename */
4045                         if (p) {
4046                                 InvalidEnvInfo info = {
4047                                         .unit = unit,
4048                                         .path = pglob.gl_pathv[n]
4049                                 };
4050
4051                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4052                         }
4053
4054                         if (!r)
4055                                 r = p;
4056                         else {
4057                                 char **m;
4058
4059                                 m = strv_env_merge(2, r, p);
4060                                 strv_free(r);
4061                                 strv_free(p);
4062                                 if (!m)
4063                                         return -ENOMEM;
4064
4065                                 r = m;
4066                         }
4067                 }
4068         }
4069
4070         *l = r;
4071
4072         return 0;
4073 }
4074
4075 static bool tty_may_match_dev_console(const char *tty) {
4076         _cleanup_free_ char *resolved = NULL;
4077
4078         if (!tty)
4079                 return true;
4080
4081         tty = skip_dev_prefix(tty);
4082
4083         /* trivial identity? */
4084         if (streq(tty, "console"))
4085                 return true;
4086
4087         if (resolve_dev_console(&resolved) < 0)
4088                 return true; /* if we could not resolve, assume it may */
4089
4090         /* "tty0" means the active VC, so it may be the same sometimes */
4091         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4092 }
4093
4094 bool exec_context_may_touch_console(const ExecContext *ec) {
4095
4096         return (ec->tty_reset ||
4097                 ec->tty_vhangup ||
4098                 ec->tty_vt_disallocate ||
4099                 is_terminal_input(ec->std_input) ||
4100                 is_terminal_output(ec->std_output) ||
4101                 is_terminal_output(ec->std_error)) &&
4102                tty_may_match_dev_console(exec_context_tty_path(ec));
4103 }
4104
4105 static void strv_fprintf(FILE *f, char **l) {
4106         char **g;
4107
4108         assert(f);
4109
4110         STRV_FOREACH(g, l)
4111                 fprintf(f, " %s", *g);
4112 }
4113
4114 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4115         ExecDirectoryType dt;
4116         char **e, **d;
4117         unsigned i;
4118         int r;
4119
4120         assert(c);
4121         assert(f);
4122
4123         prefix = strempty(prefix);
4124
4125         fprintf(f,
4126                 "%sUMask: %04o\n"
4127                 "%sWorkingDirectory: %s\n"
4128                 "%sRootDirectory: %s\n"
4129                 "%sNonBlocking: %s\n"
4130                 "%sPrivateTmp: %s\n"
4131                 "%sPrivateDevices: %s\n"
4132                 "%sProtectKernelTunables: %s\n"
4133                 "%sProtectKernelModules: %s\n"
4134                 "%sProtectControlGroups: %s\n"
4135                 "%sPrivateNetwork: %s\n"
4136                 "%sPrivateUsers: %s\n"
4137                 "%sProtectHome: %s\n"
4138                 "%sProtectSystem: %s\n"
4139                 "%sMountAPIVFS: %s\n"
4140                 "%sIgnoreSIGPIPE: %s\n"
4141                 "%sMemoryDenyWriteExecute: %s\n"
4142                 "%sRestrictRealtime: %s\n"
4143                 "%sKeyringMode: %s\n",
4144                 prefix, c->umask,
4145                 prefix, c->working_directory ? c->working_directory : "/",
4146                 prefix, c->root_directory ? c->root_directory : "/",
4147                 prefix, yes_no(c->non_blocking),
4148                 prefix, yes_no(c->private_tmp),
4149                 prefix, yes_no(c->private_devices),
4150                 prefix, yes_no(c->protect_kernel_tunables),
4151                 prefix, yes_no(c->protect_kernel_modules),
4152                 prefix, yes_no(c->protect_control_groups),
4153                 prefix, yes_no(c->private_network),
4154                 prefix, yes_no(c->private_users),
4155                 prefix, protect_home_to_string(c->protect_home),
4156                 prefix, protect_system_to_string(c->protect_system),
4157                 prefix, yes_no(c->mount_apivfs),
4158                 prefix, yes_no(c->ignore_sigpipe),
4159                 prefix, yes_no(c->memory_deny_write_execute),
4160                 prefix, yes_no(c->restrict_realtime),
4161                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4162
4163         if (c->root_image)
4164                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4165
4166         STRV_FOREACH(e, c->environment)
4167                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4168
4169         STRV_FOREACH(e, c->environment_files)
4170                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4171
4172         STRV_FOREACH(e, c->pass_environment)
4173                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4174
4175         STRV_FOREACH(e, c->unset_environment)
4176                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4177
4178         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4179
4180         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4181                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4182
4183                 STRV_FOREACH(d, c->directories[dt].paths)
4184                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4185         }
4186
4187         if (c->nice_set)
4188                 fprintf(f,
4189                         "%sNice: %i\n",
4190                         prefix, c->nice);
4191
4192         if (c->oom_score_adjust_set)
4193                 fprintf(f,
4194                         "%sOOMScoreAdjust: %i\n",
4195                         prefix, c->oom_score_adjust);
4196
4197         for (i = 0; i < RLIM_NLIMITS; i++)
4198                 if (c->rlimit[i]) {
4199                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4200                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4201                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4202                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4203                 }
4204
4205         if (c->ioprio_set) {
4206                 _cleanup_free_ char *class_str = NULL;
4207
4208                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4209                 if (r >= 0)
4210                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4211
4212                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4213         }
4214
4215         if (c->cpu_sched_set) {
4216                 _cleanup_free_ char *policy_str = NULL;
4217
4218                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4219                 if (r >= 0)
4220                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4221
4222                 fprintf(f,
4223                         "%sCPUSchedulingPriority: %i\n"
4224                         "%sCPUSchedulingResetOnFork: %s\n",
4225                         prefix, c->cpu_sched_priority,
4226                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4227         }
4228
4229         if (c->cpuset) {
4230                 fprintf(f, "%sCPUAffinity:", prefix);
4231                 for (i = 0; i < c->cpuset_ncpus; i++)
4232                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4233                                 fprintf(f, " %u", i);
4234                 fputs("\n", f);
4235         }
4236
4237         if (c->timer_slack_nsec != NSEC_INFINITY)
4238                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4239
4240         fprintf(f,
4241                 "%sStandardInput: %s\n"
4242                 "%sStandardOutput: %s\n"
4243                 "%sStandardError: %s\n",
4244                 prefix, exec_input_to_string(c->std_input),
4245                 prefix, exec_output_to_string(c->std_output),
4246                 prefix, exec_output_to_string(c->std_error));
4247
4248         if (c->std_input == EXEC_INPUT_NAMED_FD)
4249                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4250         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4251                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4252         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4253                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4254
4255         if (c->std_input == EXEC_INPUT_FILE)
4256                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4257         if (c->std_output == EXEC_OUTPUT_FILE)
4258                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4259         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4260                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4261         if (c->std_error == EXEC_OUTPUT_FILE)
4262                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4263         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4264                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4265
4266         if (c->tty_path)
4267                 fprintf(f,
4268                         "%sTTYPath: %s\n"
4269                         "%sTTYReset: %s\n"
4270                         "%sTTYVHangup: %s\n"
4271                         "%sTTYVTDisallocate: %s\n",
4272                         prefix, c->tty_path,
4273                         prefix, yes_no(c->tty_reset),
4274                         prefix, yes_no(c->tty_vhangup),
4275                         prefix, yes_no(c->tty_vt_disallocate));
4276
4277         if (IN_SET(c->std_output,
4278                    EXEC_OUTPUT_SYSLOG,
4279                    EXEC_OUTPUT_KMSG,
4280                    EXEC_OUTPUT_JOURNAL,
4281                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4282                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4283                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4284             IN_SET(c->std_error,
4285                    EXEC_OUTPUT_SYSLOG,
4286                    EXEC_OUTPUT_KMSG,
4287                    EXEC_OUTPUT_JOURNAL,
4288                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4289                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4290                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4291
4292                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4293
4294                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4295                 if (r >= 0)
4296                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4297
4298                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4299                 if (r >= 0)
4300                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4301         }
4302
4303         if (c->log_level_max >= 0) {
4304                 _cleanup_free_ char *t = NULL;
4305
4306                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4307
4308                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4309         }
4310
4311         if (c->log_rate_limit_interval_usec > 0) {
4312                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4313
4314                 fprintf(f,
4315                         "%sLogRateLimitIntervalSec: %s\n",
4316                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4317         }
4318
4319         if (c->log_rate_limit_burst > 0)
4320                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4321
4322         if (c->n_log_extra_fields > 0) {
4323                 size_t j;
4324
4325                 for (j = 0; j < c->n_log_extra_fields; j++) {
4326                         fprintf(f, "%sLogExtraFields: ", prefix);
4327                         fwrite(c->log_extra_fields[j].iov_base,
4328                                1, c->log_extra_fields[j].iov_len,
4329                                f);
4330                         fputc('\n', f);
4331                 }
4332         }
4333
4334         if (c->secure_bits) {
4335                 _cleanup_free_ char *str = NULL;
4336
4337                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4338                 if (r >= 0)
4339                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4340         }
4341
4342         if (c->capability_bounding_set != CAP_ALL) {
4343                 _cleanup_free_ char *str = NULL;
4344
4345                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4346                 if (r >= 0)
4347                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4348         }
4349
4350         if (c->capability_ambient_set != 0) {
4351                 _cleanup_free_ char *str = NULL;
4352
4353                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4354                 if (r >= 0)
4355                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4356         }
4357
4358         if (c->user)
4359                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4360         if (c->group)
4361                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4362
4363         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4364
4365         if (!strv_isempty(c->supplementary_groups)) {
4366                 fprintf(f, "%sSupplementaryGroups:", prefix);
4367                 strv_fprintf(f, c->supplementary_groups);
4368                 fputs("\n", f);
4369         }
4370
4371         if (c->pam_name)
4372                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4373
4374         if (!strv_isempty(c->read_write_paths)) {
4375                 fprintf(f, "%sReadWritePaths:", prefix);
4376                 strv_fprintf(f, c->read_write_paths);
4377                 fputs("\n", f);
4378         }
4379
4380         if (!strv_isempty(c->read_only_paths)) {
4381                 fprintf(f, "%sReadOnlyPaths:", prefix);
4382                 strv_fprintf(f, c->read_only_paths);
4383                 fputs("\n", f);
4384         }
4385
4386         if (!strv_isempty(c->inaccessible_paths)) {
4387                 fprintf(f, "%sInaccessiblePaths:", prefix);
4388                 strv_fprintf(f, c->inaccessible_paths);
4389                 fputs("\n", f);
4390         }
4391
4392         if (c->n_bind_mounts > 0)
4393                 for (i = 0; i < c->n_bind_mounts; i++)
4394                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4395                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4396                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4397                                 c->bind_mounts[i].source,
4398                                 c->bind_mounts[i].destination,
4399                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4400
4401         if (c->n_temporary_filesystems > 0)
4402                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4403                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4404
4405                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4406                                 t->path,
4407                                 isempty(t->options) ? "" : ":",
4408                                 strempty(t->options));
4409                 }
4410
4411         if (c->utmp_id)
4412                 fprintf(f,
4413                         "%sUtmpIdentifier: %s\n",
4414                         prefix, c->utmp_id);
4415
4416         if (c->selinux_context)
4417                 fprintf(f,
4418                         "%sSELinuxContext: %s%s\n",
4419                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4420
4421         if (c->apparmor_profile)
4422                 fprintf(f,
4423                         "%sAppArmorProfile: %s%s\n",
4424                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4425
4426         if (c->smack_process_label)
4427                 fprintf(f,
4428                         "%sSmackProcessLabel: %s%s\n",
4429                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4430
4431         if (c->personality != PERSONALITY_INVALID)
4432                 fprintf(f,
4433                         "%sPersonality: %s\n",
4434                         prefix, strna(personality_to_string(c->personality)));
4435
4436         fprintf(f,
4437                 "%sLockPersonality: %s\n",
4438                 prefix, yes_no(c->lock_personality));
4439
4440         if (c->syscall_filter) {
4441 #if HAVE_SECCOMP
4442                 Iterator j;
4443                 void *id, *val;
4444                 bool first = true;
4445 #endif
4446
4447                 fprintf(f,
4448                         "%sSystemCallFilter: ",
4449                         prefix);
4450
4451                 if (!c->syscall_whitelist)
4452                         fputc('~', f);
4453
4454 #if HAVE_SECCOMP
4455                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4456                         _cleanup_free_ char *name = NULL;
4457                         const char *errno_name = NULL;
4458                         int num = PTR_TO_INT(val);
4459
4460                         if (first)
4461                                 first = false;
4462                         else
4463                                 fputc(' ', f);
4464
4465                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4466                         fputs(strna(name), f);
4467
4468                         if (num >= 0) {
4469                                 errno_name = errno_to_name(num);
4470                                 if (errno_name)
4471                                         fprintf(f, ":%s", errno_name);
4472                                 else
4473                                         fprintf(f, ":%d", num);
4474                         }
4475                 }
4476 #endif
4477
4478                 fputc('\n', f);
4479         }
4480
4481         if (c->syscall_archs) {
4482 #if HAVE_SECCOMP
4483                 Iterator j;
4484                 void *id;
4485 #endif
4486
4487                 fprintf(f,
4488                         "%sSystemCallArchitectures:",
4489                         prefix);
4490
4491 #if HAVE_SECCOMP
4492                 SET_FOREACH(id, c->syscall_archs, j)
4493                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4494 #endif
4495                 fputc('\n', f);
4496         }
4497
4498         if (exec_context_restrict_namespaces_set(c)) {
4499                 _cleanup_free_ char *s = NULL;
4500
4501                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4502                 if (r >= 0)
4503                         fprintf(f, "%sRestrictNamespaces: %s\n",
4504                                 prefix, s);
4505         }
4506
4507         if (c->syscall_errno > 0) {
4508                 const char *errno_name;
4509
4510                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4511
4512                 errno_name = errno_to_name(c->syscall_errno);
4513                 if (errno_name)
4514                         fprintf(f, "%s\n", errno_name);
4515                 else
4516                         fprintf(f, "%d\n", c->syscall_errno);
4517         }
4518
4519         if (c->apparmor_profile)
4520                 fprintf(f,
4521                         "%sAppArmorProfile: %s%s\n",
4522                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4523 }
4524
4525 bool exec_context_maintains_privileges(const ExecContext *c) {
4526         assert(c);
4527
4528         /* Returns true if the process forked off would run under
4529          * an unchanged UID or as root. */
4530
4531         if (!c->user)
4532                 return true;
4533
4534         if (streq(c->user, "root") || streq(c->user, "0"))
4535                 return true;
4536
4537         return false;
4538 }
4539
4540 int exec_context_get_effective_ioprio(const ExecContext *c) {
4541         int p;
4542
4543         assert(c);
4544
4545         if (c->ioprio_set)
4546                 return c->ioprio;
4547
4548         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4549         if (p < 0)
4550                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4551
4552         return p;
4553 }
4554
4555 void exec_context_free_log_extra_fields(ExecContext *c) {
4556         size_t l;
4557
4558         assert(c);
4559
4560         for (l = 0; l < c->n_log_extra_fields; l++)
4561                 free(c->log_extra_fields[l].iov_base);
4562         c->log_extra_fields = mfree(c->log_extra_fields);
4563         c->n_log_extra_fields = 0;
4564 }
4565
4566 void exec_status_start(ExecStatus *s, pid_t pid) {
4567         assert(s);
4568
4569         *s = (ExecStatus) {
4570                 .pid = pid,
4571         };
4572
4573         dual_timestamp_get(&s->start_timestamp);
4574 }
4575
4576 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4577         assert(s);
4578
4579         if (s->pid != pid) {
4580                 *s = (ExecStatus) {
4581                         .pid = pid,
4582                 };
4583         }
4584
4585         dual_timestamp_get(&s->exit_timestamp);
4586
4587         s->code = code;
4588         s->status = status;
4589
4590         if (context) {
4591                 if (context->utmp_id)
4592                         (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4593
4594                 exec_context_tty_reset(context, NULL);
4595         }
4596 }
4597
4598 void exec_status_reset(ExecStatus *s) {
4599         assert(s);
4600
4601         *s = (ExecStatus) {};
4602 }
4603
4604 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4605         char buf[FORMAT_TIMESTAMP_MAX];
4606
4607         assert(s);
4608         assert(f);
4609
4610         if (s->pid <= 0)
4611                 return;
4612
4613         prefix = strempty(prefix);
4614
4615         fprintf(f,
4616                 "%sPID: "PID_FMT"\n",
4617                 prefix, s->pid);
4618
4619         if (dual_timestamp_is_set(&s->start_timestamp))
4620                 fprintf(f,
4621                         "%sStart Timestamp: %s\n",
4622                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4623
4624         if (dual_timestamp_is_set(&s->exit_timestamp))
4625                 fprintf(f,
4626                         "%sExit Timestamp: %s\n"
4627                         "%sExit Code: %s\n"
4628                         "%sExit Status: %i\n",
4629                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4630                         prefix, sigchld_code_to_string(s->code),
4631                         prefix, s->status);
4632 }
4633
4634 static char *exec_command_line(char **argv) {
4635         size_t k;
4636         char *n, *p, **a;
4637         bool first = true;
4638
4639         assert(argv);
4640
4641         k = 1;
4642         STRV_FOREACH(a, argv)
4643                 k += strlen(*a)+3;
4644
4645         n = new(char, k);
4646         if (!n)
4647                 return NULL;
4648
4649         p = n;
4650         STRV_FOREACH(a, argv) {
4651
4652                 if (!first)
4653                         *(p++) = ' ';
4654                 else
4655                         first = false;
4656
4657                 if (strpbrk(*a, WHITESPACE)) {
4658                         *(p++) = '\'';
4659                         p = stpcpy(p, *a);
4660                         *(p++) = '\'';
4661                 } else
4662                         p = stpcpy(p, *a);
4663
4664         }
4665
4666         *p = 0;
4667
4668         /* FIXME: this doesn't really handle arguments that have
4669          * spaces and ticks in them */
4670
4671         return n;
4672 }
4673
4674 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4675         _cleanup_free_ char *cmd = NULL;
4676         const char *prefix2;
4677
4678         assert(c);
4679         assert(f);
4680
4681         prefix = strempty(prefix);
4682         prefix2 = strjoina(prefix, "\t");
4683
4684         cmd = exec_command_line(c->argv);
4685         fprintf(f,
4686                 "%sCommand Line: %s\n",
4687                 prefix, cmd ? cmd : strerror(ENOMEM));
4688
4689         exec_status_dump(&c->exec_status, f, prefix2);
4690 }
4691
4692 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4693         assert(f);
4694
4695         prefix = strempty(prefix);
4696
4697         LIST_FOREACH(command, c, c)
4698                 exec_command_dump(c, f, prefix);
4699 }
4700
4701 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4702         ExecCommand *end;
4703
4704         assert(l);
4705         assert(e);
4706
4707         if (*l) {
4708                 /* It's kind of important, that we keep the order here */
4709                 LIST_FIND_TAIL(command, *l, end);
4710                 LIST_INSERT_AFTER(command, *l, end, e);
4711         } else
4712               *l = e;
4713 }
4714
4715 int exec_command_set(ExecCommand *c, const char *path, ...) {
4716         va_list ap;
4717         char **l, *p;
4718
4719         assert(c);
4720         assert(path);
4721
4722         va_start(ap, path);
4723         l = strv_new_ap(path, ap);
4724         va_end(ap);
4725
4726         if (!l)
4727                 return -ENOMEM;
4728
4729         p = strdup(path);
4730         if (!p) {
4731                 strv_free(l);
4732                 return -ENOMEM;
4733         }
4734
4735         free_and_replace(c->path, p);
4736
4737         return strv_free_and_replace(c->argv, l);
4738 }
4739
4740 int exec_command_append(ExecCommand *c, const char *path, ...) {
4741         _cleanup_strv_free_ char **l = NULL;
4742         va_list ap;
4743         int r;
4744
4745         assert(c);
4746         assert(path);
4747
4748         va_start(ap, path);
4749         l = strv_new_ap(path, ap);
4750         va_end(ap);
4751
4752         if (!l)
4753                 return -ENOMEM;
4754
4755         r = strv_extend_strv(&c->argv, l, false);
4756         if (r < 0)
4757                 return r;
4758
4759         return 0;
4760 }
4761
4762 static void *remove_tmpdir_thread(void *p) {
4763         _cleanup_free_ char *path = p;
4764
4765         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4766         return NULL;
4767 }
4768
4769 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4770         int r;
4771
4772         if (!rt)
4773                 return NULL;
4774
4775         if (rt->manager)
4776                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4777
4778         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4779         if (destroy && rt->tmp_dir) {
4780                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4781
4782                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4783                 if (r < 0) {
4784                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4785                         free(rt->tmp_dir);
4786                 }
4787
4788                 rt->tmp_dir = NULL;
4789         }
4790
4791         if (destroy && rt->var_tmp_dir) {
4792                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4793
4794                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4795                 if (r < 0) {
4796                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4797                         free(rt->var_tmp_dir);
4798                 }
4799
4800                 rt->var_tmp_dir = NULL;
4801         }
4802
4803         rt->id = mfree(rt->id);
4804         rt->tmp_dir = mfree(rt->tmp_dir);
4805         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4806         safe_close_pair(rt->netns_storage_socket);
4807         return mfree(rt);
4808 }
4809
4810 static void exec_runtime_freep(ExecRuntime **rt) {
4811         if (*rt)
4812                 (void) exec_runtime_free(*rt, false);
4813 }
4814
4815 static int exec_runtime_allocate(ExecRuntime **rt) {
4816         assert(rt);
4817
4818         *rt = new0(ExecRuntime, 1);
4819         if (!*rt)
4820                 return -ENOMEM;
4821
4822         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4823         return 0;
4824 }
4825
4826 static int exec_runtime_add(
4827                 Manager *m,
4828                 const char *id,
4829                 const char *tmp_dir,
4830                 const char *var_tmp_dir,
4831                 const int netns_storage_socket[2],
4832                 ExecRuntime **ret) {
4833
4834         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4835         int r;
4836
4837         assert(m);
4838         assert(id);
4839
4840         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4841         if (r < 0)
4842                 return r;
4843
4844         r = exec_runtime_allocate(&rt);
4845         if (r < 0)
4846                 return r;
4847
4848         rt->id = strdup(id);
4849         if (!rt->id)
4850                 return -ENOMEM;
4851
4852         if (tmp_dir) {
4853                 rt->tmp_dir = strdup(tmp_dir);
4854                 if (!rt->tmp_dir)
4855                         return -ENOMEM;
4856
4857                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4858                 assert(var_tmp_dir);
4859                 rt->var_tmp_dir = strdup(var_tmp_dir);
4860                 if (!rt->var_tmp_dir)
4861                         return -ENOMEM;
4862         }
4863
4864         if (netns_storage_socket) {
4865                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4866                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4867         }
4868
4869         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4870         if (r < 0)
4871                 return r;
4872
4873         rt->manager = m;
4874
4875         if (ret)
4876                 *ret = rt;
4877
4878         /* do not remove created ExecRuntime object when the operation succeeds. */
4879         rt = NULL;
4880         return 0;
4881 }
4882
4883 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4884         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4885         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4886         int r;
4887
4888         assert(m);
4889         assert(c);
4890         assert(id);
4891
4892         /* It is not necessary to create ExecRuntime object. */
4893         if (!c->private_network && !c->private_tmp)
4894                 return 0;
4895
4896         if (c->private_tmp) {
4897                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4898                 if (r < 0)
4899                         return r;
4900         }
4901
4902         if (c->private_network) {
4903                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4904                         return -errno;
4905         }
4906
4907         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4908         if (r < 0)
4909                 return r;
4910
4911         /* Avoid cleanup */
4912         netns_storage_socket[0] = -1;
4913         netns_storage_socket[1] = -1;
4914         return 1;
4915 }
4916
4917 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4918         ExecRuntime *rt;
4919         int r;
4920
4921         assert(m);
4922         assert(id);
4923         assert(ret);
4924
4925         rt = hashmap_get(m->exec_runtime_by_id, id);
4926         if (rt)
4927                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4928                 goto ref;
4929
4930         if (!create)
4931                 return 0;
4932
4933         /* If not found, then create a new object. */
4934         r = exec_runtime_make(m, c, id, &rt);
4935         if (r <= 0)
4936                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4937                 return r;
4938
4939 ref:
4940         /* increment reference counter. */
4941         rt->n_ref++;
4942         *ret = rt;
4943         return 1;
4944 }
4945
4946 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4947         if (!rt)
4948                 return NULL;
4949
4950         assert(rt->n_ref > 0);
4951
4952         rt->n_ref--;
4953         if (rt->n_ref > 0)
4954                 return NULL;
4955
4956         return exec_runtime_free(rt, destroy);
4957 }
4958
4959 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4960         ExecRuntime *rt;
4961         Iterator i;
4962
4963         assert(m);
4964         assert(f);
4965         assert(fds);
4966
4967         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4968                 fprintf(f, "exec-runtime=%s", rt->id);
4969
4970                 if (rt->tmp_dir)
4971                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4972
4973                 if (rt->var_tmp_dir)
4974                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4975
4976                 if (rt->netns_storage_socket[0] >= 0) {
4977                         int copy;
4978
4979                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4980                         if (copy < 0)
4981                                 return copy;
4982
4983                         fprintf(f, " netns-socket-0=%i", copy);
4984                 }
4985
4986                 if (rt->netns_storage_socket[1] >= 0) {
4987                         int copy;
4988
4989                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4990                         if (copy < 0)
4991                                 return copy;
4992
4993                         fprintf(f, " netns-socket-1=%i", copy);
4994                 }
4995
4996                 fputc('\n', f);
4997         }
4998
4999         return 0;
5000 }
5001
5002 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5003         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5004         ExecRuntime *rt;
5005         int r;
5006
5007         /* This is for the migration from old (v237 or earlier) deserialization text.
5008          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5009          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5010          * so or not from the serialized text, then we always creates a new object owned by this. */
5011
5012         assert(u);
5013         assert(key);
5014         assert(value);
5015
5016         /* Manager manages ExecRuntime objects by the unit id.
5017          * So, we omit the serialized text when the unit does not have id (yet?)... */
5018         if (isempty(u->id)) {
5019                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5020                 return 0;
5021         }
5022
5023         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5024         if (r < 0) {
5025                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5026                 return 0;
5027         }
5028
5029         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5030         if (!rt) {
5031                 r = exec_runtime_allocate(&rt_create);
5032                 if (r < 0)
5033                         return log_oom();
5034
5035                 rt_create->id = strdup(u->id);
5036                 if (!rt_create->id)
5037                         return log_oom();
5038
5039                 rt = rt_create;
5040         }
5041
5042         if (streq(key, "tmp-dir")) {
5043                 char *copy;
5044
5045                 copy = strdup(value);
5046                 if (!copy)
5047                         return log_oom();
5048
5049                 free_and_replace(rt->tmp_dir, copy);
5050
5051         } else if (streq(key, "var-tmp-dir")) {
5052                 char *copy;
5053
5054                 copy = strdup(value);
5055                 if (!copy)
5056                         return log_oom();
5057
5058                 free_and_replace(rt->var_tmp_dir, copy);
5059
5060         } else if (streq(key, "netns-socket-0")) {
5061                 int fd;
5062
5063                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5064                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5065                         return 0;
5066                 }
5067
5068                 safe_close(rt->netns_storage_socket[0]);
5069                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5070
5071         } else if (streq(key, "netns-socket-1")) {
5072                 int fd;
5073
5074                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5075                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5076                         return 0;
5077                 }
5078
5079                 safe_close(rt->netns_storage_socket[1]);
5080                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5081         } else
5082                 return 0;
5083
5084         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5085         if (rt_create) {
5086                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5087                 if (r < 0) {
5088                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5089                         return 0;
5090                 }
5091
5092                 rt_create->manager = u->manager;
5093
5094                 /* Avoid cleanup */
5095                 rt_create = NULL;
5096         }
5097
5098         return 1;
5099 }
5100
5101 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5102         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5103         int r, fd0 = -1, fd1 = -1;
5104         const char *p, *v = value;
5105         size_t n;
5106
5107         assert(m);
5108         assert(value);
5109         assert(fds);
5110
5111         n = strcspn(v, " ");
5112         id = strndupa(v, n);
5113         if (v[n] != ' ')
5114                 goto finalize;
5115         p = v + n + 1;
5116
5117         v = startswith(p, "tmp-dir=");
5118         if (v) {
5119                 n = strcspn(v, " ");
5120                 tmp_dir = strndupa(v, n);
5121                 if (v[n] != ' ')
5122                         goto finalize;
5123                 p = v + n + 1;
5124         }
5125
5126         v = startswith(p, "var-tmp-dir=");
5127         if (v) {
5128                 n = strcspn(v, " ");
5129                 var_tmp_dir = strndupa(v, n);
5130                 if (v[n] != ' ')
5131                         goto finalize;
5132                 p = v + n + 1;
5133         }
5134
5135         v = startswith(p, "netns-socket-0=");
5136         if (v) {
5137                 char *buf;
5138
5139                 n = strcspn(v, " ");
5140                 buf = strndupa(v, n);
5141                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5142                         log_debug("Unable to process exec-runtime netns fd specification.");
5143                         return;
5144                 }
5145                 fd0 = fdset_remove(fds, fd0);
5146                 if (v[n] != ' ')
5147                         goto finalize;
5148                 p = v + n + 1;
5149         }
5150
5151         v = startswith(p, "netns-socket-1=");
5152         if (v) {
5153                 char *buf;
5154
5155                 n = strcspn(v, " ");
5156                 buf = strndupa(v, n);
5157                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5158                         log_debug("Unable to process exec-runtime netns fd specification.");
5159                         return;
5160                 }
5161                 fd1 = fdset_remove(fds, fd1);
5162         }
5163
5164 finalize:
5165
5166         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5167         if (r < 0)
5168                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5169 }
5170
5171 void exec_runtime_vacuum(Manager *m) {
5172         ExecRuntime *rt;
5173         Iterator i;
5174
5175         assert(m);
5176
5177         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5178
5179         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5180                 if (rt->n_ref > 0)
5181                         continue;
5182
5183                 (void) exec_runtime_free(rt, false);
5184         }
5185 }
5186
5187 void exec_params_clear(ExecParameters *p) {
5188         if (!p)
5189                 return;
5190
5191         strv_free(p->environment);
5192 }
5193
/* String name of each ExecInput value, indexed by the enum value. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

/* Presumably generates the exec_input string<->enum lookup helpers from the table above; see the
 * macro definition to confirm. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5206
/* String name of each ExecOutput value, indexed by the enum value. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
};

/* Presumably generates the exec_output string<->enum lookup helpers from the table above; see the
 * macro definition to confirm. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5224
/* String name of each ExecUtmpMode value, indexed by the enum value. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

/* Presumably generates the exec_utmp_mode string<->enum lookup helpers from the table above; see the
 * macro definition to confirm. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5232
/* String name of each ExecPreserveMode value, indexed by the enum value. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* NOTE(review): judging by the macro name, the generated parser additionally accepts boolean
 * spellings, with EXEC_PRESERVE_YES as the value for "true" - confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5240
/* String name of each ExecDirectoryType value, indexed by the enum value. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* Presumably generates the exec_directory_type string<->enum lookup helpers from the table above; see
 * the macro definition to confirm. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5250
/* A second string table indexed by ExecDirectoryType, holding upper-case names. Going by the table
 * name these are environment variable names - TODO confirm where they are consumed. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

/* NOTE(review): judging by the macro name, only a file-local enum-to-string helper is generated here
 * (no parser) - confirm against the macro definition. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5260
/* String name of each ExecKeyringMode value, indexed by the enum value. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

/* Presumably generates the exec_keyring_mode string<->enum lookup helpers from the table above; see
 * the macro definition to confirm. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);