src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-util.h"
  54 #include "errno-list.h"
  55 #include "execute.h"
  56 #include "exit-status.h"
  57 #include "fd-util.h"
  58 #include "fileio.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "missing.h"
  69 #include "mkdir.h"
  70 #include "namespace.h"
  71 #include "parse-util.h"
  72 #include "path-util.h"
  73 #include "process-util.h"
  74 #include "rlimit-util.h"
  75 #include "rm-rf.h"
  76 #if HAVE_SECCOMP
  77 #include "seccomp-util.h"
  78 #endif
  79 #include "securebits.h"
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "util.h"
  96 #include "utmp-wtmp.h"
  97
  98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 100
 101 /* This assumes there is a 'tty' group */
 102 #define TTY_MODE 0620
 103
 104 #define SNDBUF_SIZE (8*1024*1024)
 105
 106 static int shift_fds(int fds[], size_t n_fds) {
 107         int start, restart_from;
 108
 109         if (n_fds <= 0)
 110                 return 0;
 111
 112         /* Modifies the fds array! (sorts it) */
 113
 114         assert(fds);
 115
 116         start = 0;
 117         for (;;) {
 118                 int i;
 119
 120                 restart_from = -1;
 121
 122                 for (i = start; i < (int) n_fds; i++) {
 123                         int nfd;
 124
 125                         /* Already at right index? */
 126                         if (fds[i] == i+3)
 127                                 continue;
 128
 129                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 130                         if (nfd < 0)
 131                                 return -errno;
 132
 133                         safe_close(fds[i]);
 134                         fds[i] = nfd;
 135
 136                         /* Hmm, the fd we wanted isn't free? Then
 137                          * let's remember that and try again from here */
 138                         if (nfd != i+3 && restart_from < 0)
 139                                 restart_from = i;
 140                 }
 141
 142                 if (restart_from < 0)
 143                         break;
 144
 145                 start = restart_from;
 146         }
 147
 148         return 0;
 149 }
 150
 151 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 152         size_t i, n_fds;
 153         int r;
 154
 155         n_fds = n_socket_fds + n_storage_fds;
 156         if (n_fds <= 0)
 157                 return 0;
 158
 159         assert(fds);
 160
 161         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 162          * O_NONBLOCK only applies to socket activation though. */
 163
 164         for (i = 0; i < n_fds; i++) {
 165
 166                 if (i < n_socket_fds) {
 167                         r = fd_nonblock(fds[i], nonblock);
 168                         if (r < 0)
 169                                 return r;
 170                 }
 171
 172                 /* We unconditionally drop FD_CLOEXEC from the fds,
 173                  * since after all we want to pass these fds to our
 174                  * children */
 175
 176                 r = fd_cloexec(fds[i], false);
 177                 if (r < 0)
 178                         return r;
 179         }
 180
 181         return 0;
 182 }
 183
 184 static const char *exec_context_tty_path(const ExecContext *context) {
 185         assert(context);
 186
 187         if (context->stdio_as_fds)
 188                 return NULL;
 189
 190         if (context->tty_path)
 191                 return context->tty_path;
 192
 193         return "/dev/console";
 194 }
 195
 196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 197         const char *path;
 198
 199         assert(context);
 200
 201         path = exec_context_tty_path(context);
 202
 203         if (context->tty_vhangup) {
 204                 if (p && p->stdin_fd >= 0)
 205                         (void) terminal_vhangup_fd(p->stdin_fd);
 206                 else if (path)
 207                         (void) terminal_vhangup(path);
 208         }
 209
 210         if (context->tty_reset) {
 211                 if (p && p->stdin_fd >= 0)
 212                         (void) reset_terminal_fd(p->stdin_fd, true);
 213                 else if (path)
 214                         (void) reset_terminal(path);
 215         }
 216
 217         if (context->tty_vt_disallocate && path)
 218                 (void) vt_disallocate(path);
 219 }
 220
 221 static bool is_terminal_input(ExecInput i) {
 222         return IN_SET(i,
 223                       EXEC_INPUT_TTY,
 224                       EXEC_INPUT_TTY_FORCE,
 225                       EXEC_INPUT_TTY_FAIL);
 226 }
 227
 228 static bool is_terminal_output(ExecOutput o) {
 229         return IN_SET(o,
 230                       EXEC_OUTPUT_TTY,
 231                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 233                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 234 }
 235
 236 static bool is_syslog_output(ExecOutput o) {
 237         return IN_SET(o,
 238                       EXEC_OUTPUT_SYSLOG,
 239                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 240 }
 241
 242 static bool is_kmsg_output(ExecOutput o) {
 243         return IN_SET(o,
 244                       EXEC_OUTPUT_KMSG,
 245                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 246 }
 247
 248 static bool exec_context_needs_term(const ExecContext *c) {
 249         assert(c);
 250
 251         /* Return true if the execution context suggests we should set $TERM to something useful. */
 252
 253         if (is_terminal_input(c->std_input))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_output))
 257                 return true;
 258
 259         if (is_terminal_output(c->std_error))
 260                 return true;
 261
 262         return !!c->tty_path;
 263 }
 264
 265 static int open_null_as(int flags, int nfd) {
 266         int fd;
 267
 268         assert(nfd >= 0);
 269
 270         fd = open("/dev/null", flags|O_NOCTTY);
 271         if (fd < 0)
 272                 return -errno;
 273
 274         return move_fd(fd, nfd, false);
 275 }
 276
 277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 278         static const union sockaddr_union sa = {
 279                 .un.sun_family = AF_UNIX,
 280                 .un.sun_path = "/run/systemd/journal/stdout",
 281         };
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         int r;
 285
 286         if (gid_is_valid(gid)) {
 287                 oldgid = getgid();
 288
 289                 if (setegid(gid) < 0)
 290                         return -errno;
 291         }
 292
 293         if (uid_is_valid(uid)) {
 294                 olduid = getuid();
 295
 296                 if (seteuid(uid) < 0) {
 297                         r = -errno;
 298                         goto restore_gid;
 299                 }
 300         }
 301
 302         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 303
 304         /* If we fail to restore the uid or gid, things will likely
 305            fail later on. This should only happen if an LSM interferes. */
 306
 307         if (uid_is_valid(uid))
 308                 (void) seteuid(olduid);
 309
 310  restore_gid:
 311         if (gid_is_valid(gid))
 312                 (void) setegid(oldgid);
 313
 314         return r;
 315 }
 316
 317 static int connect_logger_as(
 318                 const Unit *unit,
 319                 const ExecContext *context,
 320                 const ExecParameters *params,
 321                 ExecOutput output,
 322                 const char *ident,
 323                 int nfd,
 324                 uid_t uid,
 325                 gid_t gid) {
 326
 327         int fd, r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0) {
 344                 safe_close(fd);
 345                 return -errno;
 346         }
 347
 348         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 349
 350         dprintf(fd,
 351                 "%s\n"
 352                 "%s\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n",
 358                 context->syslog_identifier ?: ident,
 359                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 360                 context->syslog_priority,
 361                 !!context->syslog_level_prefix,
 362                 is_syslog_output(output),
 363                 is_kmsg_output(output),
 364                 is_terminal_output(output));
 365
 366         return move_fd(fd, nfd, false);
 367 }
 368 static int open_terminal_as(const char *path, int flags, int nfd) {
 369         int fd;
 370
 371         assert(path);
 372         assert(nfd >= 0);
 373
 374         fd = open_terminal(path, flags | O_NOCTTY);
 375         if (fd < 0)
 376                 return fd;
 377
 378         return move_fd(fd, nfd, false);
 379 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa = {};
 383         _cleanup_close_ int fd = -1;
 384         int r, salen;
 385
 386         assert(path);
 387
 388         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 389                 flags |= O_CREAT;
 390
 391         fd = open(path, flags|O_NOCTTY, mode);
 392         if (fd >= 0)
 393                 return TAKE_FD(fd);
 394
 395         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 396                 return -errno;
 397         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 398                 return -ENXIO;
 399
 400         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 401
 402         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 403         if (fd < 0)
 404                 return -errno;
 405
 406         salen = sockaddr_un_set_path(&sa.un, path);
 407         if (salen < 0)
 408                 return salen;
 409
 410         if (connect(fd, &sa.sa, salen) < 0)
 411                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 412                                                            * indication that his wasn't an AF_UNIX socket after all */
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 return TAKE_FD(fd);
 420         if (r < 0)
 421                 return -errno;
 422
 423         return TAKE_FD(fd);
 424 }
 425
 426 static int fixup_input(
 427                 const ExecContext *context,
 428                 int socket_fd,
 429                 bool apply_tty_stdin) {
 430
 431         ExecInput std_input;
 432
 433         assert(context);
 434
 435         std_input = context->std_input;
 436
 437         if (is_terminal_input(std_input) && !apply_tty_stdin)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 444                 return EXEC_INPUT_NULL;
 445
 446         return std_input;
 447 }
 448
 449 static int fixup_output(ExecOutput std_output, int socket_fd) {
 450
 451         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 452                 return EXEC_OUTPUT_INHERIT;
 453
 454         return std_output;
 455 }
 456
 457 static int setup_input(
 458                 const ExecContext *context,
 459                 const ExecParameters *params,
 460                 int socket_fd,
 461                 int named_iofds[3]) {
 462
 463         ExecInput i;
 464
 465         assert(context);
 466         assert(params);
 467
 468         if (params->stdin_fd >= 0) {
 469                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 470                         return -errno;
 471
 472                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 473                 if (isatty(STDIN_FILENO)) {
 474                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 475                         (void) reset_terminal_fd(STDIN_FILENO, true);
 476                 }
 477
 478                 return STDIN_FILENO;
 479         }
 480
 481         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 482
 483         switch (i) {
 484
 485         case EXEC_INPUT_NULL:
 486                 return open_null_as(O_RDONLY, STDIN_FILENO);
 487
 488         case EXEC_INPUT_TTY:
 489         case EXEC_INPUT_TTY_FORCE:
 490         case EXEC_INPUT_TTY_FAIL: {
 491                 int fd;
 492
 493                 fd = acquire_terminal(exec_context_tty_path(context),
 494                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 495                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 496                                                                   ACQUIRE_TERMINAL_WAIT,
 497                                       USEC_INFINITY);
 498                 if (fd < 0)
 499                         return fd;
 500
 501                 return move_fd(fd, STDIN_FILENO, false);
 502         }
 503
 504         case EXEC_INPUT_SOCKET:
 505                 assert(socket_fd >= 0);
 506
 507                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 508
 509         case EXEC_INPUT_NAMED_FD:
 510                 assert(named_iofds[STDIN_FILENO] >= 0);
 511
 512                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 513                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 514
 515         case EXEC_INPUT_DATA: {
 516                 int fd;
 517
 518                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 519                 if (fd < 0)
 520                         return fd;
 521
 522                 return move_fd(fd, STDIN_FILENO, false);
 523         }
 524
 525         case EXEC_INPUT_FILE: {
 526                 bool rw;
 527                 int fd;
 528
 529                 assert(context->stdio_file[STDIN_FILENO]);
 530
 531                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 532                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 533
 534                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 535                 if (fd < 0)
 536                         return fd;
 537
 538                 return move_fd(fd, STDIN_FILENO, false);
 539         }
 540
 541         default:
 542                 assert_not_reached("Unknown input type");
 543         }
 544 }
 545
 546 static int setup_output(
 547                 const Unit *unit,
 548                 const ExecContext *context,
 549                 const ExecParameters *params,
 550                 int fileno,
 551                 int socket_fd,
 552                 int named_iofds[3],
 553                 const char *ident,
 554                 uid_t uid,
 555                 gid_t gid,
 556                 dev_t *journal_stream_dev,
 557                 ino_t *journal_stream_ino) {
 558
 559         ExecOutput o;
 560         ExecInput i;
 561         int r;
 562
 563         assert(unit);
 564         assert(context);
 565         assert(params);
 566         assert(ident);
 567         assert(journal_stream_dev);
 568         assert(journal_stream_ino);
 569
 570         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 571
 572                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 573                         return -errno;
 574
 575                 return STDOUT_FILENO;
 576         }
 577
 578         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 579                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 580                         return -errno;
 581
 582                 return STDERR_FILENO;
 583         }
 584
 585         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 586         o = fixup_output(context->std_output, socket_fd);
 587
 588         if (fileno == STDERR_FILENO) {
 589                 ExecOutput e;
 590                 e = fixup_output(context->std_error, socket_fd);
 591
 592                 /* This expects the input and output are already set up */
 593
 594                 /* Don't change the stderr file descriptor if we inherit all
 595                  * the way and are not on a tty */
 596                 if (e == EXEC_OUTPUT_INHERIT &&
 597                     o == EXEC_OUTPUT_INHERIT &&
 598                     i == EXEC_INPUT_NULL &&
 599                     !is_terminal_input(context->std_input) &&
 600                     getppid () != 1)
 601                         return fileno;
 602
 603                 /* Duplicate from stdout if possible */
 604                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 605                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 606
 607                 o = e;
 608
 609         } else if (o == EXEC_OUTPUT_INHERIT) {
 610                 /* If input got downgraded, inherit the original value */
 611                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 612                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 613
 614                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 615                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 616                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 617
 618                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 619                 if (getppid() != 1)
 620                         return fileno;
 621
 622                 /* We need to open /dev/null here anew, to get the right access mode. */
 623                 return open_null_as(O_WRONLY, fileno);
 624         }
 625
 626         switch (o) {
 627
 628         case EXEC_OUTPUT_NULL:
 629                 return open_null_as(O_WRONLY, fileno);
 630
 631         case EXEC_OUTPUT_TTY:
 632                 if (is_terminal_input(i))
 633                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 634
 635                 /* We don't reset the terminal if this is just about output */
 636                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 637
 638         case EXEC_OUTPUT_SYSLOG:
 639         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 640         case EXEC_OUTPUT_KMSG:
 641         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 642         case EXEC_OUTPUT_JOURNAL:
 643         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 644                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 645                 if (r < 0) {
 646                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 647                         r = open_null_as(O_WRONLY, fileno);
 648                 } else {
 649                         struct stat st;
 650
 651                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 652                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 653                          * services to detect whether they are connected to the journal or not.
 654                          *
 655                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 656                          * about STDERR as that's usually the best way to do logging. */
 657
 658                         if (fstat(fileno, &st) >= 0 &&
 659                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 660                                 *journal_stream_dev = st.st_dev;
 661                                 *journal_stream_ino = st.st_ino;
 662                         }
 663                 }
 664                 return r;
 665
 666         case EXEC_OUTPUT_SOCKET:
 667                 assert(socket_fd >= 0);
 668
 669                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 670
 671         case EXEC_OUTPUT_NAMED_FD:
 672                 assert(named_iofds[fileno] >= 0);
 673
 674                 (void) fd_nonblock(named_iofds[fileno], false);
 675                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 676
 677         case EXEC_OUTPUT_FILE:
 678         case EXEC_OUTPUT_FILE_APPEND: {
 679                 bool rw;
 680                 int fd, flags;
 681
 682                 assert(context->stdio_file[fileno]);
 683
 684                 rw = context->std_input == EXEC_INPUT_FILE &&
 685                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 686
 687                 if (rw)
 688                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 689
 690                 flags = O_WRONLY;
 691                 if (o == EXEC_OUTPUT_FILE_APPEND)
 692                         flags |= O_APPEND;
 693
 694                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 695
 696                 if (fd < 0)
 697                         return fd;
 698
 699                 return move_fd(fd, fileno, 0);
 700         }
 701
 702         default:
 703                 assert_not_reached("Unknown error type");
 704         }
 705 }
 706
 707 static int chown_terminal(int fd, uid_t uid) {
 708         struct stat st;
 709
 710         assert(fd >= 0);
 711
 712         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 713         if (isatty(fd) < 1)
 714                 return 0;
 715
 716         /* This might fail. What matters are the results. */
 717         (void) fchown(fd, uid, -1);
 718         (void) fchmod(fd, TTY_MODE);
 719
 720         if (fstat(fd, &st) < 0)
 721                 return -errno;
 722
 723         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 724                 return -EPERM;
 725
 726         return 0;
 727 }
 728
 729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 730         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 731         int r;
 732
 733         assert(_saved_stdin);
 734         assert(_saved_stdout);
 735
 736         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 737         if (saved_stdin < 0)
 738                 return -errno;
 739
 740         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 741         if (saved_stdout < 0)
 742                 return -errno;
 743
 744         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 745         if (fd < 0)
 746                 return fd;
 747
 748         r = chown_terminal(fd, getuid());
 749         if (r < 0)
 750                 return r;
 751
 752         r = reset_terminal_fd(fd, true);
 753         if (r < 0)
 754                 return r;
 755
 756         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 757         fd = -1;
 758         if (r < 0)
 759                 return r;
 760
 761         *_saved_stdin = saved_stdin;
 762         *_saved_stdout = saved_stdout;
 763
 764         saved_stdin = saved_stdout = -1;
 765
 766         return 0;
 767 }
 768
 769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 770         assert(err < 0);
 771
 772         if (err == -ETIMEDOUT)
 773                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 774         else {
 775                 errno = -err;
 776                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 777         }
 778 }
 779
 780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 781         _cleanup_close_ int fd = -1;
 782
 783         assert(vc);
 784
 785         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 786         if (fd < 0)
 787                 return;
 788
 789         write_confirm_error_fd(err, fd, u);
 790 }
 791
 792 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 793         int r = 0;
 794
 795         assert(saved_stdin);
 796         assert(saved_stdout);
 797
 798         release_terminal();
 799
 800         if (*saved_stdin >= 0)
 801                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 802                         r = -errno;
 803
 804         if (*saved_stdout >= 0)
 805                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 806                         r = -errno;
 807
 808         *saved_stdin = safe_close(*saved_stdin);
 809         *saved_stdout = safe_close(*saved_stdout);
 810
 811         return r;
 812 }
 813
 814 enum {
 815         CONFIRM_PRETEND_FAILURE = -1,
 816         CONFIRM_PRETEND_SUCCESS =  0,
 817         CONFIRM_EXECUTE = 1,
 818 };
 819
 820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 821         int saved_stdout = -1, saved_stdin = -1, r;
 822         _cleanup_free_ char *e = NULL;
 823         char c;
 824
 825         /* For any internal errors, assume a positive response. */
 826         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 827         if (r < 0) {
 828                 write_confirm_error(r, vc, u);
 829                 return CONFIRM_EXECUTE;
 830         }
 831
 832         /* confirm_spawn might have been disabled while we were sleeping. */
 833         if (manager_is_confirm_spawn_disabled(u->manager)) {
 834                 r = 1;
 835                 goto restore_stdio;
 836         }
 837
 838         e = ellipsize(cmdline, 60, 100);
 839         if (!e) {
 840                 log_oom();
 841                 r = CONFIRM_EXECUTE;
 842                 goto restore_stdio;
 843         }
 844
 845         for (;;) {
 846                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 847                 if (r < 0) {
 848                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 849                         r = CONFIRM_EXECUTE;
 850                         goto restore_stdio;
 851                 }
 852
 853                 switch (c) {
 854                 case 'c':
 855                         printf("Resuming normal execution.\n");
 856                         manager_disable_confirm_spawn();
 857                         r = 1;
 858                         break;
 859                 case 'D':
 860                         unit_dump(u, stdout, "  ");
 861                         continue; /* ask again */
 862                 case 'f':
 863                         printf("Failing execution.\n");
 864                         r = CONFIRM_PRETEND_FAILURE;
 865                         break;
 866                 case 'h':
 867                         printf("  c - continue, proceed without asking anymore\n"
 868                                "  D - dump, show the state of the unit\n"
 869                                "  f - fail, don't execute the command and pretend it failed\n"
 870                                "  h - help\n"
 871                                "  i - info, show a short summary of the unit\n"
 872                                "  j - jobs, show jobs that are in progress\n"
 873                                "  s - skip, don't execute the command and pretend it succeeded\n"
 874                                "  y - yes, execute the command\n");
 875                         continue; /* ask again */
 876                 case 'i':
 877                         printf("  Description: %s\n"
 878                                "  Unit:        %s\n"
 879                                "  Command:     %s\n",
 880                                u->id, u->description, cmdline);
 881                         continue; /* ask again */
 882                 case 'j':
 883                         manager_dump_jobs(u->manager, stdout, "  ");
 884                         continue; /* ask again */
 885                 case 'n':
 886                         /* 'n' was removed in favor of 'f'. */
 887                         printf("Didn't understand 'n', did you mean 'f'?\n");
 888                         continue; /* ask again */
 889                 case 's':
 890                         printf("Skipping execution.\n");
 891                         r = CONFIRM_PRETEND_SUCCESS;
 892                         break;
 893                 case 'y':
 894                         r = CONFIRM_EXECUTE;
 895                         break;
 896                 default:
 897                         assert_not_reached("Unhandled choice");
 898                 }
 899                 break;
 900         }
 901
 902 restore_stdio:
 903         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 904         return r;
 905 }
 906
 907 static int get_fixed_user(const ExecContext *c, const char **user,
 908                           uid_t *uid, gid_t *gid,
 909                           const char **home, const char **shell) {
 910         int r;
 911         const char *name;
 912
 913         assert(c);
 914
 915         if (!c->user)
 916                 return 0;
 917
 918         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 919          * (i.e. are "/" or "/bin/nologin"). */
 920
 921         name = c->user;
 922         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 923         if (r < 0)
 924                 return r;
 925
 926         *user = name;
 927         return 0;
 928 }
 929
 930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 931         int r;
 932         const char *name;
 933
 934         assert(c);
 935
 936         if (!c->group)
 937                 return 0;
 938
 939         name = c->group;
 940         r = get_group_creds(&name, gid, 0);
 941         if (r < 0)
 942                 return r;
 943
 944         *group = name;
 945         return 0;
 946 }
 947
 948 static int get_supplementary_groups(const ExecContext *c, const char *user,
 949                                     const char *group, gid_t gid,
 950                                     gid_t **supplementary_gids, int *ngids) {
 951         char **i;
 952         int r, k = 0;
 953         int ngroups_max;
 954         bool keep_groups = false;
 955         gid_t *groups = NULL;
 956         _cleanup_free_ gid_t *l_gids = NULL;
 957
 958         assert(c);
 959
 960         /*
 961          * If user is given, then lookup GID and supplementary groups list.
 962          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 963          * here and as early as possible so we keep the list of supplementary
 964          * groups of the caller.
 965          */
 966         if (user && gid_is_valid(gid) && gid != 0) {
 967                 /* First step, initialize groups from /etc/groups */
 968                 if (initgroups(user, gid) < 0)
 969                         return -errno;
 970
 971                 keep_groups = true;
 972         }
 973
 974         if (strv_isempty(c->supplementary_groups))
 975                 return 0;
 976
 977         /*
 978          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 979          * be positive, otherwise fail.
 980          */
 981         errno = 0;
 982         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 983         if (ngroups_max <= 0) {
 984                 if (errno > 0)
 985                         return -errno;
 986                 else
 987                         return -EOPNOTSUPP; /* For all other values */
 988         }
 989
 990         l_gids = new(gid_t, ngroups_max);
 991         if (!l_gids)
 992                 return -ENOMEM;
 993
 994         if (keep_groups) {
 995                 /*
 996                  * Lookup the list of groups that the user belongs to, we
 997                  * avoid NSS lookups here too for gid=0.
 998                  */
 999                 k = ngroups_max;
1000                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001                         return -EINVAL;
1002         } else
1003                 k = 0;
1004
1005         STRV_FOREACH(i, c->supplementary_groups) {
1006                 const char *g;
1007
1008                 if (k >= ngroups_max)
1009                         return -E2BIG;
1010
1011                 g = *i;
1012                 r = get_group_creds(&g, l_gids+k, 0);
1013                 if (r < 0)
1014                         return r;
1015
1016                 k++;
1017         }
1018
1019         /*
1020          * Sets ngids to zero to drop all supplementary groups, happens
1021          * when we are under root and SupplementaryGroups= is empty.
1022          */
1023         if (k == 0) {
1024                 *ngids = 0;
1025                 return 0;
1026         }
1027
1028         /* Otherwise get the final list of supplementary groups */
1029         groups = memdup(l_gids, sizeof(gid_t) * k);
1030         if (!groups)
1031                 return -ENOMEM;
1032
1033         *supplementary_gids = groups;
1034         *ngids = k;
1035
1036         groups = NULL;
1037
1038         return 0;
1039 }
1040
/* Applies the group credentials to the calling process: first the supplementary group list (only if it is
 * non-empty), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the error of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061         assert(context);
1062
1063         if (!uid_is_valid(uid))
1064                 return 0;
1065
1066         /* Sets (but doesn't look up) the uid and make sure we keep the
1067          * capabilities while doing so. */
1068
1069         if (context->capability_ambient_set != 0) {
1070
1071                 /* First step: If we need to keep capabilities but
1072                  * drop privileges we need to make sure we keep our
1073                  * caps, while we drop privileges. */
1074                 if (uid != 0) {
1075                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077                         if (prctl(PR_GET_SECUREBITS) != sb)
1078                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079                                         return -errno;
1080                 }
1081         }
1082
1083         /* Second step: actually set the uids */
1084         if (setresuid(uid, uid, uid) < 0)
1085                 return -errno;
1086
1087         /* At this point we should have all necessary capabilities but
1088            are otherwise a normal user. However, the caps might got
1089            corrupted due to the setresuid() so we need clean them up
1090            later. This is done outside of this call. */
1091
1092         return 0;
1093 }
1094
1095 #if HAVE_PAM
1096
1097 static int null_conv(
1098                 int num_msg,
1099                 const struct pam_message **msg,
1100                 struct pam_response **resp,
1101                 void *appdata_ptr) {
1102
1103         /* We don't support conversations */
1104
1105         return PAM_CONV_ERR;
1106 }
1107
1108 #endif
1109
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that closes the session
 * again once this process has gone away.
 *
 * name       - PAM service name, passed to pam_start()
 * user       - user name, passed to pam_start()
 * uid, gid   - credentials the helper process tries to drop to
 * tty        - TTY path to register as PAM_TTY; if NULL, the TTY name of stdin is used, if stdin is a TTY
 * env        - in: environment exported into the PAM handle; out: replaced by the PAM environment list
 * fds, n_fds - file descriptors that are closed in the helper process
 *
 * When built without PAM support this is a NOP returning 0. Otherwise it returns the result of
 * strv_free_and_replace() on success, -EPERM if a PAM call failed (PAM errors do not map to errno), or the
 * negative error of barrier_create()/safe_fork(). */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence check the
                                 * return value itself, so that a failure never reaches the assert()
                                 * below with an uninitialized 'sig'. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list obtained from PAM back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1325
/* Renames the calling process to "(<name>)", where <name> is the last path component of 'path', cut down
 * to its final 8 characters if it is longer than that. If the last component is empty the name "(...)" is
 * used instead. */
static void rename_process_from_path(const char *path) {
        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: "(" + at most 8 characters + ")", plus the trailing NUL. */
        char buf[11];
        const char *base, *tail;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* The end of the process name is usually more interesting, since the first bit might just be
         * "systemd-", hence keep the tail if we have to shorten. */
        n = strlen(base);
        tail = base;
        if (n > 8) {
                tail = base + n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1356
1357 static bool context_has_address_families(const ExecContext *c) {
1358         assert(c);
1359
1360         return c->address_families_whitelist ||
1361                 !set_isempty(c->address_families);
1362 }
1363
1364 static bool context_has_syscall_filters(const ExecContext *c) {
1365         assert(c);
1366
1367         return c->syscall_whitelist ||
1368                 !hashmap_isempty(c->syscall_filter);
1369 }
1370
1371 static bool context_has_no_new_privileges(const ExecContext *c) {
1372         assert(c);
1373
1374         if (c->no_new_privileges)
1375                 return true;
1376
1377         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1378                 return false;
1379
1380         /* We need NNP if we have any form of seccomp and are unprivileged */
1381         return context_has_address_families(c) ||
1382                 c->memory_deny_write_execute ||
1383                 c->restrict_realtime ||
1384                 exec_context_restrict_namespaces_set(c) ||
1385                 c->protect_kernel_tunables ||
1386                 c->protect_kernel_modules ||
1387                 c->private_devices ||
1388                 context_has_syscall_filters(c) ||
1389                 !set_isempty(c->syscall_archs) ||
1390                 c->lock_personality;
1391 }
1392
1393 #if HAVE_SECCOMP
1394
1395 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1396
1397         if (is_seccomp_available())
1398                 return false;
1399
1400         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1401         return true;
1402 }
1403
1404 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1405         uint32_t negative_action, default_action, action;
1406         int r;
1407
1408         assert(u);
1409         assert(c);
1410
1411         if (!context_has_syscall_filters(c))
1412                 return 0;
1413
1414         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1415                 return 0;
1416
1417         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1418
1419         if (c->syscall_whitelist) {
1420                 default_action = negative_action;
1421                 action = SCMP_ACT_ALLOW;
1422         } else {
1423                 default_action = SCMP_ACT_ALLOW;
1424                 action = negative_action;
1425         }
1426
1427         if (needs_ambient_hack) {
1428                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1429                 if (r < 0)
1430                         return r;
1431         }
1432
1433         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1434 }
1435
1436 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1437         assert(u);
1438         assert(c);
1439
1440         if (set_isempty(c->syscall_archs))
1441                 return 0;
1442
1443         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1444                 return 0;
1445
1446         return seccomp_restrict_archs(c->syscall_archs);
1447 }
1448
1449 static int apply_address_families(const Unit* u, const ExecContext *c) {
1450         assert(u);
1451         assert(c);
1452
1453         if (!context_has_address_families(c))
1454                 return 0;
1455
1456         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1457                 return 0;
1458
1459         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1460 }
1461
1462 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1463         assert(u);
1464         assert(c);
1465
1466         if (!c->memory_deny_write_execute)
1467                 return 0;
1468
1469         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1470                 return 0;
1471
1472         return seccomp_memory_deny_write_execute();
1473 }
1474
1475 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1476         assert(u);
1477         assert(c);
1478
1479         if (!c->restrict_realtime)
1480                 return 0;
1481
1482         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1483                 return 0;
1484
1485         return seccomp_restrict_realtime();
1486 }
1487
1488 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1489         assert(u);
1490         assert(c);
1491
1492         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1493          * let's protect even those systems where this is left on in the kernel. */
1494
1495         if (!c->protect_kernel_tunables)
1496                 return 0;
1497
1498         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1499                 return 0;
1500
1501         return seccomp_protect_sysctl();
1502 }
1503
1504 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1505         assert(u);
1506         assert(c);
1507
1508         /* Turn off module syscalls on ProtectKernelModules=yes */
1509
1510         if (!c->protect_kernel_modules)
1511                 return 0;
1512
1513         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1514                 return 0;
1515
1516         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1517 }
1518
1519 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1520         assert(u);
1521         assert(c);
1522
1523         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1524
1525         if (!c->private_devices)
1526                 return 0;
1527
1528         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1529                 return 0;
1530
1531         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1532 }
1533
1534 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1535         assert(u);
1536         assert(c);
1537
1538         if (!exec_context_restrict_namespaces_set(c))
1539                 return 0;
1540
1541         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1542                 return 0;
1543
1544         return seccomp_restrict_namespaces(c->restrict_namespaces);
1545 }
1546
1547 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1548         unsigned long personality;
1549         int r;
1550
1551         assert(u);
1552         assert(c);
1553
1554         if (!c->lock_personality)
1555                 return 0;
1556
1557         if (skip_seccomp_unavailable(u, "LockPersonality="))
1558                 return 0;
1559
1560         personality = c->personality;
1561
1562         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1563         if (personality == PERSONALITY_INVALID) {
1564
1565                 r = opinionated_personality(&personality);
1566                 if (r < 0)
1567                         return r;
1568         }
1569
1570         return seccomp_lock_personality(personality);
1571 }
1572
1573 #endif
1574
1575 static void do_idle_pipe_dance(int idle_pipe[4]) {
1576         assert(idle_pipe);
1577
1578         idle_pipe[1] = safe_close(idle_pipe[1]);
1579         idle_pipe[2] = safe_close(idle_pipe[2]);
1580
1581         if (idle_pipe[0] >= 0) {
1582                 int r;
1583
1584                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1585
1586                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1587                         ssize_t n;
1588
1589                         /* Signal systemd that we are bored and want to continue. */
1590                         n = write(idle_pipe[3], "x", 1);
1591                         if (n > 0)
1592                                 /* Wait for systemd to react to the signal above. */
1593                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1594                 }
1595
1596                 idle_pipe[0] = safe_close(idle_pipe[0]);
1597
1598         }
1599
1600         idle_pipe[3] = safe_close(idle_pipe[3]);
1601 }
1602
1603 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1604
1605 static int build_environment(
1606                 const Unit *u,
1607                 const ExecContext *c,
1608                 const ExecParameters *p,
1609                 size_t n_fds,
1610                 const char *home,
1611                 const char *username,
1612                 const char *shell,
1613                 dev_t journal_stream_dev,
1614                 ino_t journal_stream_ino,
1615                 char ***ret) {
1616
1617         _cleanup_strv_free_ char **our_env = NULL;
1618         ExecDirectoryType t;
1619         size_t n_env = 0;
1620         char *x;
1621
1622         assert(u);
1623         assert(c);
1624         assert(p);
1625         assert(ret);
1626
1627         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1628         if (!our_env)
1629                 return -ENOMEM;
1630
1631         if (n_fds > 0) {
1632                 _cleanup_free_ char *joined = NULL;
1633
1634                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1635                         return -ENOMEM;
1636                 our_env[n_env++] = x;
1637
1638                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1639                         return -ENOMEM;
1640                 our_env[n_env++] = x;
1641
1642                 joined = strv_join(p->fd_names, ":");
1643                 if (!joined)
1644                         return -ENOMEM;
1645
1646                 x = strjoin("LISTEN_FDNAMES=", joined);
1647                 if (!x)
1648                         return -ENOMEM;
1649                 our_env[n_env++] = x;
1650         }
1651
1652         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1653                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1654                         return -ENOMEM;
1655                 our_env[n_env++] = x;
1656
1657                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1658                         return -ENOMEM;
1659                 our_env[n_env++] = x;
1660         }
1661
1662         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1663          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1664          * check the database directly. */
1665         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1666                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1667                 if (!x)
1668                         return -ENOMEM;
1669                 our_env[n_env++] = x;
1670         }
1671
1672         if (home) {
1673                 x = strappend("HOME=", home);
1674                 if (!x)
1675                         return -ENOMEM;
1676                 our_env[n_env++] = x;
1677         }
1678
1679         if (username) {
1680                 x = strappend("LOGNAME=", username);
1681                 if (!x)
1682                         return -ENOMEM;
1683                 our_env[n_env++] = x;
1684
1685                 x = strappend("USER=", username);
1686                 if (!x)
1687                         return -ENOMEM;
1688                 our_env[n_env++] = x;
1689         }
1690
1691         if (shell) {
1692                 x = strappend("SHELL=", shell);
1693                 if (!x)
1694                         return -ENOMEM;
1695                 our_env[n_env++] = x;
1696         }
1697
1698         if (!sd_id128_is_null(u->invocation_id)) {
1699                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1700                         return -ENOMEM;
1701
1702                 our_env[n_env++] = x;
1703         }
1704
1705         if (exec_context_needs_term(c)) {
1706                 const char *tty_path, *term = NULL;
1707
1708                 tty_path = exec_context_tty_path(c);
1709
1710                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1711                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1712                  * passes to PID 1 ends up all the way in the console login shown. */
1713
1714                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1715                         term = getenv("TERM");
1716                 if (!term)
1717                         term = default_term_for_tty(tty_path);
1718
1719                 x = strappend("TERM=", term);
1720                 if (!x)
1721                         return -ENOMEM;
1722                 our_env[n_env++] = x;
1723         }
1724
1725         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1726                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1727                         return -ENOMEM;
1728
1729                 our_env[n_env++] = x;
1730         }
1731
1732         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1733                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1734                 const char *n;
1735
1736                 if (!p->prefix[t])
1737                         continue;
1738
1739                 if (strv_isempty(c->directories[t].paths))
1740                         continue;
1741
1742                 n = exec_directory_env_name_to_string(t);
1743                 if (!n)
1744                         continue;
1745
1746                 pre = strjoin(p->prefix[t], "/");
1747                 if (!pre)
1748                         return -ENOMEM;
1749
1750                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1751                 if (!joined)
1752                         return -ENOMEM;
1753
1754                 x = strjoin(n, "=", joined);
1755                 if (!x)
1756                         return -ENOMEM;
1757
1758                 our_env[n_env++] = x;
1759         }
1760
1761         our_env[n_env++] = NULL;
1762         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1763
1764         *ret = TAKE_PTR(our_env);
1765
1766         return 0;
1767 }
1768
1769 static int build_pass_environment(const ExecContext *c, char ***ret) {
1770         _cleanup_strv_free_ char **pass_env = NULL;
1771         size_t n_env = 0, n_bufsize = 0;
1772         char **i;
1773
1774         STRV_FOREACH(i, c->pass_environment) {
1775                 _cleanup_free_ char *x = NULL;
1776                 char *v;
1777
1778                 v = getenv(*i);
1779                 if (!v)
1780                         continue;
1781                 x = strjoin(*i, "=", v);
1782                 if (!x)
1783                         return -ENOMEM;
1784
1785                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1786                         return -ENOMEM;
1787
1788                 pass_env[n_env++] = TAKE_PTR(x);
1789                 pass_env[n_env] = NULL;
1790         }
1791
1792         *ret = TAKE_PTR(pass_env);
1793
1794         return 0;
1795 }
1796
1797 static bool exec_needs_mount_namespace(
1798                 const ExecContext *context,
1799                 const ExecParameters *params,
1800                 const ExecRuntime *runtime) {
1801
1802         assert(context);
1803         assert(params);
1804
1805         if (context->root_image)
1806                 return true;
1807
1808         if (!strv_isempty(context->read_write_paths) ||
1809             !strv_isempty(context->read_only_paths) ||
1810             !strv_isempty(context->inaccessible_paths))
1811                 return true;
1812
1813         if (context->n_bind_mounts > 0)
1814                 return true;
1815
1816         if (context->n_temporary_filesystems > 0)
1817                 return true;
1818
1819         if (context->mount_flags != 0)
1820                 return true;
1821
1822         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1823                 return true;
1824
1825         if (context->private_devices ||
1826             context->private_mounts ||
1827             context->protect_system != PROTECT_SYSTEM_NO ||
1828             context->protect_home != PROTECT_HOME_NO ||
1829             context->protect_kernel_tunables ||
1830             context->protect_kernel_modules ||
1831             context->protect_control_groups)
1832                 return true;
1833
1834         if (context->root_directory) {
1835                 ExecDirectoryType t;
1836
1837                 if (context->mount_apivfs)
1838                         return true;
1839
1840                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1841                         if (!params->prefix[t])
1842                                 continue;
1843
1844                         if (!strv_isempty(context->directories[t].paths))
1845                                 return true;
1846                 }
1847         }
1848
1849         if (context->dynamic_user &&
1850             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1851              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1852              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1853                 return true;
1854
1855         return false;
1856 }
1857
1858 static int setup_private_users(uid_t uid, gid_t gid) {
1859         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1860         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1861         _cleanup_close_ int unshare_ready_fd = -1;
1862         _cleanup_(sigkill_waitp) pid_t pid = 0;
1863         uint64_t c = 1;
1864         ssize_t n;
1865         int r;
1866
1867         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1868          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1869          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1870          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1871          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1872          * continues execution normally. */
1873
1874         if (uid != 0 && uid_is_valid(uid)) {
1875                 r = asprintf(&uid_map,
1876                              "0 0 1\n"                      /* Map root → root */
1877                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1878                              uid, uid);
1879                 if (r < 0)
1880                         return -ENOMEM;
1881         } else {
1882                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1883                 if (!uid_map)
1884                         return -ENOMEM;
1885         }
1886
1887         if (gid != 0 && gid_is_valid(gid)) {
1888                 r = asprintf(&gid_map,
1889                              "0 0 1\n"                      /* Map root → root */
1890                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1891                              gid, gid);
1892                 if (r < 0)
1893                         return -ENOMEM;
1894         } else {
1895                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1896                 if (!gid_map)
1897                         return -ENOMEM;
1898         }
1899
1900         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1901          * namespace. */
1902         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1903         if (unshare_ready_fd < 0)
1904                 return -errno;
1905
1906         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1907          * failed. */
1908         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1909                 return -errno;
1910
1911         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1912         if (r < 0)
1913                 return r;
1914         if (r == 0) {
1915                 _cleanup_close_ int fd = -1;
1916                 const char *a;
1917                 pid_t ppid;
1918
1919                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1920                  * here, after the parent opened its own user namespace. */
1921
1922                 ppid = getppid();
1923                 errno_pipe[0] = safe_close(errno_pipe[0]);
1924
1925                 /* Wait until the parent unshared the user namespace */
1926                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1927                         r = -errno;
1928                         goto child_fail;
1929                 }
1930
1931                 /* Disable the setgroups() system call in the child user namespace, for good. */
1932                 a = procfs_file_alloca(ppid, "setgroups");
1933                 fd = open(a, O_WRONLY|O_CLOEXEC);
1934                 if (fd < 0) {
1935                         if (errno != ENOENT) {
1936                                 r = -errno;
1937                                 goto child_fail;
1938                         }
1939
1940                         /* If the file is missing the kernel is too old, let's continue anyway. */
1941                 } else {
1942                         if (write(fd, "deny\n", 5) < 0) {
1943                                 r = -errno;
1944                                 goto child_fail;
1945                         }
1946
1947                         fd = safe_close(fd);
1948                 }
1949
1950                 /* First write the GID map */
1951                 a = procfs_file_alloca(ppid, "gid_map");
1952                 fd = open(a, O_WRONLY|O_CLOEXEC);
1953                 if (fd < 0) {
1954                         r = -errno;
1955                         goto child_fail;
1956                 }
1957                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1958                         r = -errno;
1959                         goto child_fail;
1960                 }
1961                 fd = safe_close(fd);
1962
1963                 /* The write the UID map */
1964                 a = procfs_file_alloca(ppid, "uid_map");
1965                 fd = open(a, O_WRONLY|O_CLOEXEC);
1966                 if (fd < 0) {
1967                         r = -errno;
1968                         goto child_fail;
1969                 }
1970                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1971                         r = -errno;
1972                         goto child_fail;
1973                 }
1974
1975                 _exit(EXIT_SUCCESS);
1976
1977         child_fail:
1978                 (void) write(errno_pipe[1], &r, sizeof(r));
1979                 _exit(EXIT_FAILURE);
1980         }
1981
1982         errno_pipe[1] = safe_close(errno_pipe[1]);
1983
1984         if (unshare(CLONE_NEWUSER) < 0)
1985                 return -errno;
1986
1987         /* Let the child know that the namespace is ready now */
1988         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1989                 return -errno;
1990
1991         /* Try to read an error code from the child */
1992         n = read(errno_pipe[0], &r, sizeof(r));
1993         if (n < 0)
1994                 return -errno;
1995         if (n == sizeof(r)) { /* an error code was sent to us */
1996                 if (r < 0)
1997                         return r;
1998                 return -EIO;
1999         }
2000         if (n != 0) /* on success we should have read 0 bytes */
2001                 return -EIO;
2002
2003         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2004         pid = 0;
2005         if (r < 0)
2006                 return r;
2007         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2008                 return -EIO;
2009
2010         return 0;
2011 }
2012
2013 static int setup_exec_directory(
2014                 const ExecContext *context,
2015                 const ExecParameters *params,
2016                 uid_t uid,
2017                 gid_t gid,
2018                 ExecDirectoryType type,
2019                 int *exit_status) {
2020
2021         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2022                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2023                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2024                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2025                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2026                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2027         };
2028         char **rt;
2029         int r;
2030
2031         assert(context);
2032         assert(params);
2033         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2034         assert(exit_status);
2035
2036         if (!params->prefix[type])
2037                 return 0;
2038
2039         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2040                 if (!uid_is_valid(uid))
2041                         uid = 0;
2042                 if (!gid_is_valid(gid))
2043                         gid = 0;
2044         }
2045
2046         STRV_FOREACH(rt, context->directories[type].paths) {
2047                 _cleanup_free_ char *p = NULL, *pp = NULL;
2048
2049                 p = strjoin(params->prefix[type], "/", *rt);
2050                 if (!p) {
2051                         r = -ENOMEM;
2052                         goto fail;
2053                 }
2054
2055                 r = mkdir_parents_label(p, 0755);
2056                 if (r < 0)
2057                         goto fail;
2058
2059                 if (context->dynamic_user &&
2060                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2061                         _cleanup_free_ char *private_root = NULL;
2062
2063                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2064                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2065                          * whose UID is later on reused. To lock this down we use the same trick used by container
2066                          * managers to prohibit host users to get access to files of the same UID in containers: we
2067                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2068                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2069                          * to make this directory permeable for the service itself.
2070                          *
2071                          * Specifically: for a service which wants a special directory "foo/" we first create a
2072                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2073                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2074                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2075                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2076                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2077                          * disabling the access boundary for the service and making sure it only gets access to the
2078                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2079                          *
2080                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2081                          * owned by the service itself.
2082                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2083                          * files or sockets with other services. */
2084
2085                         private_root = strjoin(params->prefix[type], "/private");
2086                         if (!private_root) {
2087                                 r = -ENOMEM;
2088                                 goto fail;
2089                         }
2090
2091                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2092                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2093                         if (r < 0)
2094                                 goto fail;
2095
2096                         pp = strjoin(private_root, "/", *rt);
2097                         if (!pp) {
2098                                 r = -ENOMEM;
2099                                 goto fail;
2100                         }
2101
2102                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2103                         r = mkdir_parents_label(pp, 0755);
2104                         if (r < 0)
2105                                 goto fail;
2106
2107                         if (is_dir(p, false) > 0 &&
2108                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2109
2110                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2111                                  * it over. Most likely the service has been upgraded from one that didn't use
2112                                  * DynamicUser=1, to one that does. */
2113
2114                                 if (rename(p, pp) < 0) {
2115                                         r = -errno;
2116                                         goto fail;
2117                                 }
2118                         } else {
2119                                 /* Otherwise, create the actual directory for the service */
2120
2121                                 r = mkdir_label(pp, context->directories[type].mode);
2122                                 if (r < 0 && r != -EEXIST)
2123                                         goto fail;
2124                         }
2125
2126                         /* And link it up from the original place */
2127                         r = symlink_idempotent(pp, p, true);
2128                         if (r < 0)
2129                                 goto fail;
2130
2131                         /* Lock down the access mode */
2132                         if (chmod(pp, context->directories[type].mode) < 0) {
2133                                 r = -errno;
2134                                 goto fail;
2135                         }
2136                 } else {
2137                         r = mkdir_label(p, context->directories[type].mode);
2138                         if (r < 0 && r != -EEXIST)
2139                                 goto fail;
2140                         if (r == -EEXIST && !context->dynamic_user)
2141                                 continue;
2142                 }
2143
2144                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2145                  * a service, and shall not be writable. */
2146                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2147                         continue;
2148
2149                 /* Then, change the ownership of the whole tree, if necessary */
2150                 r = path_chown_recursive(pp ?: p, uid, gid);
2151                 if (r < 0)
2152                         goto fail;
2153         }
2154
2155         return 0;
2156
2157 fail:
2158         *exit_status = exit_status_table[type];
2159         return r;
2160 }
2161
2162 #if ENABLE_SMACK
2163 static int setup_smack(
2164                 const ExecContext *context,
2165                 const ExecCommand *command) {
2166
2167         int r;
2168
2169         assert(context);
2170         assert(command);
2171
2172         if (context->smack_process_label) {
2173                 r = mac_smack_apply_pid(0, context->smack_process_label);
2174                 if (r < 0)
2175                         return r;
2176         }
2177 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2178         else {
2179                 _cleanup_free_ char *exec_label = NULL;
2180
2181                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2182                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2183                         return r;
2184
2185                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2186                 if (r < 0)
2187                         return r;
2188         }
2189 #endif
2190
2191         return 0;
2192 }
2193 #endif
2194
2195 static int compile_bind_mounts(
2196                 const ExecContext *context,
2197                 const ExecParameters *params,
2198                 BindMount **ret_bind_mounts,
2199                 size_t *ret_n_bind_mounts,
2200                 char ***ret_empty_directories) {
2201
2202         _cleanup_strv_free_ char **empty_directories = NULL;
2203         BindMount *bind_mounts;
2204         size_t n, h = 0, i;
2205         ExecDirectoryType t;
2206         int r;
2207
2208         assert(context);
2209         assert(params);
2210         assert(ret_bind_mounts);
2211         assert(ret_n_bind_mounts);
2212         assert(ret_empty_directories);
2213
2214         n = context->n_bind_mounts;
2215         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2216                 if (!params->prefix[t])
2217                         continue;
2218
2219                 n += strv_length(context->directories[t].paths);
2220         }
2221
2222         if (n <= 0) {
2223                 *ret_bind_mounts = NULL;
2224                 *ret_n_bind_mounts = 0;
2225                 *ret_empty_directories = NULL;
2226                 return 0;
2227         }
2228
2229         bind_mounts = new(BindMount, n);
2230         if (!bind_mounts)
2231                 return -ENOMEM;
2232
2233         for (i = 0; i < context->n_bind_mounts; i++) {
2234                 BindMount *item = context->bind_mounts + i;
2235                 char *s, *d;
2236
2237                 s = strdup(item->source);
2238                 if (!s) {
2239                         r = -ENOMEM;
2240                         goto finish;
2241                 }
2242
2243                 d = strdup(item->destination);
2244                 if (!d) {
2245                         free(s);
2246                         r = -ENOMEM;
2247                         goto finish;
2248                 }
2249
2250                 bind_mounts[h++] = (BindMount) {
2251                         .source = s,
2252                         .destination = d,
2253                         .read_only = item->read_only,
2254                         .recursive = item->recursive,
2255                         .ignore_enoent = item->ignore_enoent,
2256                 };
2257         }
2258
2259         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2260                 char **suffix;
2261
2262                 if (!params->prefix[t])
2263                         continue;
2264
2265                 if (strv_isempty(context->directories[t].paths))
2266                         continue;
2267
2268                 if (context->dynamic_user &&
2269                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2270                     !(context->root_directory || context->root_image)) {
2271                         char *private_root;
2272
2273                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2274                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2275                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2276
2277                         private_root = strjoin(params->prefix[t], "/private");
2278                         if (!private_root) {
2279                                 r = -ENOMEM;
2280                                 goto finish;
2281                         }
2282
2283                         r = strv_consume(&empty_directories, private_root);
2284                         if (r < 0)
2285                                 goto finish;
2286                 }
2287
2288                 STRV_FOREACH(suffix, context->directories[t].paths) {
2289                         char *s, *d;
2290
2291                         if (context->dynamic_user &&
2292                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2293                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2294                         else
2295                                 s = strjoin(params->prefix[t], "/", *suffix);
2296                         if (!s) {
2297                                 r = -ENOMEM;
2298                                 goto finish;
2299                         }
2300
2301                         if (context->dynamic_user &&
2302                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2303                             (context->root_directory || context->root_image))
2304                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2305                                  * directory is not created on the root directory. So, let's bind-mount the directory
2306                                  * on the 'non-private' place. */
2307                                 d = strjoin(params->prefix[t], "/", *suffix);
2308                         else
2309                                 d = strdup(s);
2310                         if (!d) {
2311                                 free(s);
2312                                 r = -ENOMEM;
2313                                 goto finish;
2314                         }
2315
2316                         bind_mounts[h++] = (BindMount) {
2317                                 .source = s,
2318                                 .destination = d,
2319                                 .read_only = false,
2320                                 .recursive = true,
2321                                 .ignore_enoent = false,
2322                         };
2323                 }
2324         }
2325
2326         assert(h == n);
2327
2328         *ret_bind_mounts = bind_mounts;
2329         *ret_n_bind_mounts = n;
2330         *ret_empty_directories = TAKE_PTR(empty_directories);
2331
2332         return (int) n;
2333
2334 finish:
2335         bind_mount_free_many(bind_mounts, h);
2336         return r;
2337 }
2338
2339 static int apply_mount_namespace(
2340                 const Unit *u,
2341                 const ExecCommand *command,
2342                 const ExecContext *context,
2343                 const ExecParameters *params,
2344                 const ExecRuntime *runtime) {
2345
2346         _cleanup_strv_free_ char **empty_directories = NULL;
2347         char *tmp = NULL, *var = NULL;
2348         const char *root_dir = NULL, *root_image = NULL;
2349         NamespaceInfo ns_info;
2350         bool needs_sandboxing;
2351         BindMount *bind_mounts = NULL;
2352         size_t n_bind_mounts = 0;
2353         int r;
2354
2355         assert(context);
2356
2357         /* The runtime struct only contains the parent of the private /tmp,
2358          * which is non-accessible to world users. Inside of it there's a /tmp
2359          * that is sticky, and that's the one we want to use here. */
2360
2361         if (context->private_tmp && runtime) {
2362                 if (runtime->tmp_dir)
2363                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2364                 if (runtime->var_tmp_dir)
2365                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2366         }
2367
2368         if (params->flags & EXEC_APPLY_CHROOT) {
2369                 root_image = context->root_image;
2370
2371                 if (!root_image)
2372                         root_dir = context->root_directory;
2373         }
2374
2375         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2376         if (r < 0)
2377                 return r;
2378
2379         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2380         if (needs_sandboxing)
2381                 ns_info = (NamespaceInfo) {
2382                         .ignore_protect_paths = false,
2383                         .private_dev = context->private_devices,
2384                         .protect_control_groups = context->protect_control_groups,
2385                         .protect_kernel_tunables = context->protect_kernel_tunables,
2386                         .protect_kernel_modules = context->protect_kernel_modules,
2387                         .mount_apivfs = context->mount_apivfs,
2388                         .private_mounts = context->private_mounts,
2389                 };
2390         else if (!context->dynamic_user && root_dir)
2391                 /*
2392                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2393                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2394                  * fail if we are enable to apply the sandbox inside the mount namespace.
2395                  */
2396                 ns_info = (NamespaceInfo) {
2397                         .ignore_protect_paths = true,
2398                 };
2399         else
2400                 ns_info = (NamespaceInfo) {};
2401
2402         r = setup_namespace(root_dir, root_image,
2403                             &ns_info, context->read_write_paths,
2404                             needs_sandboxing ? context->read_only_paths : NULL,
2405                             needs_sandboxing ? context->inaccessible_paths : NULL,
2406                             empty_directories,
2407                             bind_mounts,
2408                             n_bind_mounts,
2409                             context->temporary_filesystems,
2410                             context->n_temporary_filesystems,
2411                             tmp,
2412                             var,
2413                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2414                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2415                             context->mount_flags,
2416                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2417
2418         bind_mount_free_many(bind_mounts, n_bind_mounts);
2419
2420         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2421          * that with a special, recognizable error ENOANO. In this case, silently proceeed, but only if exclusively
2422          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2423          * completely different execution environment. */
2424         if (r == -ENOANO) {
2425                 if (n_bind_mounts == 0 &&
2426                     context->n_temporary_filesystems == 0 &&
2427                     !root_dir && !root_image &&
2428                     !context->dynamic_user) {
2429                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2430                         return 0;
2431                 }
2432
2433                 return -EOPNOTSUPP;
2434         }
2435
2436         return r;
2437 }
2438
2439 static int apply_working_directory(
2440                 const ExecContext *context,
2441                 const ExecParameters *params,
2442                 const char *home,
2443                 const bool needs_mount_ns,
2444                 int *exit_status) {
2445
2446         const char *d, *wd;
2447
2448         assert(context);
2449         assert(exit_status);
2450
2451         if (context->working_directory_home) {
2452
2453                 if (!home) {
2454                         *exit_status = EXIT_CHDIR;
2455                         return -ENXIO;
2456                 }
2457
2458                 wd = home;
2459
2460         } else if (context->working_directory)
2461                 wd = context->working_directory;
2462         else
2463                 wd = "/";
2464
2465         if (params->flags & EXEC_APPLY_CHROOT) {
2466                 if (!needs_mount_ns && context->root_directory)
2467                         if (chroot(context->root_directory) < 0) {
2468                                 *exit_status = EXIT_CHROOT;
2469                                 return -errno;
2470                         }
2471
2472                 d = wd;
2473         } else
2474                 d = prefix_roota(context->root_directory, wd);
2475
2476         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2477                 *exit_status = EXIT_CHDIR;
2478                 return -errno;
2479         }
2480
2481         return 0;
2482 }
2483
2484 static int setup_keyring(
2485                 const Unit *u,
2486                 const ExecContext *context,
2487                 const ExecParameters *p,
2488                 uid_t uid, gid_t gid) {
2489
2490         key_serial_t keyring;
2491         int r = 0;
2492         uid_t saved_uid;
2493         gid_t saved_gid;
2494
2495         assert(u);
2496         assert(context);
2497         assert(p);
2498
2499         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2500          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2501          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2502          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2503          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2504          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2505
2506         if (!(p->flags & EXEC_NEW_KEYRING))
2507                 return 0;
2508
2509         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2510                 return 0;
2511
2512         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2513          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2514          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2515          * & group is just as nasty as acquiring a reference to the user keyring. */
2516
2517         saved_uid = getuid();
2518         saved_gid = getgid();
2519
2520         if (gid_is_valid(gid) && gid != saved_gid) {
2521                 if (setregid(gid, -1) < 0)
2522                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2523         }
2524
2525         if (uid_is_valid(uid) && uid != saved_uid) {
2526                 if (setreuid(uid, -1) < 0) {
2527                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2528                         goto out;
2529                 }
2530         }
2531
2532         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2533         if (keyring == -1) {
2534                 if (errno == ENOSYS)
2535                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2536                 else if (IN_SET(errno, EACCES, EPERM))
2537                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2538                 else if (errno == EDQUOT)
2539                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2540                 else
2541                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2542
2543                 goto out;
2544         }
2545
2546         /* When requested link the user keyring into the session keyring. */
2547         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2548
2549                 if (keyctl(KEYCTL_LINK,
2550                            KEY_SPEC_USER_KEYRING,
2551                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2552                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2553                         goto out;
2554                 }
2555         }
2556
2557         /* Restore uid/gid back */
2558         if (uid_is_valid(uid) && uid != saved_uid) {
2559                 if (setreuid(saved_uid, -1) < 0) {
2560                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2561                         goto out;
2562                 }
2563         }
2564
2565         if (gid_is_valid(gid) && gid != saved_gid) {
2566                 if (setregid(saved_gid, -1) < 0)
2567                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2568         }
2569
2570         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2571         if (!sd_id128_is_null(u->invocation_id)) {
2572                 key_serial_t key;
2573
2574                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2575                 if (key == -1)
2576                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2577                 else {
2578                         if (keyctl(KEYCTL_SETPERM, key,
2579                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2580                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2581                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2582                 }
2583         }
2584
2585 out:
2586         /* Revert back uid & gid for the the last time, and exit */
2587         /* no extra logging, as only the first already reported error matters */
2588         if (getuid() != saved_uid)
2589                 (void) setreuid(saved_uid, -1);
2590
2591         if (getgid() != saved_gid)
2592                 (void) setregid(saved_gid, -1);
2593
2594         return r;
2595 }
2596
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t k;

        assert(array);
        assert(n);

        /* Appends each non-negative fd of 'pair' to 'array' and bumps the element count '*n' accordingly.
         * A NULL 'pair' is accepted and adds nothing. The caller must make sure 'array' has room for up to
         * two more entries; no bounds checking happens here. */

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2609
2610 static int close_remaining_fds(
2611                 const ExecParameters *params,
2612                 const ExecRuntime *runtime,
2613                 const DynamicCreds *dcreds,
2614                 int user_lookup_fd,
2615                 int socket_fd,
2616                 int exec_fd,
2617                 int *fds, size_t n_fds) {
2618
2619         size_t n_dont_close = 0;
2620         int dont_close[n_fds + 12];
2621
2622         assert(params);
2623
2624         if (params->stdin_fd >= 0)
2625                 dont_close[n_dont_close++] = params->stdin_fd;
2626         if (params->stdout_fd >= 0)
2627                 dont_close[n_dont_close++] = params->stdout_fd;
2628         if (params->stderr_fd >= 0)
2629                 dont_close[n_dont_close++] = params->stderr_fd;
2630
2631         if (socket_fd >= 0)
2632                 dont_close[n_dont_close++] = socket_fd;
2633         if (exec_fd >= 0)
2634                 dont_close[n_dont_close++] = exec_fd;
2635         if (n_fds > 0) {
2636                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2637                 n_dont_close += n_fds;
2638         }
2639
2640         if (runtime)
2641                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2642
2643         if (dcreds) {
2644                 if (dcreds->user)
2645                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2646                 if (dcreds->group)
2647                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2648         }
2649
2650         if (user_lookup_fd >= 0)
2651                 dont_close[n_dont_close++] = user_lookup_fd;
2652
2653         return close_all_fds(dont_close, n_dont_close);
2654 }
2655
2656 static int send_user_lookup(
2657                 Unit *unit,
2658                 int user_lookup_fd,
2659                 uid_t uid,
2660                 gid_t gid) {
2661
2662         assert(unit);
2663
2664         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2665          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2666          * specified. */
2667
2668         if (user_lookup_fd < 0)
2669                 return 0;
2670
2671         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2672                 return 0;
2673
2674         if (writev(user_lookup_fd,
2675                (struct iovec[]) {
2676                            IOVEC_INIT(&uid, sizeof(uid)),
2677                            IOVEC_INIT(&gid, sizeof(gid)),
2678                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2679                 return -errno;
2680
2681         return 0;
2682 }
2683
2684 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2685         int r;
2686
2687         assert(c);
2688         assert(home);
2689         assert(buf);
2690
2691         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2692
2693         if (*home)
2694                 return 0;
2695
2696         if (!c->working_directory_home)
2697                 return 0;
2698
2699         if (uid == 0) {
2700                 /* Hardcode /root as home directory for UID 0 */
2701                 *home = "/root";
2702                 return 1;
2703         }
2704
2705         r = get_home_dir(buf);
2706         if (r < 0)
2707                 return r;
2708
2709         *home = *buf;
2710         return 1;
2711 }
2712
2713 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2714         _cleanup_strv_free_ char ** list = NULL;
2715         ExecDirectoryType t;
2716         int r;
2717
2718         assert(c);
2719         assert(p);
2720         assert(ret);
2721
2722         assert(c->dynamic_user);
2723
2724         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2725          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2726          * directories. */
2727
2728         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2729                 char **i;
2730
2731                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2732                         continue;
2733
2734                 if (!p->prefix[t])
2735                         continue;
2736
2737                 STRV_FOREACH(i, c->directories[t].paths) {
2738                         char *e;
2739
2740                         if (t == EXEC_DIRECTORY_RUNTIME)
2741                                 e = strjoin(p->prefix[t], "/", *i);
2742                         else
2743                                 e = strjoin(p->prefix[t], "/private/", *i);
2744                         if (!e)
2745                                 return -ENOMEM;
2746
2747                         r = strv_consume(&list, e);
2748                         if (r < 0)
2749                                 return r;
2750                 }
2751         }
2752
2753         *ret = TAKE_PTR(list);
2754
2755         return 0;
2756 }
2757
2758 static char *exec_command_line(char **argv);
2759
2760 static int exec_child(
2761                 Unit *unit,
2762                 const ExecCommand *command,
2763                 const ExecContext *context,
2764                 const ExecParameters *params,
2765                 ExecRuntime *runtime,
2766                 DynamicCreds *dcreds,
2767                 int socket_fd,
2768                 int named_iofds[3],
2769                 int *fds,
2770                 size_t n_socket_fds,
2771                 size_t n_storage_fds,
2772                 char **files_env,
2773                 int user_lookup_fd,
2774                 int *exit_status) {
2775
2776         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2777         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2778         _cleanup_free_ gid_t *supplementary_gids = NULL;
2779         const char *username = NULL, *groupname = NULL;
2780         _cleanup_free_ char *home_buffer = NULL;
2781         const char *home = NULL, *shell = NULL;
2782         dev_t journal_stream_dev = 0;
2783         ino_t journal_stream_ino = 0;
2784         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2785                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2786                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2787                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2788 #if HAVE_SELINUX
2789         _cleanup_free_ char *mac_selinux_context_net = NULL;
2790         bool use_selinux = false;
2791 #endif
2792 #if ENABLE_SMACK
2793         bool use_smack = false;
2794 #endif
2795 #if HAVE_APPARMOR
2796         bool use_apparmor = false;
2797 #endif
2798         uid_t uid = UID_INVALID;
2799         gid_t gid = GID_INVALID;
2800         size_t n_fds;
2801         ExecDirectoryType dt;
2802         int secure_bits;
2803
2804         assert(unit);
2805         assert(command);
2806         assert(context);
2807         assert(params);
2808         assert(exit_status);
2809
2810         rename_process_from_path(command->path);
2811
2812         /* We reset exactly these signals, since they are the
2813          * only ones we set to SIG_IGN in the main daemon. All
2814          * others we leave untouched because we set them to
2815          * SIG_DFL or a valid handler initially, both of which
2816          * will be demoted to SIG_DFL. */
2817         (void) default_signals(SIGNALS_CRASH_HANDLER,
2818                                SIGNALS_IGNORE, -1);
2819
2820         if (context->ignore_sigpipe)
2821                 (void) ignore_signals(SIGPIPE, -1);
2822
2823         r = reset_signal_mask();
2824         if (r < 0) {
2825                 *exit_status = EXIT_SIGNAL_MASK;
2826                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2827         }
2828
2829         if (params->idle_pipe)
2830                 do_idle_pipe_dance(params->idle_pipe);
2831
2832         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2833          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2834          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2835          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2836
2837         log_forget_fds();
2838         log_set_open_when_needed(true);
2839
2840         /* In case anything used libc syslog(), close this here, too */
2841         closelog();
2842
2843         n_fds = n_socket_fds + n_storage_fds;
2844         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2845         if (r < 0) {
2846                 *exit_status = EXIT_FDS;
2847                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2848         }
2849
2850         if (!context->same_pgrp)
2851                 if (setsid() < 0) {
2852                         *exit_status = EXIT_SETSID;
2853                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2854                 }
2855
2856         exec_context_tty_reset(context, params);
2857
2858         if (unit_shall_confirm_spawn(unit)) {
2859                 const char *vc = params->confirm_spawn;
2860                 _cleanup_free_ char *cmdline = NULL;
2861
2862                 cmdline = exec_command_line(command->argv);
2863                 if (!cmdline) {
2864                         *exit_status = EXIT_MEMORY;
2865                         return log_oom();
2866                 }
2867
2868                 r = ask_for_confirmation(vc, unit, cmdline);
2869                 if (r != CONFIRM_EXECUTE) {
2870                         if (r == CONFIRM_PRETEND_SUCCESS) {
2871                                 *exit_status = EXIT_SUCCESS;
2872                                 return 0;
2873                         }
2874                         *exit_status = EXIT_CONFIRM;
2875                         log_unit_error(unit, "Execution cancelled by the user");
2876                         return -ECANCELED;
2877                 }
2878         }
2879
2880         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2881          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2882          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2883          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2884          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2885         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2886             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2887                 *exit_status = EXIT_MEMORY;
2888                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2889         }
2890
2891         if (context->dynamic_user && dcreds) {
2892                 _cleanup_strv_free_ char **suggested_paths = NULL;
2893
2894                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2895                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2896                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2897                         *exit_status = EXIT_USER;
2898                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2899                 }
2900
2901                 r = compile_suggested_paths(context, params, &suggested_paths);
2902                 if (r < 0) {
2903                         *exit_status = EXIT_MEMORY;
2904                         return log_oom();
2905                 }
2906
2907                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2908                 if (r < 0) {
2909                         *exit_status = EXIT_USER;
2910                         if (r == -EILSEQ) {
2911                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2912                                 return -EOPNOTSUPP;
2913                         }
2914                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2915                 }
2916
2917                 if (!uid_is_valid(uid)) {
2918                         *exit_status = EXIT_USER;
2919                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2920                         return -ESRCH;
2921                 }
2922
2923                 if (!gid_is_valid(gid)) {
2924                         *exit_status = EXIT_USER;
2925                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2926                         return -ESRCH;
2927                 }
2928
2929                 if (dcreds->user)
2930                         username = dcreds->user->name;
2931
2932         } else {
2933                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2934                 if (r < 0) {
2935                         *exit_status = EXIT_USER;
2936                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2937                 }
2938
2939                 r = get_fixed_group(context, &groupname, &gid);
2940                 if (r < 0) {
2941                         *exit_status = EXIT_GROUP;
2942                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2943                 }
2944         }
2945
2946         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2947         r = get_supplementary_groups(context, username, groupname, gid,
2948                                      &supplementary_gids, &ngids);
2949         if (r < 0) {
2950                 *exit_status = EXIT_GROUP;
2951                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2952         }
2953
2954         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2955         if (r < 0) {
2956                 *exit_status = EXIT_USER;
2957                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2958         }
2959
2960         user_lookup_fd = safe_close(user_lookup_fd);
2961
2962         r = acquire_home(context, uid, &home, &home_buffer);
2963         if (r < 0) {
2964                 *exit_status = EXIT_CHDIR;
2965                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2966         }
2967
2968         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2969          * must be sure to drop O_NONBLOCK */
2970         if (socket_fd >= 0)
2971                 (void) fd_nonblock(socket_fd, false);
2972
2973         r = setup_input(context, params, socket_fd, named_iofds);
2974         if (r < 0) {
2975                 *exit_status = EXIT_STDIN;
2976                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2977         }
2978
2979         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2980         if (r < 0) {
2981                 *exit_status = EXIT_STDOUT;
2982                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2983         }
2984
2985         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2986         if (r < 0) {
2987                 *exit_status = EXIT_STDERR;
2988                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2989         }
2990
2991         if (params->cgroup_path) {
2992                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2993                 if (r < 0) {
2994                         *exit_status = EXIT_CGROUP;
2995                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2996                 }
2997         }
2998
2999         if (context->oom_score_adjust_set) {
3000                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3001                  * prohibit write access to this file, and we shouldn't trip up over that. */
3002                 r = set_oom_score_adjust(context->oom_score_adjust);
3003                 if (IN_SET(r, -EPERM, -EACCES))
3004                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3005                 else if (r < 0) {
3006                         *exit_status = EXIT_OOM_ADJUST;
3007                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3008                 }
3009         }
3010
3011         if (context->nice_set)
3012                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3013                         *exit_status = EXIT_NICE;
3014                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3015                 }
3016
3017         if (context->cpu_sched_set) {
3018                 struct sched_param param = {
3019                         .sched_priority = context->cpu_sched_priority,
3020                 };
3021
3022                 r = sched_setscheduler(0,
3023                                        context->cpu_sched_policy |
3024                                        (context->cpu_sched_reset_on_fork ?
3025                                         SCHED_RESET_ON_FORK : 0),
3026                                        &param);
3027                 if (r < 0) {
3028                         *exit_status = EXIT_SETSCHEDULER;
3029                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3030                 }
3031         }
3032
3033         if (context->cpuset)
3034                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
3035                         *exit_status = EXIT_CPUAFFINITY;
3036                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3037                 }
3038
3039         if (context->ioprio_set)
3040                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3041                         *exit_status = EXIT_IOPRIO;
3042                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3043                 }
3044
3045         if (context->timer_slack_nsec != NSEC_INFINITY)
3046                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3047                         *exit_status = EXIT_TIMERSLACK;
3048                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3049                 }
3050
3051         if (context->personality != PERSONALITY_INVALID) {
3052                 r = safe_personality(context->personality);
3053                 if (r < 0) {
3054                         *exit_status = EXIT_PERSONALITY;
3055                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3056                 }
3057         }
3058
3059         if (context->utmp_id)
3060                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3061                                       context->tty_path,
3062                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3063                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3064                                       USER_PROCESS,
3065                                       username);
3066
3067         if (context->user) {
3068                 r = chown_terminal(STDIN_FILENO, uid);
3069                 if (r < 0) {
3070                         *exit_status = EXIT_STDIN;
3071                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3072                 }
3073         }
3074
3075         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3076          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3077          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3078          * touch a single hierarchy too. */
3079         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3080                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3081                 if (r < 0) {
3082                         *exit_status = EXIT_CGROUP;
3083                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3084                 }
3085         }
3086
3087         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3088                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3089                 if (r < 0)
3090                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3091         }
3092
3093         r = build_environment(
3094                         unit,
3095                         context,
3096                         params,
3097                         n_fds,
3098                         home,
3099                         username,
3100                         shell,
3101                         journal_stream_dev,
3102                         journal_stream_ino,
3103                         &our_env);
3104         if (r < 0) {
3105                 *exit_status = EXIT_MEMORY;
3106                 return log_oom();
3107         }
3108
3109         r = build_pass_environment(context, &pass_env);
3110         if (r < 0) {
3111                 *exit_status = EXIT_MEMORY;
3112                 return log_oom();
3113         }
3114
3115         accum_env = strv_env_merge(5,
3116                                    params->environment,
3117                                    our_env,
3118                                    pass_env,
3119                                    context->environment,
3120                                    files_env,
3121                                    NULL);
3122         if (!accum_env) {
3123                 *exit_status = EXIT_MEMORY;
3124                 return log_oom();
3125         }
3126         accum_env = strv_env_clean(accum_env);
3127
3128         (void) umask(context->umask);
3129
3130         r = setup_keyring(unit, context, params, uid, gid);
3131         if (r < 0) {
3132                 *exit_status = EXIT_KEYRING;
3133                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3134         }
3135
3136         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3137         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3138
3139         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3140         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3141
3142         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3143         if (needs_ambient_hack)
3144                 needs_setuid = false;
3145         else
3146                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3147
3148         if (needs_sandboxing) {
3149                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3150                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3151                  * impacting our own code paths. */
3152
3153 #if HAVE_SELINUX
3154                 use_selinux = mac_selinux_use();
3155 #endif
3156 #if ENABLE_SMACK
3157                 use_smack = mac_smack_use();
3158 #endif
3159 #if HAVE_APPARMOR
3160                 use_apparmor = mac_apparmor_use();
3161 #endif
3162         }
3163
3164         if (needs_setuid) {
3165                 if (context->pam_name && username) {
3166                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3167                         if (r < 0) {
3168                                 *exit_status = EXIT_PAM;
3169                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3170                         }
3171                 }
3172         }
3173
3174         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3175                 if (ns_type_supported(NAMESPACE_NET)) {
3176                         r = setup_netns(runtime->netns_storage_socket);
3177                         if (r < 0) {
3178                                 *exit_status = EXIT_NETWORK;
3179                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3180                         }
3181                 } else
3182                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3183         }
3184
3185         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3186         if (needs_mount_namespace) {
3187                 r = apply_mount_namespace(unit, command, context, params, runtime);
3188                 if (r < 0) {
3189                         *exit_status = EXIT_NAMESPACE;
3190                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3191                 }
3192         }
3193
3194         /* Apply just after mount namespace setup */
3195         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3196         if (r < 0)
3197                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3198
3199         /* Drop groups as early as possible */
3200         if (needs_setuid) {
3201                 r = enforce_groups(gid, supplementary_gids, ngids);
3202                 if (r < 0) {
3203                         *exit_status = EXIT_GROUP;
3204                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3205                 }
3206         }
3207
3208         if (needs_sandboxing) {
3209 #if HAVE_SELINUX
3210                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3211                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3212                         if (r < 0) {
3213                                 *exit_status = EXIT_SELINUX_CONTEXT;
3214                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3215                         }
3216                 }
3217 #endif
3218
3219                 if (context->private_users) {
3220                         r = setup_private_users(uid, gid);
3221                         if (r < 0) {
3222                                 *exit_status = EXIT_USER;
3223                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3224                         }
3225                 }
3226         }
3227
3228         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3229          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3230          * however if we have it as we want to keep it open until the final execve(). */
3231
3232         if (params->exec_fd >= 0) {
3233                 exec_fd = params->exec_fd;
3234
3235                 if (exec_fd < 3 + (int) n_fds) {
3236                         int moved_fd;
3237
3238                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3239                          * process we are about to execute. */
3240
3241                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3242                         if (moved_fd < 0) {
3243                                 *exit_status = EXIT_FDS;
3244                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3245                         }
3246
3247                         safe_close(exec_fd);
3248                         exec_fd = moved_fd;
3249                 } else {
3250                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3251                         r = fd_cloexec(exec_fd, true);
3252                         if (r < 0) {
3253                                 *exit_status = EXIT_FDS;
3254                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3255                         }
3256                 }
3257
3258                 fds_with_exec_fd = newa(int, n_fds + 1);
3259                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3260                 fds_with_exec_fd[n_fds] = exec_fd;
3261                 n_fds_with_exec_fd = n_fds + 1;
3262         } else {
3263                 fds_with_exec_fd = fds;
3264                 n_fds_with_exec_fd = n_fds;
3265         }
3266
3267         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3268         if (r >= 0)
3269                 r = shift_fds(fds, n_fds);
3270         if (r >= 0)
3271                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3272         if (r < 0) {
3273                 *exit_status = EXIT_FDS;
3274                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3275         }
3276
3277         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3278          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3279          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3280          * came this far. */
3281
3282         secure_bits = context->secure_bits;
3283
3284         if (needs_sandboxing) {
3285                 uint64_t bset;
3286                 int which_failed;
3287
3288                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3289                 if (r < 0) {
3290                         *exit_status = EXIT_LIMITS;
3291                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3292                 }
3293
3294                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3295                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3296                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3297                                 *exit_status = EXIT_LIMITS;
3298                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3299                         }
3300                 }
3301
3302 #if ENABLE_SMACK
3303                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3304                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3305                 if (use_smack) {
3306                         r = setup_smack(context, command);
3307                         if (r < 0) {
3308                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3309                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3310                         }
3311                 }
3312 #endif
3313
3314                 bset = context->capability_bounding_set;
3315                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3316                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3317                  * instead of us doing that */
3318                 if (needs_ambient_hack)
3319                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3320                                 (UINT64_C(1) << CAP_SETUID) |
3321                                 (UINT64_C(1) << CAP_SETGID);
3322
3323                 if (!cap_test_all(bset)) {
3324                         r = capability_bounding_set_drop(bset, false);
3325                         if (r < 0) {
3326                                 *exit_status = EXIT_CAPABILITIES;
3327                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3328                         }
3329                 }
3330
3331                 /* This is done before enforce_user, but ambient set
3332                  * does not survive over setresuid() if keep_caps is not set. */
3333                 if (!needs_ambient_hack &&
3334                     context->capability_ambient_set != 0) {
3335                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3336                         if (r < 0) {
3337                                 *exit_status = EXIT_CAPABILITIES;
3338                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3339                         }
3340                 }
3341         }
3342
3343         if (needs_setuid) {
3344                 if (context->user) {
3345                         r = enforce_user(context, uid);
3346                         if (r < 0) {
3347                                 *exit_status = EXIT_USER;
3348                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3349                         }
3350
3351                         if (!needs_ambient_hack &&
3352                             context->capability_ambient_set != 0) {
3353
3354                                 /* Fix the ambient capabilities after user change. */
3355                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3356                                 if (r < 0) {
3357                                         *exit_status = EXIT_CAPABILITIES;
3358                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3359                                 }
3360
3361                                 /* If we were asked to change user and ambient capabilities
3362                                  * were requested, we had to add keep-caps to the securebits
3363                                  * so that we would maintain the inherited capability set
3364                                  * through the setresuid(). Make sure that the bit is added
3365                                  * also to the context secure_bits so that we don't try to
3366                                  * drop the bit away next. */
3367
3368                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3369                         }
3370                 }
3371         }
3372
3373         if (needs_sandboxing) {
3374                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3375                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3376                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3377                  * are restricted. */
3378
3379 #if HAVE_SELINUX
3380                 if (use_selinux) {
3381                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3382
3383                         if (exec_context) {
3384                                 r = setexeccon(exec_context);
3385                                 if (r < 0) {
3386                                         *exit_status = EXIT_SELINUX_CONTEXT;
3387                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3388                                 }
3389                         }
3390                 }
3391 #endif
3392
3393 #if HAVE_APPARMOR
3394                 if (use_apparmor && context->apparmor_profile) {
3395                         r = aa_change_onexec(context->apparmor_profile);
3396                         if (r < 0 && !context->apparmor_profile_ignore) {
3397                                 *exit_status = EXIT_APPARMOR_PROFILE;
3398                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3399                         }
3400                 }
3401 #endif
3402
3403                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3404                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3405                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3406                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3407                                 *exit_status = EXIT_SECUREBITS;
3408                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3409                         }
3410
3411                 if (context_has_no_new_privileges(context))
3412                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3413                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3414                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3415                         }
3416
3417 #if HAVE_SECCOMP
3418                 r = apply_address_families(unit, context);
3419                 if (r < 0) {
3420                         *exit_status = EXIT_ADDRESS_FAMILIES;
3421                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3422                 }
3423
3424                 r = apply_memory_deny_write_execute(unit, context);
3425                 if (r < 0) {
3426                         *exit_status = EXIT_SECCOMP;
3427                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3428                 }
3429
3430                 r = apply_restrict_realtime(unit, context);
3431                 if (r < 0) {
3432                         *exit_status = EXIT_SECCOMP;
3433                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3434                 }
3435
3436                 r = apply_restrict_namespaces(unit, context);
3437                 if (r < 0) {
3438                         *exit_status = EXIT_SECCOMP;
3439                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3440                 }
3441
3442                 r = apply_protect_sysctl(unit, context);
3443                 if (r < 0) {
3444                         *exit_status = EXIT_SECCOMP;
3445                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3446                 }
3447
3448                 r = apply_protect_kernel_modules(unit, context);
3449                 if (r < 0) {
3450                         *exit_status = EXIT_SECCOMP;
3451                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3452                 }
3453
3454                 r = apply_private_devices(unit, context);
3455                 if (r < 0) {
3456                         *exit_status = EXIT_SECCOMP;
3457                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3458                 }
3459
3460                 r = apply_syscall_archs(unit, context);
3461                 if (r < 0) {
3462                         *exit_status = EXIT_SECCOMP;
3463                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3464                 }
3465
3466                 r = apply_lock_personality(unit, context);
3467                 if (r < 0) {
3468                         *exit_status = EXIT_SECCOMP;
3469                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3470                 }
3471
3472                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3473                  * by the filter as little as possible. */
3474                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3475                 if (r < 0) {
3476                         *exit_status = EXIT_SECCOMP;
3477                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3478                 }
3479 #endif
3480         }
3481
3482         if (!strv_isempty(context->unset_environment)) {
3483                 char **ee = NULL;
3484
3485                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3486                 if (!ee) {
3487                         *exit_status = EXIT_MEMORY;
3488                         return log_oom();
3489                 }
3490
3491                 strv_free_and_replace(accum_env, ee);
3492         }
3493
3494         final_argv = replace_env_argv(command->argv, accum_env);
3495         if (!final_argv) {
3496                 *exit_status = EXIT_MEMORY;
3497                 return log_oom();
3498         }
3499
3500         if (DEBUG_LOGGING) {
3501                 _cleanup_free_ char *line;
3502
3503                 line = exec_command_line(final_argv);
3504                 if (line)
3505                         log_struct(LOG_DEBUG,
3506                                    "EXECUTABLE=%s", command->path,
3507                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3508                                    LOG_UNIT_ID(unit),
3509                                    LOG_UNIT_INVOCATION_ID(unit));
3510         }
3511
3512         if (exec_fd >= 0) {
3513                 uint8_t hot = 1;
3514
3515                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3516                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3517
3518                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3519                         *exit_status = EXIT_EXEC;
3520                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3521                 }
3522         }
3523
3524         execve(command->path, final_argv, accum_env);
3525         r = -errno;
3526
3527         if (exec_fd >= 0) {
3528                 uint8_t hot = 0;
3529
3530                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3531                  * that POLLHUP on it no longer means execve() succeeded. */
3532
3533                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3534                         *exit_status = EXIT_EXEC;
3535                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3536                 }
3537         }
3538
3539         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3540                 log_struct_errno(LOG_INFO, r,
3541                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3542                                  LOG_UNIT_ID(unit),
3543                                  LOG_UNIT_INVOCATION_ID(unit),
3544                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3545                                                   command->path),
3546                                  "EXECUTABLE=%s", command->path);
3547                 return 0;
3548         }
3549
3550         *exit_status = EXIT_EXEC;
3551         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3552 }
3553
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() right below
 * already calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3556
3557 int exec_spawn(Unit *unit,
3558                ExecCommand *command,
3559                const ExecContext *context,
3560                const ExecParameters *params,
3561                ExecRuntime *runtime,
3562                DynamicCreds *dcreds,
3563                pid_t *ret) {
3564
3565         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3566         _cleanup_strv_free_ char **files_env = NULL;
3567         size_t n_storage_fds = 0, n_socket_fds = 0;
3568         _cleanup_free_ char *line = NULL;
3569         pid_t pid;
3570
3571         assert(unit);
3572         assert(command);
3573         assert(context);
3574         assert(ret);
3575         assert(params);
3576         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3577
3578         if (context->std_input == EXEC_INPUT_SOCKET ||
3579             context->std_output == EXEC_OUTPUT_SOCKET ||
3580             context->std_error == EXEC_OUTPUT_SOCKET) {
3581
3582                 if (params->n_socket_fds > 1) {
3583                         log_unit_error(unit, "Got more than one socket.");
3584                         return -EINVAL;
3585                 }
3586
3587                 if (params->n_socket_fds == 0) {
3588                         log_unit_error(unit, "Got no socket.");
3589                         return -EINVAL;
3590                 }
3591
3592                 socket_fd = params->fds[0];
3593         } else {
3594                 socket_fd = -1;
3595                 fds = params->fds;
3596                 n_socket_fds = params->n_socket_fds;
3597                 n_storage_fds = params->n_storage_fds;
3598         }
3599
3600         r = exec_context_named_iofds(context, params, named_iofds);
3601         if (r < 0)
3602                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3603
3604         r = exec_context_load_environment(unit, context, &files_env);
3605         if (r < 0)
3606                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3607
3608         line = exec_command_line(command->argv);
3609         if (!line)
3610                 return log_oom();
3611
3612         log_struct(LOG_DEBUG,
3613                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3614                    "EXECUTABLE=%s", command->path,
3615                    LOG_UNIT_ID(unit),
3616                    LOG_UNIT_INVOCATION_ID(unit));
3617
3618         pid = fork();
3619         if (pid < 0)
3620                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3621
3622         if (pid == 0) {
3623                 int exit_status = EXIT_SUCCESS;
3624
3625                 r = exec_child(unit,
3626                                command,
3627                                context,
3628                                params,
3629                                runtime,
3630                                dcreds,
3631                                socket_fd,
3632                                named_iofds,
3633                                fds,
3634                                n_socket_fds,
3635                                n_storage_fds,
3636                                files_env,
3637                                unit->manager->user_lookup_fds[1],
3638                                &exit_status);
3639
3640                 if (r < 0)
3641                         log_struct_errno(LOG_ERR, r,
3642                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3643                                          LOG_UNIT_ID(unit),
3644                                          LOG_UNIT_INVOCATION_ID(unit),
3645                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3646                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3647                                                           command->path),
3648                                          "EXECUTABLE=%s", command->path);
3649
3650                 _exit(exit_status);
3651         }
3652
3653         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3654
3655         /* We add the new process to the cgroup both in the child (so
3656          * that we can be sure that no user code is ever executed
3657          * outside of the cgroup) and in the parent (so that we can be
3658          * sure that when we kill the cgroup the process will be
3659          * killed too). */
3660         if (params->cgroup_path)
3661                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3662
3663         exec_status_start(&command->exec_status, pid);
3664
3665         *ret = pid;
3666         return 0;
3667 }
3668
3669 void exec_context_init(ExecContext *c) {
3670         ExecDirectoryType i;
3671
3672         assert(c);
3673
3674         c->umask = 0022;
3675         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3676         c->cpu_sched_policy = SCHED_OTHER;
3677         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3678         c->syslog_level_prefix = true;
3679         c->ignore_sigpipe = true;
3680         c->timer_slack_nsec = NSEC_INFINITY;
3681         c->personality = PERSONALITY_INVALID;
3682         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3683                 c->directories[i].mode = 0755;
3684         c->capability_bounding_set = CAP_ALL;
3685         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3686         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3687         c->log_level_max = -1;
3688 }
3689
3690 void exec_context_done(ExecContext *c) {
3691         ExecDirectoryType i;
3692         size_t l;
3693
3694         assert(c);
3695
3696         c->environment = strv_free(c->environment);
3697         c->environment_files = strv_free(c->environment_files);
3698         c->pass_environment = strv_free(c->pass_environment);
3699         c->unset_environment = strv_free(c->unset_environment);
3700
3701         rlimit_free_all(c->rlimit);
3702
3703         for (l = 0; l < 3; l++) {
3704                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3705                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3706         }
3707
3708         c->working_directory = mfree(c->working_directory);
3709         c->root_directory = mfree(c->root_directory);
3710         c->root_image = mfree(c->root_image);
3711         c->tty_path = mfree(c->tty_path);
3712         c->syslog_identifier = mfree(c->syslog_identifier);
3713         c->user = mfree(c->user);
3714         c->group = mfree(c->group);
3715
3716         c->supplementary_groups = strv_free(c->supplementary_groups);
3717
3718         c->pam_name = mfree(c->pam_name);
3719
3720         c->read_only_paths = strv_free(c->read_only_paths);
3721         c->read_write_paths = strv_free(c->read_write_paths);
3722         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3723
3724         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3725         c->bind_mounts = NULL;
3726         c->n_bind_mounts = 0;
3727         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3728         c->temporary_filesystems = NULL;
3729         c->n_temporary_filesystems = 0;
3730
3731         c->cpuset = cpu_set_mfree(c->cpuset);
3732
3733         c->utmp_id = mfree(c->utmp_id);
3734         c->selinux_context = mfree(c->selinux_context);
3735         c->apparmor_profile = mfree(c->apparmor_profile);
3736         c->smack_process_label = mfree(c->smack_process_label);
3737
3738         c->syscall_filter = hashmap_free(c->syscall_filter);
3739         c->syscall_archs = set_free(c->syscall_archs);
3740         c->address_families = set_free(c->address_families);
3741
3742         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3743                 c->directories[i].paths = strv_free(c->directories[i].paths);
3744
3745         c->log_level_max = -1;
3746
3747         exec_context_free_log_extra_fields(c);
3748
3749         c->stdin_data = mfree(c->stdin_data);
3750         c->stdin_data_size = 0;
3751 }
3752
3753 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3754         char **i;
3755
3756         assert(c);
3757
3758         if (!runtime_prefix)
3759                 return 0;
3760
3761         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3762                 _cleanup_free_ char *p;
3763
3764                 p = strjoin(runtime_prefix, "/", *i);
3765                 if (!p)
3766                         return -ENOMEM;
3767
3768                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3769                  * next. */
3770                 (void) rm_rf(p, REMOVE_ROOT);
3771         }
3772
3773         return 0;
3774 }
3775
3776 static void exec_command_done(ExecCommand *c) {
3777         assert(c);
3778
3779         c->path = mfree(c->path);
3780         c->argv = strv_free(c->argv);
3781 }
3782
3783 void exec_command_done_array(ExecCommand *c, size_t n) {
3784         size_t i;
3785
3786         for (i = 0; i < n; i++)
3787                 exec_command_done(c+i);
3788 }
3789
3790 ExecCommand* exec_command_free_list(ExecCommand *c) {
3791         ExecCommand *i;
3792
3793         while ((i = c)) {
3794                 LIST_REMOVE(command, c, i);
3795                 exec_command_done(i);
3796                 free(i);
3797         }
3798
3799         return NULL;
3800 }
3801
3802 void exec_command_free_array(ExecCommand **c, size_t n) {
3803         size_t i;
3804
3805         for (i = 0; i < n; i++)
3806                 c[i] = exec_command_free_list(c[i]);
3807 }
3808
3809 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3810         size_t i;
3811
3812         for (i = 0; i < n; i++)
3813                 exec_status_reset(&c[i].exec_status);
3814 }
3815
3816 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3817         size_t i;
3818
3819         for (i = 0; i < n; i++) {
3820                 ExecCommand *z;
3821
3822                 LIST_FOREACH(command, z, c[i])
3823                         exec_status_reset(&z->exec_status);
3824         }
3825 }
3826
/* Context handed to invalid_env() through its untyped userdata pointer. */
typedef struct InvalidEnvInfo {
        /* Unit the message is logged for */
        const Unit *unit;
        /* Printed after the rejected assignment in the log message; presumably the path of the environment
         * file the assignment came from — confirm at the call site */
        const char *path;
} InvalidEnvInfo;
3831
3832 static void invalid_env(const char *p, void *userdata) {
3833         InvalidEnvInfo *info = userdata;
3834
3835         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3836 }
3837
3838 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3839         assert(c);
3840
3841         switch (fd_index) {
3842
3843         case STDIN_FILENO:
3844                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3845                         return NULL;
3846
3847                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3848
3849         case STDOUT_FILENO:
3850                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3851                         return NULL;
3852
3853                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3854
3855         case STDERR_FILENO:
3856                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3857                         return NULL;
3858
3859                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3860
3861         default:
3862                 return NULL;
3863         }
3864 }
3865
3866 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3867         size_t i, targets;
3868         const char* stdio_fdname[3];
3869         size_t n_fds;
3870
3871         assert(c);
3872         assert(p);
3873
3874         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3875                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3876                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3877
3878         for (i = 0; i < 3; i++)
3879                 stdio_fdname[i] = exec_context_fdname(c, i);
3880
3881         n_fds = p->n_storage_fds + p->n_socket_fds;
3882
3883         for (i = 0; i < n_fds  && targets > 0; i++)
3884                 if (named_iofds[STDIN_FILENO] < 0 &&
3885                     c->std_input == EXEC_INPUT_NAMED_FD &&
3886                     stdio_fdname[STDIN_FILENO] &&
3887                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3888
3889                         named_iofds[STDIN_FILENO] = p->fds[i];
3890                         targets--;
3891
3892                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3893                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3894                            stdio_fdname[STDOUT_FILENO] &&
3895                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3896
3897                         named_iofds[STDOUT_FILENO] = p->fds[i];
3898                         targets--;
3899
3900                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3901                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3902                            stdio_fdname[STDERR_FILENO] &&
3903                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3904
3905                         named_iofds[STDERR_FILENO] = p->fds[i];
3906                         targets--;
3907                 }
3908
3909         return targets == 0 ? 0 : -ENOENT;
3910 }
3911
3912 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3913         char **i, **r = NULL;
3914
3915         assert(c);
3916         assert(l);
3917
3918         STRV_FOREACH(i, c->environment_files) {
3919                 char *fn;
3920                 int k;
3921                 unsigned n;
3922                 bool ignore = false;
3923                 char **p;
3924                 _cleanup_globfree_ glob_t pglob = {};
3925
3926                 fn = *i;
3927
3928                 if (fn[0] == '-') {
3929                         ignore = true;
3930                         fn++;
3931                 }
3932
3933                 if (!path_is_absolute(fn)) {
3934                         if (ignore)
3935                                 continue;
3936
3937                         strv_free(r);
3938                         return -EINVAL;
3939                 }
3940
3941                 /* Filename supports globbing, take all matching files */
3942                 k = safe_glob(fn, 0, &pglob);
3943                 if (k < 0) {
3944                         if (ignore)
3945                                 continue;
3946
3947                         strv_free(r);
3948                         return k;
3949                 }
3950
3951                 /* When we don't match anything, -ENOENT should be returned */
3952                 assert(pglob.gl_pathc > 0);
3953
3954                 for (n = 0; n < pglob.gl_pathc; n++) {
3955                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3956                         if (k < 0) {
3957                                 if (ignore)
3958                                         continue;
3959
3960                                 strv_free(r);
3961                                 return k;
3962                         }
3963                         /* Log invalid environment variables with filename */
3964                         if (p) {
3965                                 InvalidEnvInfo info = {
3966                                         .unit = unit,
3967                                         .path = pglob.gl_pathv[n]
3968                                 };
3969
3970                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3971                         }
3972
3973                         if (!r)
3974                                 r = p;
3975                         else {
3976                                 char **m;
3977
3978                                 m = strv_env_merge(2, r, p);
3979                                 strv_free(r);
3980                                 strv_free(p);
3981                                 if (!m)
3982                                         return -ENOMEM;
3983
3984                                 r = m;
3985                         }
3986                 }
3987         }
3988
3989         *l = r;
3990
3991         return 0;
3992 }
3993
3994 static bool tty_may_match_dev_console(const char *tty) {
3995         _cleanup_free_ char *resolved = NULL;
3996
3997         if (!tty)
3998                 return true;
3999
4000         tty = skip_dev_prefix(tty);
4001
4002         /* trivial identity? */
4003         if (streq(tty, "console"))
4004                 return true;
4005
4006         if (resolve_dev_console(&resolved) < 0)
4007                 return true; /* if we could not resolve, assume it may */
4008
4009         /* "tty0" means the active VC, so it may be the same sometimes */
4010         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4011 }
4012
4013 bool exec_context_may_touch_console(const ExecContext *ec) {
4014
4015         return (ec->tty_reset ||
4016                 ec->tty_vhangup ||
4017                 ec->tty_vt_disallocate ||
4018                 is_terminal_input(ec->std_input) ||
4019                 is_terminal_output(ec->std_output) ||
4020                 is_terminal_output(ec->std_error)) &&
4021                tty_may_match_dev_console(exec_context_tty_path(ec));
4022 }
4023
/* Writes every string of the NULL-terminated string list l to f, each preceded by a single space.
 * A NULL list results in no output. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        for (item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
4032
4033 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4034         ExecDirectoryType dt;
4035         char **e, **d;
4036         unsigned i;
4037         int r;
4038
4039         assert(c);
4040         assert(f);
4041
4042         prefix = strempty(prefix);
4043
4044         fprintf(f,
4045                 "%sUMask: %04o\n"
4046                 "%sWorkingDirectory: %s\n"
4047                 "%sRootDirectory: %s\n"
4048                 "%sNonBlocking: %s\n"
4049                 "%sPrivateTmp: %s\n"
4050                 "%sPrivateDevices: %s\n"
4051                 "%sProtectKernelTunables: %s\n"
4052                 "%sProtectKernelModules: %s\n"
4053                 "%sProtectControlGroups: %s\n"
4054                 "%sPrivateNetwork: %s\n"
4055                 "%sPrivateUsers: %s\n"
4056                 "%sProtectHome: %s\n"
4057                 "%sProtectSystem: %s\n"
4058                 "%sMountAPIVFS: %s\n"
4059                 "%sIgnoreSIGPIPE: %s\n"
4060                 "%sMemoryDenyWriteExecute: %s\n"
4061                 "%sRestrictRealtime: %s\n"
4062                 "%sKeyringMode: %s\n",
4063                 prefix, c->umask,
4064                 prefix, c->working_directory ? c->working_directory : "/",
4065                 prefix, c->root_directory ? c->root_directory : "/",
4066                 prefix, yes_no(c->non_blocking),
4067                 prefix, yes_no(c->private_tmp),
4068                 prefix, yes_no(c->private_devices),
4069                 prefix, yes_no(c->protect_kernel_tunables),
4070                 prefix, yes_no(c->protect_kernel_modules),
4071                 prefix, yes_no(c->protect_control_groups),
4072                 prefix, yes_no(c->private_network),
4073                 prefix, yes_no(c->private_users),
4074                 prefix, protect_home_to_string(c->protect_home),
4075                 prefix, protect_system_to_string(c->protect_system),
4076                 prefix, yes_no(c->mount_apivfs),
4077                 prefix, yes_no(c->ignore_sigpipe),
4078                 prefix, yes_no(c->memory_deny_write_execute),
4079                 prefix, yes_no(c->restrict_realtime),
4080                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4081
4082         if (c->root_image)
4083                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4084
4085         STRV_FOREACH(e, c->environment)
4086                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4087
4088         STRV_FOREACH(e, c->environment_files)
4089                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4090
4091         STRV_FOREACH(e, c->pass_environment)
4092                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4093
4094         STRV_FOREACH(e, c->unset_environment)
4095                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4096
4097         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4098
4099         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4100                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4101
4102                 STRV_FOREACH(d, c->directories[dt].paths)
4103                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4104         }
4105
4106         if (c->nice_set)
4107                 fprintf(f,
4108                         "%sNice: %i\n",
4109                         prefix, c->nice);
4110
4111         if (c->oom_score_adjust_set)
4112                 fprintf(f,
4113                         "%sOOMScoreAdjust: %i\n",
4114                         prefix, c->oom_score_adjust);
4115
4116         for (i = 0; i < RLIM_NLIMITS; i++)
4117                 if (c->rlimit[i]) {
4118                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4119                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4120                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4121                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4122                 }
4123
4124         if (c->ioprio_set) {
4125                 _cleanup_free_ char *class_str = NULL;
4126
4127                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4128                 if (r >= 0)
4129                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4130
4131                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4132         }
4133
4134         if (c->cpu_sched_set) {
4135                 _cleanup_free_ char *policy_str = NULL;
4136
4137                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4138                 if (r >= 0)
4139                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4140
4141                 fprintf(f,
4142                         "%sCPUSchedulingPriority: %i\n"
4143                         "%sCPUSchedulingResetOnFork: %s\n",
4144                         prefix, c->cpu_sched_priority,
4145                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4146         }
4147
4148         if (c->cpuset) {
4149                 fprintf(f, "%sCPUAffinity:", prefix);
4150                 for (i = 0; i < c->cpuset_ncpus; i++)
4151                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4152                                 fprintf(f, " %u", i);
4153                 fputs("\n", f);
4154         }
4155
4156         if (c->timer_slack_nsec != NSEC_INFINITY)
4157                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4158
4159         fprintf(f,
4160                 "%sStandardInput: %s\n"
4161                 "%sStandardOutput: %s\n"
4162                 "%sStandardError: %s\n",
4163                 prefix, exec_input_to_string(c->std_input),
4164                 prefix, exec_output_to_string(c->std_output),
4165                 prefix, exec_output_to_string(c->std_error));
4166
4167         if (c->std_input == EXEC_INPUT_NAMED_FD)
4168                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4169         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4170                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4171         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4172                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4173
4174         if (c->std_input == EXEC_INPUT_FILE)
4175                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4176         if (c->std_output == EXEC_OUTPUT_FILE)
4177                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4178         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4179                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4180         if (c->std_error == EXEC_OUTPUT_FILE)
4181                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4182         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4183                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4184
4185         if (c->tty_path)
4186                 fprintf(f,
4187                         "%sTTYPath: %s\n"
4188                         "%sTTYReset: %s\n"
4189                         "%sTTYVHangup: %s\n"
4190                         "%sTTYVTDisallocate: %s\n",
4191                         prefix, c->tty_path,
4192                         prefix, yes_no(c->tty_reset),
4193                         prefix, yes_no(c->tty_vhangup),
4194                         prefix, yes_no(c->tty_vt_disallocate));
4195
4196         if (IN_SET(c->std_output,
4197                    EXEC_OUTPUT_SYSLOG,
4198                    EXEC_OUTPUT_KMSG,
4199                    EXEC_OUTPUT_JOURNAL,
4200                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4201                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4202                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4203             IN_SET(c->std_error,
4204                    EXEC_OUTPUT_SYSLOG,
4205                    EXEC_OUTPUT_KMSG,
4206                    EXEC_OUTPUT_JOURNAL,
4207                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4208                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4209                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4210
4211                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4212
4213                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4214                 if (r >= 0)
4215                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4216
4217                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4218                 if (r >= 0)
4219                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4220         }
4221
4222         if (c->log_level_max >= 0) {
4223                 _cleanup_free_ char *t = NULL;
4224
4225                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4226
4227                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4228         }
4229
4230         if (c->n_log_extra_fields > 0) {
4231                 size_t j;
4232
4233                 for (j = 0; j < c->n_log_extra_fields; j++) {
4234                         fprintf(f, "%sLogExtraFields: ", prefix);
4235                         fwrite(c->log_extra_fields[j].iov_base,
4236                                1, c->log_extra_fields[j].iov_len,
4237                                f);
4238                         fputc('\n', f);
4239                 }
4240         }
4241
4242         if (c->secure_bits) {
4243                 _cleanup_free_ char *str = NULL;
4244
4245                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4246                 if (r >= 0)
4247                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4248         }
4249
4250         if (c->capability_bounding_set != CAP_ALL) {
4251                 _cleanup_free_ char *str = NULL;
4252
4253                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4254                 if (r >= 0)
4255                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4256         }
4257
4258         if (c->capability_ambient_set != 0) {
4259                 _cleanup_free_ char *str = NULL;
4260
4261                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4262                 if (r >= 0)
4263                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4264         }
4265
4266         if (c->user)
4267                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4268         if (c->group)
4269                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4270
4271         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4272
4273         if (!strv_isempty(c->supplementary_groups)) {
4274                 fprintf(f, "%sSupplementaryGroups:", prefix);
4275                 strv_fprintf(f, c->supplementary_groups);
4276                 fputs("\n", f);
4277         }
4278
4279         if (c->pam_name)
4280                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4281
4282         if (!strv_isempty(c->read_write_paths)) {
4283                 fprintf(f, "%sReadWritePaths:", prefix);
4284                 strv_fprintf(f, c->read_write_paths);
4285                 fputs("\n", f);
4286         }
4287
4288         if (!strv_isempty(c->read_only_paths)) {
4289                 fprintf(f, "%sReadOnlyPaths:", prefix);
4290                 strv_fprintf(f, c->read_only_paths);
4291                 fputs("\n", f);
4292         }
4293
4294         if (!strv_isempty(c->inaccessible_paths)) {
4295                 fprintf(f, "%sInaccessiblePaths:", prefix);
4296                 strv_fprintf(f, c->inaccessible_paths);
4297                 fputs("\n", f);
4298         }
4299
4300         if (c->n_bind_mounts > 0)
4301                 for (i = 0; i < c->n_bind_mounts; i++)
4302                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4303                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4304                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4305                                 c->bind_mounts[i].source,
4306                                 c->bind_mounts[i].destination,
4307                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4308
4309         if (c->n_temporary_filesystems > 0)
4310                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4311                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4312
4313                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4314                                 t->path,
4315                                 isempty(t->options) ? "" : ":",
4316                                 strempty(t->options));
4317                 }
4318
4319         if (c->utmp_id)
4320                 fprintf(f,
4321                         "%sUtmpIdentifier: %s\n",
4322                         prefix, c->utmp_id);
4323
4324         if (c->selinux_context)
4325                 fprintf(f,
4326                         "%sSELinuxContext: %s%s\n",
4327                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4328
4329         if (c->apparmor_profile)
4330                 fprintf(f,
4331                         "%sAppArmorProfile: %s%s\n",
4332                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4333
4334         if (c->smack_process_label)
4335                 fprintf(f,
4336                         "%sSmackProcessLabel: %s%s\n",
4337                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4338
4339         if (c->personality != PERSONALITY_INVALID)
4340                 fprintf(f,
4341                         "%sPersonality: %s\n",
4342                         prefix, strna(personality_to_string(c->personality)));
4343
4344         fprintf(f,
4345                 "%sLockPersonality: %s\n",
4346                 prefix, yes_no(c->lock_personality));
4347
4348         if (c->syscall_filter) {
4349 #if HAVE_SECCOMP
4350                 Iterator j;
4351                 void *id, *val;
4352                 bool first = true;
4353 #endif
4354
4355                 fprintf(f,
4356                         "%sSystemCallFilter: ",
4357                         prefix);
4358
4359                 if (!c->syscall_whitelist)
4360                         fputc('~', f);
4361
4362 #if HAVE_SECCOMP
4363                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4364                         _cleanup_free_ char *name = NULL;
4365                         const char *errno_name = NULL;
4366                         int num = PTR_TO_INT(val);
4367
4368                         if (first)
4369                                 first = false;
4370                         else
4371                                 fputc(' ', f);
4372
4373                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4374                         fputs(strna(name), f);
4375
4376                         if (num >= 0) {
4377                                 errno_name = errno_to_name(num);
4378                                 if (errno_name)
4379                                         fprintf(f, ":%s", errno_name);
4380                                 else
4381                                         fprintf(f, ":%d", num);
4382                         }
4383                 }
4384 #endif
4385
4386                 fputc('\n', f);
4387         }
4388
4389         if (c->syscall_archs) {
4390 #if HAVE_SECCOMP
4391                 Iterator j;
4392                 void *id;
4393 #endif
4394
4395                 fprintf(f,
4396                         "%sSystemCallArchitectures:",
4397                         prefix);
4398
4399 #if HAVE_SECCOMP
4400                 SET_FOREACH(id, c->syscall_archs, j)
4401                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4402 #endif
4403                 fputc('\n', f);
4404         }
4405
4406         if (exec_context_restrict_namespaces_set(c)) {
4407                 _cleanup_free_ char *s = NULL;
4408
4409                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4410                 if (r >= 0)
4411                         fprintf(f, "%sRestrictNamespaces: %s\n",
4412                                 prefix, s);
4413         }
4414
4415         if (c->syscall_errno > 0) {
4416                 const char *errno_name;
4417
4418                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4419
4420                 errno_name = errno_to_name(c->syscall_errno);
4421                 if (errno_name)
4422                         fprintf(f, "%s\n", errno_name);
4423                 else
4424                         fprintf(f, "%d\n", c->syscall_errno);
4425         }
4426
4427         if (c->apparmor_profile)
4428                 fprintf(f,
4429                         "%sAppArmorProfile: %s%s\n",
4430                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4431 }
4432
4433 bool exec_context_maintains_privileges(const ExecContext *c) {
4434         assert(c);
4435
4436         /* Returns true if the process forked off would run under
4437          * an unchanged UID or as root. */
4438
4439         if (!c->user)
4440                 return true;
4441
4442         if (streq(c->user, "root") || streq(c->user, "0"))
4443                 return true;
4444
4445         return false;
4446 }
4447
4448 int exec_context_get_effective_ioprio(const ExecContext *c) {
4449         int p;
4450
4451         assert(c);
4452
4453         if (c->ioprio_set)
4454                 return c->ioprio;
4455
4456         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4457         if (p < 0)
4458                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4459
4460         return p;
4461 }
4462
4463 void exec_context_free_log_extra_fields(ExecContext *c) {
4464         size_t l;
4465
4466         assert(c);
4467
4468         for (l = 0; l < c->n_log_extra_fields; l++)
4469                 free(c->log_extra_fields[l].iov_base);
4470         c->log_extra_fields = mfree(c->log_extra_fields);
4471         c->n_log_extra_fields = 0;
4472 }
4473
4474 void exec_status_start(ExecStatus *s, pid_t pid) {
4475         assert(s);
4476
4477         *s = (ExecStatus) {
4478                 .pid = pid,
4479         };
4480
4481         dual_timestamp_get(&s->start_timestamp);
4482 }
4483
4484 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4485         assert(s);
4486
4487         if (s->pid != pid) {
4488                 *s = (ExecStatus) {
4489                         .pid = pid,
4490                 };
4491         }
4492
4493         dual_timestamp_get(&s->exit_timestamp);
4494
4495         s->code = code;
4496         s->status = status;
4497
4498         if (context) {
4499                 if (context->utmp_id)
4500                         (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4501
4502                 exec_context_tty_reset(context, NULL);
4503         }
4504 }
4505
4506 void exec_status_reset(ExecStatus *s) {
4507         assert(s);
4508
4509         *s = (ExecStatus) {};
4510 }
4511
4512 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4513         char buf[FORMAT_TIMESTAMP_MAX];
4514
4515         assert(s);
4516         assert(f);
4517
4518         if (s->pid <= 0)
4519                 return;
4520
4521         prefix = strempty(prefix);
4522
4523         fprintf(f,
4524                 "%sPID: "PID_FMT"\n",
4525                 prefix, s->pid);
4526
4527         if (dual_timestamp_is_set(&s->start_timestamp))
4528                 fprintf(f,
4529                         "%sStart Timestamp: %s\n",
4530                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4531
4532         if (dual_timestamp_is_set(&s->exit_timestamp))
4533                 fprintf(f,
4534                         "%sExit Timestamp: %s\n"
4535                         "%sExit Code: %s\n"
4536                         "%sExit Status: %i\n",
4537                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4538                         prefix, sigchld_code_to_string(s->code),
4539                         prefix, s->status);
4540 }
4541
4542 static char *exec_command_line(char **argv) {
4543         size_t k;
4544         char *n, *p, **a;
4545         bool first = true;
4546
4547         assert(argv);
4548
4549         k = 1;
4550         STRV_FOREACH(a, argv)
4551                 k += strlen(*a)+3;
4552
4553         n = new(char, k);
4554         if (!n)
4555                 return NULL;
4556
4557         p = n;
4558         STRV_FOREACH(a, argv) {
4559
4560                 if (!first)
4561                         *(p++) = ' ';
4562                 else
4563                         first = false;
4564
4565                 if (strpbrk(*a, WHITESPACE)) {
4566                         *(p++) = '\'';
4567                         p = stpcpy(p, *a);
4568                         *(p++) = '\'';
4569                 } else
4570                         p = stpcpy(p, *a);
4571
4572         }
4573
4574         *p = 0;
4575
4576         /* FIXME: this doesn't really handle arguments that have
4577          * spaces and ticks in them */
4578
4579         return n;
4580 }
4581
4582 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4583         _cleanup_free_ char *cmd = NULL;
4584         const char *prefix2;
4585
4586         assert(c);
4587         assert(f);
4588
4589         prefix = strempty(prefix);
4590         prefix2 = strjoina(prefix, "\t");
4591
4592         cmd = exec_command_line(c->argv);
4593         fprintf(f,
4594                 "%sCommand Line: %s\n",
4595                 prefix, cmd ? cmd : strerror(ENOMEM));
4596
4597         exec_status_dump(&c->exec_status, f, prefix2);
4598 }
4599
4600 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4601         assert(f);
4602
4603         prefix = strempty(prefix);
4604
4605         LIST_FOREACH(command, c, c)
4606                 exec_command_dump(c, f, prefix);
4607 }
4608
4609 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4610         ExecCommand *end;
4611
4612         assert(l);
4613         assert(e);
4614
4615         if (*l) {
4616                 /* It's kind of important, that we keep the order here */
4617                 LIST_FIND_TAIL(command, *l, end);
4618                 LIST_INSERT_AFTER(command, *l, end, e);
4619         } else
4620               *l = e;
4621 }
4622
4623 int exec_command_set(ExecCommand *c, const char *path, ...) {
4624         va_list ap;
4625         char **l, *p;
4626
4627         assert(c);
4628         assert(path);
4629
4630         va_start(ap, path);
4631         l = strv_new_ap(path, ap);
4632         va_end(ap);
4633
4634         if (!l)
4635                 return -ENOMEM;
4636
4637         p = strdup(path);
4638         if (!p) {
4639                 strv_free(l);
4640                 return -ENOMEM;
4641         }
4642
4643         free(c->path);
4644         c->path = p;
4645
4646         return strv_free_and_replace(c->argv, l);
4647 }
4648
4649 int exec_command_append(ExecCommand *c, const char *path, ...) {
4650         _cleanup_strv_free_ char **l = NULL;
4651         va_list ap;
4652         int r;
4653
4654         assert(c);
4655         assert(path);
4656
4657         va_start(ap, path);
4658         l = strv_new_ap(path, ap);
4659         va_end(ap);
4660
4661         if (!l)
4662                 return -ENOMEM;
4663
4664         r = strv_extend_strv(&c->argv, l, false);
4665         if (r < 0)
4666                 return r;
4667
4668         return 0;
4669 }
4670
4671 static void *remove_tmpdir_thread(void *p) {
4672         _cleanup_free_ char *path = p;
4673
4674         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4675         return NULL;
4676 }
4677
4678 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4679         int r;
4680
4681         if (!rt)
4682                 return NULL;
4683
4684         if (rt->manager)
4685                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4686
4687         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4688         if (destroy && rt->tmp_dir) {
4689                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4690
4691                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4692                 if (r < 0) {
4693                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4694                         free(rt->tmp_dir);
4695                 }
4696
4697                 rt->tmp_dir = NULL;
4698         }
4699
4700         if (destroy && rt->var_tmp_dir) {
4701                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4702
4703                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4704                 if (r < 0) {
4705                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4706                         free(rt->var_tmp_dir);
4707                 }
4708
4709                 rt->var_tmp_dir = NULL;
4710         }
4711
4712         rt->id = mfree(rt->id);
4713         rt->tmp_dir = mfree(rt->tmp_dir);
4714         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4715         safe_close_pair(rt->netns_storage_socket);
4716         return mfree(rt);
4717 }
4718
4719 static void exec_runtime_freep(ExecRuntime **rt) {
4720         if (*rt)
4721                 (void) exec_runtime_free(*rt, false);
4722 }
4723
4724 static int exec_runtime_allocate(ExecRuntime **rt) {
4725         assert(rt);
4726
4727         *rt = new0(ExecRuntime, 1);
4728         if (!*rt)
4729                 return -ENOMEM;
4730
4731         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4732         return 0;
4733 }
4734
4735 static int exec_runtime_add(
4736                 Manager *m,
4737                 const char *id,
4738                 const char *tmp_dir,
4739                 const char *var_tmp_dir,
4740                 const int netns_storage_socket[2],
4741                 ExecRuntime **ret) {
4742
4743         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4744         int r;
4745
4746         assert(m);
4747         assert(id);
4748
4749         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4750         if (r < 0)
4751                 return r;
4752
4753         r = exec_runtime_allocate(&rt);
4754         if (r < 0)
4755                 return r;
4756
4757         rt->id = strdup(id);
4758         if (!rt->id)
4759                 return -ENOMEM;
4760
4761         if (tmp_dir) {
4762                 rt->tmp_dir = strdup(tmp_dir);
4763                 if (!rt->tmp_dir)
4764                         return -ENOMEM;
4765
4766                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4767                 assert(var_tmp_dir);
4768                 rt->var_tmp_dir = strdup(var_tmp_dir);
4769                 if (!rt->var_tmp_dir)
4770                         return -ENOMEM;
4771         }
4772
4773         if (netns_storage_socket) {
4774                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4775                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4776         }
4777
4778         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4779         if (r < 0)
4780                 return r;
4781
4782         rt->manager = m;
4783
4784         if (ret)
4785                 *ret = rt;
4786
4787         /* do not remove created ExecRuntime object when the operation succeeds. */
4788         rt = NULL;
4789         return 0;
4790 }
4791
4792 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4793         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4794         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4795         int r;
4796
4797         assert(m);
4798         assert(c);
4799         assert(id);
4800
4801         /* It is not necessary to create ExecRuntime object. */
4802         if (!c->private_network && !c->private_tmp)
4803                 return 0;
4804
4805         if (c->private_tmp) {
4806                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4807                 if (r < 0)
4808                         return r;
4809         }
4810
4811         if (c->private_network) {
4812                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4813                         return -errno;
4814         }
4815
4816         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4817         if (r < 0)
4818                 return r;
4819
4820         /* Avoid cleanup */
4821         netns_storage_socket[0] = -1;
4822         netns_storage_socket[1] = -1;
4823         return 1;
4824 }
4825
4826 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4827         ExecRuntime *rt;
4828         int r;
4829
4830         assert(m);
4831         assert(id);
4832         assert(ret);
4833
4834         rt = hashmap_get(m->exec_runtime_by_id, id);
4835         if (rt)
4836                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4837                 goto ref;
4838
4839         if (!create)
4840                 return 0;
4841
4842         /* If not found, then create a new object. */
4843         r = exec_runtime_make(m, c, id, &rt);
4844         if (r <= 0)
4845                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4846                 return r;
4847
4848 ref:
4849         /* increment reference counter. */
4850         rt->n_ref++;
4851         *ret = rt;
4852         return 1;
4853 }
4854
4855 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4856         if (!rt)
4857                 return NULL;
4858
4859         assert(rt->n_ref > 0);
4860
4861         rt->n_ref--;
4862         if (rt->n_ref > 0)
4863                 return NULL;
4864
4865         return exec_runtime_free(rt, destroy);
4866 }
4867
4868 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4869         ExecRuntime *rt;
4870         Iterator i;
4871
4872         assert(m);
4873         assert(f);
4874         assert(fds);
4875
4876         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4877                 fprintf(f, "exec-runtime=%s", rt->id);
4878
4879                 if (rt->tmp_dir)
4880                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4881
4882                 if (rt->var_tmp_dir)
4883                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4884
4885                 if (rt->netns_storage_socket[0] >= 0) {
4886                         int copy;
4887
4888                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4889                         if (copy < 0)
4890                                 return copy;
4891
4892                         fprintf(f, " netns-socket-0=%i", copy);
4893                 }
4894
4895                 if (rt->netns_storage_socket[1] >= 0) {
4896                         int copy;
4897
4898                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4899                         if (copy < 0)
4900                                 return copy;
4901
4902                         fprintf(f, " netns-socket-1=%i", copy);
4903                 }
4904
4905                 fputc('\n', f);
4906         }
4907
4908         return 0;
4909 }
4910
4911 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4912         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4913         ExecRuntime *rt;
4914         int r;
4915
4916         /* This is for the migration from old (v237 or earlier) deserialization text.
4917          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4918          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4919          * so or not from the serialized text, then we always creates a new object owned by this. */
4920
4921         assert(u);
4922         assert(key);
4923         assert(value);
4924
4925         /* Manager manages ExecRuntime objects by the unit id.
4926          * So, we omit the serialized text when the unit does not have id (yet?)... */
4927         if (isempty(u->id)) {
4928                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4929                 return 0;
4930         }
4931
4932         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4933         if (r < 0) {
4934                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4935                 return 0;
4936         }
4937
4938         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4939         if (!rt) {
4940                 r = exec_runtime_allocate(&rt_create);
4941                 if (r < 0)
4942                         return log_oom();
4943
4944                 rt_create->id = strdup(u->id);
4945                 if (!rt_create->id)
4946                         return log_oom();
4947
4948                 rt = rt_create;
4949         }
4950
4951         if (streq(key, "tmp-dir")) {
4952                 char *copy;
4953
4954                 copy = strdup(value);
4955                 if (!copy)
4956                         return log_oom();
4957
4958                 free_and_replace(rt->tmp_dir, copy);
4959
4960         } else if (streq(key, "var-tmp-dir")) {
4961                 char *copy;
4962
4963                 copy = strdup(value);
4964                 if (!copy)
4965                         return log_oom();
4966
4967                 free_and_replace(rt->var_tmp_dir, copy);
4968
4969         } else if (streq(key, "netns-socket-0")) {
4970                 int fd;
4971
4972                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4973                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4974                         return 0;
4975                 }
4976
4977                 safe_close(rt->netns_storage_socket[0]);
4978                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4979
4980         } else if (streq(key, "netns-socket-1")) {
4981                 int fd;
4982
4983                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4984                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4985                         return 0;
4986                 }
4987
4988                 safe_close(rt->netns_storage_socket[1]);
4989                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4990         } else
4991                 return 0;
4992
4993         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4994         if (rt_create) {
4995                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4996                 if (r < 0) {
4997                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
4998                         return 0;
4999                 }
5000
5001                 rt_create->manager = u->manager;
5002
5003                 /* Avoid cleanup */
5004                 rt_create = NULL;
5005         }
5006
5007         return 1;
5008 }
5009
5010 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5011         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5012         int r, fd0 = -1, fd1 = -1;
5013         const char *p, *v = value;
5014         size_t n;
5015
5016         assert(m);
5017         assert(value);
5018         assert(fds);
5019
5020         n = strcspn(v, " ");
5021         id = strndupa(v, n);
5022         if (v[n] != ' ')
5023                 goto finalize;
5024         p = v + n + 1;
5025
5026         v = startswith(p, "tmp-dir=");
5027         if (v) {
5028                 n = strcspn(v, " ");
5029                 tmp_dir = strndupa(v, n);
5030                 if (v[n] != ' ')
5031                         goto finalize;
5032                 p = v + n + 1;
5033         }
5034
5035         v = startswith(p, "var-tmp-dir=");
5036         if (v) {
5037                 n = strcspn(v, " ");
5038                 var_tmp_dir = strndupa(v, n);
5039                 if (v[n] != ' ')
5040                         goto finalize;
5041                 p = v + n + 1;
5042         }
5043
5044         v = startswith(p, "netns-socket-0=");
5045         if (v) {
5046                 char *buf;
5047
5048                 n = strcspn(v, " ");
5049                 buf = strndupa(v, n);
5050                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5051                         log_debug("Unable to process exec-runtime netns fd specification.");
5052                         return;
5053                 }
5054                 fd0 = fdset_remove(fds, fd0);
5055                 if (v[n] != ' ')
5056                         goto finalize;
5057                 p = v + n + 1;
5058         }
5059
5060         v = startswith(p, "netns-socket-1=");
5061         if (v) {
5062                 char *buf;
5063
5064                 n = strcspn(v, " ");
5065                 buf = strndupa(v, n);
5066                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5067                         log_debug("Unable to process exec-runtime netns fd specification.");
5068                         return;
5069                 }
5070                 fd1 = fdset_remove(fds, fd1);
5071         }
5072
5073 finalize:
5074
5075         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5076         if (r < 0) {
5077                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5078                 return;
5079         }
5080 }
5081
5082 void exec_runtime_vacuum(Manager *m) {
5083         ExecRuntime *rt;
5084         Iterator i;
5085
5086         assert(m);
5087
5088         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5089
5090         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5091                 if (rt->n_ref > 0)
5092                         continue;
5093
5094                 (void) exec_runtime_free(rt, false);
5095         }
5096 }
5097
5098 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5099         [EXEC_INPUT_NULL] = "null",
5100         [EXEC_INPUT_TTY] = "tty",
5101         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5102         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5103         [EXEC_INPUT_SOCKET] = "socket",
5104         [EXEC_INPUT_NAMED_FD] = "fd",
5105         [EXEC_INPUT_DATA] = "data",
5106         [EXEC_INPUT_FILE] = "file",
5107 };
5108
5109 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5110
5111 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5112         [EXEC_OUTPUT_INHERIT] = "inherit",
5113         [EXEC_OUTPUT_NULL] = "null",
5114         [EXEC_OUTPUT_TTY] = "tty",
5115         [EXEC_OUTPUT_SYSLOG] = "syslog",
5116         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5117         [EXEC_OUTPUT_KMSG] = "kmsg",
5118         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5119         [EXEC_OUTPUT_JOURNAL] = "journal",
5120         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5121         [EXEC_OUTPUT_SOCKET] = "socket",
5122         [EXEC_OUTPUT_NAMED_FD] = "fd",
5123         [EXEC_OUTPUT_FILE] = "file",
5124         [EXEC_OUTPUT_FILE_APPEND] = "append",
5125 };
5126
5127 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5128
5129 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5130         [EXEC_UTMP_INIT] = "init",
5131         [EXEC_UTMP_LOGIN] = "login",
5132         [EXEC_UTMP_USER] = "user",
5133 };
5134
5135 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5136
5137 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5138         [EXEC_PRESERVE_NO] = "no",
5139         [EXEC_PRESERVE_YES] = "yes",
5140         [EXEC_PRESERVE_RESTART] = "restart",
5141 };
5142
5143 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5144
5145 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5146         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5147         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5148         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5149         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5150         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5151 };
5152
5153 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5154
5155 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5156         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5157         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5158         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5159         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5160         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5161 };
5162
5163 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5164
5165 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5166         [EXEC_KEYRING_INHERIT] = "inherit",
5167         [EXEC_KEYRING_PRIVATE] = "private",
5168         [EXEC_KEYRING_SHARED] = "shared",
5169 };
5170
5171 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);