src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6 ***/
   7
   8 #include <errno.h>
   9 #include <fcntl.h>
  10 #include <glob.h>
  11 #include <grp.h>
  12 #include <poll.h>
  13 #include <signal.h>
  14 #include <string.h>
  15 #include <sys/capability.h>
  16 #include <sys/eventfd.h>
  17 #include <sys/mman.h>
  18 #include <sys/personality.h>
  19 #include <sys/prctl.h>
  20 #include <sys/shm.h>
  21 #include <sys/socket.h>
  22 #include <sys/stat.h>
  23 #include <sys/types.h>
  24 #include <sys/un.h>
  25 #include <unistd.h>
  26 #include <utmpx.h>
  27
  28 #if HAVE_PAM
  29 #include <security/pam_appl.h>
  30 #endif
  31
  32 #if HAVE_SELINUX
  33 #include <selinux/selinux.h>
  34 #endif
  35
  36 #if HAVE_SECCOMP
  37 #include <seccomp.h>
  38 #endif
  39
  40 #if HAVE_APPARMOR
  41 #include <sys/apparmor.h>
  42 #endif
  43
  44 #include "sd-messages.h"
  45
  46 #include "af-list.h"
  47 #include "alloc-util.h"
  48 #if HAVE_APPARMOR
  49 #include "apparmor-util.h"
  50 #endif
  51 #include "async.h"
  52 #include "barrier.h"
  53 #include "cap-list.h"
  54 #include "capability-util.h"
  55 #include "chown-recursive.h"
  56 #include "cpu-set-util.h"
  57 #include "def.h"
  58 #include "env-util.h"
  59 #include "errno-list.h"
  60 #include "execute.h"
  61 #include "exit-status.h"
  62 #include "fd-util.h"
  63 #include "fileio.h"
  64 #include "format-util.h"
  65 #include "fs-util.h"
  66 #include "glob-util.h"
  67 #include "io-util.h"
  68 #include "ioprio.h"
  69 #include "label.h"
  70 #include "log.h"
  71 #include "macro.h"
  72 #include "manager.h"
  73 #include "missing.h"
  74 #include "mkdir.h"
  75 #include "namespace.h"
  76 #include "parse-util.h"
  77 #include "path-util.h"
  78 #include "process-util.h"
  79 #include "rlimit-util.h"
  80 #include "rm-rf.h"
  81 #if HAVE_SECCOMP
  82 #include "seccomp-util.h"
  83 #endif
  84 #include "securebits.h"
  85 #include "securebits-util.h"
  86 #include "selinux-util.h"
  87 #include "signal-util.h"
  88 #include "smack-util.h"
  89 #include "socket-util.h"
  90 #include "special.h"
  91 #include "stat-util.h"
  92 #include "string-table.h"
  93 #include "string-util.h"
  94 #include "strv.h"
  95 #include "syslog-util.h"
  96 #include "terminal-util.h"
  97 #include "unit.h"
  98 #include "user-util.h"
  99 #include "util.h"
 100 #include "utmp-wtmp.h"
 101
 102 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 103 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 104
 105 /* This assumes there is a 'tty' group */
 106 #define TTY_MODE 0620
 107
 108 #define SNDBUF_SIZE (8*1024*1024)
 109
 110 static int shift_fds(int fds[], size_t n_fds) {
 111         int start, restart_from;
 112
 113         if (n_fds <= 0)
 114                 return 0;
 115
 116         /* Modifies the fds array! (sorts it) */
 117
 118         assert(fds);
 119
 120         start = 0;
 121         for (;;) {
 122                 int i;
 123
 124                 restart_from = -1;
 125
 126                 for (i = start; i < (int) n_fds; i++) {
 127                         int nfd;
 128
 129                         /* Already at right index? */
 130                         if (fds[i] == i+3)
 131                                 continue;
 132
 133                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 134                         if (nfd < 0)
 135                                 return -errno;
 136
 137                         safe_close(fds[i]);
 138                         fds[i] = nfd;
 139
 140                         /* Hmm, the fd we wanted isn't free? Then
 141                          * let's remember that and try again from here */
 142                         if (nfd != i+3 && restart_from < 0)
 143                                 restart_from = i;
 144                 }
 145
 146                 if (restart_from < 0)
 147                         break;
 148
 149                 start = restart_from;
 150         }
 151
 152         return 0;
 153 }
 154
 155 static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) {
 156         size_t i, n_fds;
 157         int r;
 158
 159         n_fds = n_storage_fds + n_socket_fds;
 160         if (n_fds <= 0)
 161                 return 0;
 162
 163         assert(fds);
 164
 165         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 166          * O_NONBLOCK only applies to socket activation though. */
 167
 168         for (i = 0; i < n_fds; i++) {
 169
 170                 if (i < n_socket_fds) {
 171                         r = fd_nonblock(fds[i], nonblock);
 172                         if (r < 0)
 173                                 return r;
 174                 }
 175
 176                 /* We unconditionally drop FD_CLOEXEC from the fds,
 177                  * since after all we want to pass these fds to our
 178                  * children */
 179
 180                 r = fd_cloexec(fds[i], false);
 181                 if (r < 0)
 182                         return r;
 183         }
 184
 185         return 0;
 186 }
 187
 188 static const char *exec_context_tty_path(const ExecContext *context) {
 189         assert(context);
 190
 191         if (context->stdio_as_fds)
 192                 return NULL;
 193
 194         if (context->tty_path)
 195                 return context->tty_path;
 196
 197         return "/dev/console";
 198 }
 199
 200 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 201         const char *path;
 202
 203         assert(context);
 204
 205         path = exec_context_tty_path(context);
 206
 207         if (context->tty_vhangup) {
 208                 if (p && p->stdin_fd >= 0)
 209                         (void) terminal_vhangup_fd(p->stdin_fd);
 210                 else if (path)
 211                         (void) terminal_vhangup(path);
 212         }
 213
 214         if (context->tty_reset) {
 215                 if (p && p->stdin_fd >= 0)
 216                         (void) reset_terminal_fd(p->stdin_fd, true);
 217                 else if (path)
 218                         (void) reset_terminal(path);
 219         }
 220
 221         if (context->tty_vt_disallocate && path)
 222                 (void) vt_disallocate(path);
 223 }
 224
 225 static bool is_terminal_input(ExecInput i) {
 226         return IN_SET(i,
 227                       EXEC_INPUT_TTY,
 228                       EXEC_INPUT_TTY_FORCE,
 229                       EXEC_INPUT_TTY_FAIL);
 230 }
 231
 232 static bool is_terminal_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_TTY,
 235                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 236                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 237                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 238 }
 239
 240 static bool is_syslog_output(ExecOutput o) {
 241         return IN_SET(o,
 242                       EXEC_OUTPUT_SYSLOG,
 243                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 244 }
 245
 246 static bool is_kmsg_output(ExecOutput o) {
 247         return IN_SET(o,
 248                       EXEC_OUTPUT_KMSG,
 249                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 250 }
 251
 252 static bool exec_context_needs_term(const ExecContext *c) {
 253         assert(c);
 254
 255         /* Return true if the execution context suggests we should set $TERM to something useful. */
 256
 257         if (is_terminal_input(c->std_input))
 258                 return true;
 259
 260         if (is_terminal_output(c->std_output))
 261                 return true;
 262
 263         if (is_terminal_output(c->std_error))
 264                 return true;
 265
 266         return !!c->tty_path;
 267 }
 268
 269 static int open_null_as(int flags, int nfd) {
 270         int fd;
 271
 272         assert(nfd >= 0);
 273
 274         fd = open("/dev/null", flags|O_NOCTTY);
 275         if (fd < 0)
 276                 return -errno;
 277
 278         return move_fd(fd, nfd, false);
 279 }
 280
 281 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 282         static const union sockaddr_union sa = {
 283                 .un.sun_family = AF_UNIX,
 284                 .un.sun_path = "/run/systemd/journal/stdout",
 285         };
 286         uid_t olduid = UID_INVALID;
 287         gid_t oldgid = GID_INVALID;
 288         int r;
 289
 290         if (gid_is_valid(gid)) {
 291                 oldgid = getgid();
 292
 293                 if (setegid(gid) < 0)
 294                         return -errno;
 295         }
 296
 297         if (uid_is_valid(uid)) {
 298                 olduid = getuid();
 299
 300                 if (seteuid(uid) < 0) {
 301                         r = -errno;
 302                         goto restore_gid;
 303                 }
 304         }
 305
 306         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 307
 308         /* If we fail to restore the uid or gid, things will likely
 309            fail later on. This should only happen if an LSM interferes. */
 310
 311         if (uid_is_valid(uid))
 312                 (void) seteuid(olduid);
 313
 314  restore_gid:
 315         if (gid_is_valid(gid))
 316                 (void) setegid(oldgid);
 317
 318         return r;
 319 }
 320
 321 static int connect_logger_as(
 322                 const Unit *unit,
 323                 const ExecContext *context,
 324                 const ExecParameters *params,
 325                 ExecOutput output,
 326                 const char *ident,
 327                 int nfd,
 328                 uid_t uid,
 329                 gid_t gid) {
 330
 331         int fd, r;
 332
 333         assert(context);
 334         assert(params);
 335         assert(output < _EXEC_OUTPUT_MAX);
 336         assert(ident);
 337         assert(nfd >= 0);
 338
 339         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         r = connect_journal_socket(fd, uid, gid);
 344         if (r < 0)
 345                 return r;
 346
 347         if (shutdown(fd, SHUT_RD) < 0) {
 348                 safe_close(fd);
 349                 return -errno;
 350         }
 351
 352         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 353
 354         dprintf(fd,
 355                 "%s\n"
 356                 "%s\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n"
 360                 "%i\n"
 361                 "%i\n",
 362                 context->syslog_identifier ?: ident,
 363                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 364                 context->syslog_priority,
 365                 !!context->syslog_level_prefix,
 366                 is_syslog_output(output),
 367                 is_kmsg_output(output),
 368                 is_terminal_output(output));
 369
 370         return move_fd(fd, nfd, false);
 371 }
 372 static int open_terminal_as(const char *path, int flags, int nfd) {
 373         int fd;
 374
 375         assert(path);
 376         assert(nfd >= 0);
 377
 378         fd = open_terminal(path, flags | O_NOCTTY);
 379         if (fd < 0)
 380                 return fd;
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384
 385 static int acquire_path(const char *path, int flags, mode_t mode) {
 386         union sockaddr_union sa = {
 387                 .sa.sa_family = AF_UNIX,
 388         };
 389         int fd, r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return fd;
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 403                 return -ENXIO;
 404
 405         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 406
 407         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 408         if (fd < 0)
 409                 return -errno;
 410
 411         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 412         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 413                 safe_close(fd);
 414                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 415                                                            * indication that his wasn't an AF_UNIX socket after all */
 416         }
 417
 418         if ((flags & O_ACCMODE) == O_RDONLY)
 419                 r = shutdown(fd, SHUT_WR);
 420         else if ((flags & O_ACCMODE) == O_WRONLY)
 421                 r = shutdown(fd, SHUT_RD);
 422         else
 423                 return fd;
 424         if (r < 0) {
 425                 safe_close(fd);
 426                 return -errno;
 427         }
 428
 429         return fd;
 430 }
 431
 432 static int fixup_input(
 433                 const ExecContext *context,
 434                 int socket_fd,
 435                 bool apply_tty_stdin) {
 436
 437         ExecInput std_input;
 438
 439         assert(context);
 440
 441         std_input = context->std_input;
 442
 443         if (is_terminal_input(std_input) && !apply_tty_stdin)
 444                 return EXEC_INPUT_NULL;
 445
 446         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 447                 return EXEC_INPUT_NULL;
 448
 449         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 450                 return EXEC_INPUT_NULL;
 451
 452         return std_input;
 453 }
 454
 455 static int fixup_output(ExecOutput std_output, int socket_fd) {
 456
 457         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 458                 return EXEC_OUTPUT_INHERIT;
 459
 460         return std_output;
 461 }
 462
 463 static int setup_input(
 464                 const ExecContext *context,
 465                 const ExecParameters *params,
 466                 int socket_fd,
 467                 int named_iofds[3]) {
 468
 469         ExecInput i;
 470
 471         assert(context);
 472         assert(params);
 473
 474         if (params->stdin_fd >= 0) {
 475                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 476                         return -errno;
 477
 478                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 479                 if (isatty(STDIN_FILENO)) {
 480                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 481                         (void) reset_terminal_fd(STDIN_FILENO, true);
 482                 }
 483
 484                 return STDIN_FILENO;
 485         }
 486
 487         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 488
 489         switch (i) {
 490
 491         case EXEC_INPUT_NULL:
 492                 return open_null_as(O_RDONLY, STDIN_FILENO);
 493
 494         case EXEC_INPUT_TTY:
 495         case EXEC_INPUT_TTY_FORCE:
 496         case EXEC_INPUT_TTY_FAIL: {
 497                 int fd;
 498
 499                 fd = acquire_terminal(exec_context_tty_path(context),
 500                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 501                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 502                                                                   ACQUIRE_TERMINAL_WAIT,
 503                                       USEC_INFINITY);
 504                 if (fd < 0)
 505                         return fd;
 506
 507                 return move_fd(fd, STDIN_FILENO, false);
 508         }
 509
 510         case EXEC_INPUT_SOCKET:
 511                 assert(socket_fd >= 0);
 512
 513                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 514
 515         case EXEC_INPUT_NAMED_FD:
 516                 assert(named_iofds[STDIN_FILENO] >= 0);
 517
 518                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 519                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 520
 521         case EXEC_INPUT_DATA: {
 522                 int fd;
 523
 524                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 525                 if (fd < 0)
 526                         return fd;
 527
 528                 return move_fd(fd, STDIN_FILENO, false);
 529         }
 530
 531         case EXEC_INPUT_FILE: {
 532                 bool rw;
 533                 int fd;
 534
 535                 assert(context->stdio_file[STDIN_FILENO]);
 536
 537                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 538                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 539
 540                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 541                 if (fd < 0)
 542                         return fd;
 543
 544                 return move_fd(fd, STDIN_FILENO, false);
 545         }
 546
 547         default:
 548                 assert_not_reached("Unknown input type");
 549         }
 550 }
 551
 552 static int setup_output(
 553                 const Unit *unit,
 554                 const ExecContext *context,
 555                 const ExecParameters *params,
 556                 int fileno,
 557                 int socket_fd,
 558                 int named_iofds[3],
 559                 const char *ident,
 560                 uid_t uid,
 561                 gid_t gid,
 562                 dev_t *journal_stream_dev,
 563                 ino_t *journal_stream_ino) {
 564
 565         ExecOutput o;
 566         ExecInput i;
 567         int r;
 568
 569         assert(unit);
 570         assert(context);
 571         assert(params);
 572         assert(ident);
 573         assert(journal_stream_dev);
 574         assert(journal_stream_ino);
 575
 576         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 577
 578                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 579                         return -errno;
 580
 581                 return STDOUT_FILENO;
 582         }
 583
 584         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 585                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 586                         return -errno;
 587
 588                 return STDERR_FILENO;
 589         }
 590
 591         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 592         o = fixup_output(context->std_output, socket_fd);
 593
 594         if (fileno == STDERR_FILENO) {
 595                 ExecOutput e;
 596                 e = fixup_output(context->std_error, socket_fd);
 597
 598                 /* This expects the input and output are already set up */
 599
 600                 /* Don't change the stderr file descriptor if we inherit all
 601                  * the way and are not on a tty */
 602                 if (e == EXEC_OUTPUT_INHERIT &&
 603                     o == EXEC_OUTPUT_INHERIT &&
 604                     i == EXEC_INPUT_NULL &&
 605                     !is_terminal_input(context->std_input) &&
 606                     getppid () != 1)
 607                         return fileno;
 608
 609                 /* Duplicate from stdout if possible */
 610                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 611                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 612
 613                 o = e;
 614
 615         } else if (o == EXEC_OUTPUT_INHERIT) {
 616                 /* If input got downgraded, inherit the original value */
 617                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 618                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 619
 620                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 621                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 622                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 623
 624                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 625                 if (getppid() != 1)
 626                         return fileno;
 627
 628                 /* We need to open /dev/null here anew, to get the right access mode. */
 629                 return open_null_as(O_WRONLY, fileno);
 630         }
 631
 632         switch (o) {
 633
 634         case EXEC_OUTPUT_NULL:
 635                 return open_null_as(O_WRONLY, fileno);
 636
 637         case EXEC_OUTPUT_TTY:
 638                 if (is_terminal_input(i))
 639                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 640
 641                 /* We don't reset the terminal if this is just about output */
 642                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 643
 644         case EXEC_OUTPUT_SYSLOG:
 645         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 646         case EXEC_OUTPUT_KMSG:
 647         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 648         case EXEC_OUTPUT_JOURNAL:
 649         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 650                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 651                 if (r < 0) {
 652                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 653                         r = open_null_as(O_WRONLY, fileno);
 654                 } else {
 655                         struct stat st;
 656
 657                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 658                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 659                          * services to detect whether they are connected to the journal or not.
 660                          *
 661                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 662                          * about STDERR as that's usually the best way to do logging. */
 663
 664                         if (fstat(fileno, &st) >= 0 &&
 665                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 666                                 *journal_stream_dev = st.st_dev;
 667                                 *journal_stream_ino = st.st_ino;
 668                         }
 669                 }
 670                 return r;
 671
 672         case EXEC_OUTPUT_SOCKET:
 673                 assert(socket_fd >= 0);
 674
 675                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 676
 677         case EXEC_OUTPUT_NAMED_FD:
 678                 assert(named_iofds[fileno] >= 0);
 679
 680                 (void) fd_nonblock(named_iofds[fileno], false);
 681                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 682
 683         case EXEC_OUTPUT_FILE: {
 684                 bool rw;
 685                 int fd;
 686
 687                 assert(context->stdio_file[fileno]);
 688
 689                 rw = context->std_input == EXEC_INPUT_FILE &&
 690                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 691
 692                 if (rw)
 693                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 694
 695                 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
 696                 if (fd < 0)
 697                         return fd;
 698
 699                 return move_fd(fd, fileno, false);
 700         }
 701
 702         default:
 703                 assert_not_reached("Unknown error type");
 704         }
 705 }
 706
 707 static int chown_terminal(int fd, uid_t uid) {
 708         struct stat st;
 709
 710         assert(fd >= 0);
 711
 712         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 713         if (isatty(fd) < 1)
 714                 return 0;
 715
 716         /* This might fail. What matters are the results. */
 717         (void) fchown(fd, uid, -1);
 718         (void) fchmod(fd, TTY_MODE);
 719
 720         if (fstat(fd, &st) < 0)
 721                 return -errno;
 722
 723         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 724                 return -EPERM;
 725
 726         return 0;
 727 }
 728
 729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 730         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 731         int r;
 732
 733         assert(_saved_stdin);
 734         assert(_saved_stdout);
 735
 736         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 737         if (saved_stdin < 0)
 738                 return -errno;
 739
 740         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 741         if (saved_stdout < 0)
 742                 return -errno;
 743
 744         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 745         if (fd < 0)
 746                 return fd;
 747
 748         r = chown_terminal(fd, getuid());
 749         if (r < 0)
 750                 return r;
 751
 752         r = reset_terminal_fd(fd, true);
 753         if (r < 0)
 754                 return r;
 755
 756         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 757         fd = -1;
 758         if (r < 0)
 759                 return r;
 760
 761         *_saved_stdin = saved_stdin;
 762         *_saved_stdout = saved_stdout;
 763
 764         saved_stdin = saved_stdout = -1;
 765
 766         return 0;
 767 }
 768
 769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 770         assert(err < 0);
 771
 772         if (err == -ETIMEDOUT)
 773                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 774         else {
 775                 errno = -err;
 776                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 777         }
 778 }
 779
 780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 781         _cleanup_close_ int fd = -1;
 782
 783         assert(vc);
 784
 785         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 786         if (fd < 0)
 787                 return;
 788
 789         write_confirm_error_fd(err, fd, u);
 790 }
 791
 792 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 793         int r = 0;
 794
 795         assert(saved_stdin);
 796         assert(saved_stdout);
 797
 798         release_terminal();
 799
 800         if (*saved_stdin >= 0)
 801                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 802                         r = -errno;
 803
 804         if (*saved_stdout >= 0)
 805                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 806                         r = -errno;
 807
 808         *saved_stdin = safe_close(*saved_stdin);
 809         *saved_stdout = safe_close(*saved_stdout);
 810
 811         return r;
 812 }
 813
 814 enum {
 815         CONFIRM_PRETEND_FAILURE = -1,
 816         CONFIRM_PRETEND_SUCCESS =  0,
 817         CONFIRM_EXECUTE = 1,
 818 };
 819
 820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 821         int saved_stdout = -1, saved_stdin = -1, r;
 822         _cleanup_free_ char *e = NULL;
 823         char c;
 824
 825         /* For any internal errors, assume a positive response. */
 826         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 827         if (r < 0) {
 828                 write_confirm_error(r, vc, u);
 829                 return CONFIRM_EXECUTE;
 830         }
 831
 832         /* confirm_spawn might have been disabled while we were sleeping. */
 833         if (manager_is_confirm_spawn_disabled(u->manager)) {
 834                 r = 1;
 835                 goto restore_stdio;
 836         }
 837
 838         e = ellipsize(cmdline, 60, 100);
 839         if (!e) {
 840                 log_oom();
 841                 r = CONFIRM_EXECUTE;
 842                 goto restore_stdio;
 843         }
 844
 845         for (;;) {
 846                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 847                 if (r < 0) {
 848                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 849                         r = CONFIRM_EXECUTE;
 850                         goto restore_stdio;
 851                 }
 852
 853                 switch (c) {
 854                 case 'c':
 855                         printf("Resuming normal execution.\n");
 856                         manager_disable_confirm_spawn();
 857                         r = 1;
 858                         break;
 859                 case 'D':
 860                         unit_dump(u, stdout, "  ");
 861                         continue; /* ask again */
 862                 case 'f':
 863                         printf("Failing execution.\n");
 864                         r = CONFIRM_PRETEND_FAILURE;
 865                         break;
 866                 case 'h':
 867                         printf("  c - continue, proceed without asking anymore\n"
 868                                "  D - dump, show the state of the unit\n"
 869                                "  f - fail, don't execute the command and pretend it failed\n"
 870                                "  h - help\n"
 871                                "  i - info, show a short summary of the unit\n"
 872                                "  j - jobs, show jobs that are in progress\n"
 873                                "  s - skip, don't execute the command and pretend it succeeded\n"
 874                                "  y - yes, execute the command\n");
 875                         continue; /* ask again */
 876                 case 'i':
 877                         printf("  Description: %s\n"
 878                                "  Unit:        %s\n"
 879                                "  Command:     %s\n",
 880                                u->id, u->description, cmdline);
 881                         continue; /* ask again */
 882                 case 'j':
 883                         manager_dump_jobs(u->manager, stdout, "  ");
 884                         continue; /* ask again */
 885                 case 'n':
 886                         /* 'n' was removed in favor of 'f'. */
 887                         printf("Didn't understand 'n', did you mean 'f'?\n");
 888                         continue; /* ask again */
 889                 case 's':
 890                         printf("Skipping execution.\n");
 891                         r = CONFIRM_PRETEND_SUCCESS;
 892                         break;
 893                 case 'y':
 894                         r = CONFIRM_EXECUTE;
 895                         break;
 896                 default:
 897                         assert_not_reached("Unhandled choice");
 898                 }
 899                 break;
 900         }
 901
 902 restore_stdio:
 903         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 904         return r;
 905 }
 906
 907 static int get_fixed_user(const ExecContext *c, const char **user,
 908                           uid_t *uid, gid_t *gid,
 909                           const char **home, const char **shell) {
 910         int r;
 911         const char *name;
 912
 913         assert(c);
 914
 915         if (!c->user)
 916                 return 0;
 917
 918         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 919          * (i.e. are "/" or "/bin/nologin"). */
 920
 921         name = c->user;
 922         r = get_user_creds_clean(&name, uid, gid, home, shell);
 923         if (r < 0)
 924                 return r;
 925
 926         *user = name;
 927         return 0;
 928 }
 929
 930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 931         int r;
 932         const char *name;
 933
 934         assert(c);
 935
 936         if (!c->group)
 937                 return 0;
 938
 939         name = c->group;
 940         r = get_group_creds(&name, gid);
 941         if (r < 0)
 942                 return r;
 943
 944         *group = name;
 945         return 0;
 946 }
 947
 948 static int get_supplementary_groups(const ExecContext *c, const char *user,
 949                                     const char *group, gid_t gid,
 950                                     gid_t **supplementary_gids, int *ngids) {
 951         char **i;
 952         int r, k = 0;
 953         int ngroups_max;
 954         bool keep_groups = false;
 955         gid_t *groups = NULL;
 956         _cleanup_free_ gid_t *l_gids = NULL;
 957
 958         assert(c);
 959
 960         /*
 961          * If user is given, then lookup GID and supplementary groups list.
 962          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 963          * here and as early as possible so we keep the list of supplementary
 964          * groups of the caller.
 965          */
 966         if (user && gid_is_valid(gid) && gid != 0) {
 967                 /* First step, initialize groups from /etc/groups */
 968                 if (initgroups(user, gid) < 0)
 969                         return -errno;
 970
 971                 keep_groups = true;
 972         }
 973
 974         if (strv_isempty(c->supplementary_groups))
 975                 return 0;
 976
 977         /*
 978          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 979          * be positive, otherwise fail.
 980          */
 981         errno = 0;
 982         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 983         if (ngroups_max <= 0) {
 984                 if (errno > 0)
 985                         return -errno;
 986                 else
 987                         return -EOPNOTSUPP; /* For all other values */
 988         }
 989
 990         l_gids = new(gid_t, ngroups_max);
 991         if (!l_gids)
 992                 return -ENOMEM;
 993
 994         if (keep_groups) {
 995                 /*
 996                  * Lookup the list of groups that the user belongs to, we
 997                  * avoid NSS lookups here too for gid=0.
 998                  */
 999                 k = ngroups_max;
1000                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001                         return -EINVAL;
1002         } else
1003                 k = 0;
1004
1005         STRV_FOREACH(i, c->supplementary_groups) {
1006                 const char *g;
1007
1008                 if (k >= ngroups_max)
1009                         return -E2BIG;
1010
1011                 g = *i;
1012                 r = get_group_creds(&g, l_gids+k);
1013                 if (r < 0)
1014                         return r;
1015
1016                 k++;
1017         }
1018
1019         /*
1020          * Sets ngids to zero to drop all supplementary groups, happens
1021          * when we are under root and SupplementaryGroups= is empty.
1022          */
1023         if (k == 0) {
1024                 *ngids = 0;
1025                 return 0;
1026         }
1027
1028         /* Otherwise get the final list of supplementary groups */
1029         groups = memdup(l_gids, sizeof(gid_t) * k);
1030         if (!groups)
1031                 return -ENOMEM;
1032
1033         *supplementary_gids = groups;
1034         *ngids = k;
1035
1036         groups = NULL;
1037
1038         return 0;
1039 }
1040
/* Applies the given group credentials to the calling process.
 *
 * If ngids is positive the supplementary group list is installed via maybe_setgroups(). Afterwards, if
 * gid is valid, the real, effective and saved GID are all set to it. Returns 0 on success or a negative
 * errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);

                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061         assert(context);
1062
1063         if (!uid_is_valid(uid))
1064                 return 0;
1065
1066         /* Sets (but doesn't look up) the uid and make sure we keep the
1067          * capabilities while doing so. */
1068
1069         if (context->capability_ambient_set != 0) {
1070
1071                 /* First step: If we need to keep capabilities but
1072                  * drop privileges we need to make sure we keep our
1073                  * caps, while we drop privileges. */
1074                 if (uid != 0) {
1075                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077                         if (prctl(PR_GET_SECUREBITS) != sb)
1078                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079                                         return -errno;
1080                 }
1081         }
1082
1083         /* Second step: actually set the uids */
1084         if (setresuid(uid, uid, uid) < 0)
1085                 return -errno;
1086
1087         /* At this point we should have all necessary capabilities but
1088            are otherwise a normal user. However, the caps might got
1089            corrupted due to the setresuid() so we need clean them up
1090            later. This is done outside of this call. */
1091
1092         return 0;
1093 }
1094
1095 #if HAVE_PAM
1096
1097 static int null_conv(
1098                 int num_msg,
1099                 const struct pam_message **msg,
1100                 struct pam_response **resp,
1101                 void *appdata_ptr) {
1102
1103         /* We don't support conversations */
1104
1105         return PAM_CONV_ERR;
1106 }
1107
1108 #endif
1109
/* Opens a PAM session for the process about to be executed and forks off a helper that closes it again.
 *
 *   name   - PAM service name, passed to pam_start()
 *   user   - user name, passed to pam_start()
 *   uid    - UID the forked "(sd-pam)" helper switches to
 *   gid    - GID the forked "(sd-pam)" helper switches to
 *   tty    - optional TTY, set as PAM_TTY on the handle when non-NULL
 *   env    - in: environment exported into PAM via pam_putenv(); out: replaced by the list returned by
 *            pam_getenvlist() on success
 *   fds    - file descriptors that are closed in the helper process
 *   n_fds  - number of entries in fds
 *
 * On the success path the result of strv_free_and_replace() is returned. On failure a negative
 * errno-style value is returned: -EPERM for any PAM-level error, otherwise the error of the failing
 * call. When built without PAM support this function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        /* Conversation structure handed to PAM; null_conv rejects every conversation request. */
        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_GET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Ask PAM modules to stay quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to pam_end() a handle we never got. */
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we have so far into the PAM handle. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        /* Fetch PAM's view of the environment; it replaces *env on success. */
        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        /* Remember our PID so the child can detect (via getppid()) whether we are still around. */
        parent_pid = getpid_cached();

        /* NOTE(review): if the fork fails we jump to "fail" with SIGTERM still blocked, as the old
         * signal mask is only restored on the success path below — confirm callers do not rely on
         * the mask after a failure. */
        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                /* ret stays EXIT_PAM unless the child reaches the "ret = 0" below. */
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives (it is blocked, hence stays pending for sigwait()). */
                        for (;;) {
                                /* NOTE(review): POSIX documents sigwait() as returning a positive error
                                 * number on failure rather than -1/errno, so this error branch may never
                                 * be taken — confirm. */
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* Release the PAM handle in the child and exit without returning to the caller. */
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM-provided environment to the caller, replacing the previous *env. */
        return strv_free_and_replace(*env, e);

fail:
        /* Shared failure path: pam_code tells us whether a PAM call or one of our own calls failed. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        /* Built without PAM support: nothing to set up. */
        return 0;
#endif
}
1315
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of path.
 * If that component is empty the name "(...)" is used. Only the trailing 8 characters of the component
 * are kept, so that the result (including the parentheses) never exceeds 10 characters. */
static void rename_process_from_path(const char *path) {
        char label[11];
        const char *base;
        size_t n;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more
                 * interesting, since the first bit might just be
                 * "systemd-" */
                base += n - 8;
                n = 8;
        }

        /* Assemble "(" + up to 8 characters + ")" + NUL, i.e. at most 11 bytes. */
        label[0] = '(';
        memcpy(label + 1, base, n);
        label[n + 1] = ')';
        label[n + 2] = 0;

        rename_process(label);
}
1346
1347 static bool context_has_address_families(const ExecContext *c) {
1348         assert(c);
1349
1350         return c->address_families_whitelist ||
1351                 !set_isempty(c->address_families);
1352 }
1353
1354 static bool context_has_syscall_filters(const ExecContext *c) {
1355         assert(c);
1356
1357         return c->syscall_whitelist ||
1358                 !hashmap_isempty(c->syscall_filter);
1359 }
1360
1361 static bool context_has_no_new_privileges(const ExecContext *c) {
1362         assert(c);
1363
1364         if (c->no_new_privileges)
1365                 return true;
1366
1367         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1368                 return false;
1369
1370         /* We need NNP if we have any form of seccomp and are unprivileged */
1371         return context_has_address_families(c) ||
1372                 c->memory_deny_write_execute ||
1373                 c->restrict_realtime ||
1374                 exec_context_restrict_namespaces_set(c) ||
1375                 c->protect_kernel_tunables ||
1376                 c->protect_kernel_modules ||
1377                 c->private_devices ||
1378                 context_has_syscall_filters(c) ||
1379                 !set_isempty(c->syscall_archs) ||
1380                 c->lock_personality;
1381 }
1382
1383 #if HAVE_SECCOMP
1384
1385 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1386
1387         if (is_seccomp_available())
1388                 return false;
1389
1390         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1391         return true;
1392 }
1393
1394 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1395         uint32_t negative_action, default_action, action;
1396         int r;
1397
1398         assert(u);
1399         assert(c);
1400
1401         if (!context_has_syscall_filters(c))
1402                 return 0;
1403
1404         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1405                 return 0;
1406
1407         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1408
1409         if (c->syscall_whitelist) {
1410                 default_action = negative_action;
1411                 action = SCMP_ACT_ALLOW;
1412         } else {
1413                 default_action = SCMP_ACT_ALLOW;
1414                 action = negative_action;
1415         }
1416
1417         if (needs_ambient_hack) {
1418                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1419                 if (r < 0)
1420                         return r;
1421         }
1422
1423         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1424 }
1425
1426 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1427         assert(u);
1428         assert(c);
1429
1430         if (set_isempty(c->syscall_archs))
1431                 return 0;
1432
1433         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1434                 return 0;
1435
1436         return seccomp_restrict_archs(c->syscall_archs);
1437 }
1438
1439 static int apply_address_families(const Unit* u, const ExecContext *c) {
1440         assert(u);
1441         assert(c);
1442
1443         if (!context_has_address_families(c))
1444                 return 0;
1445
1446         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1447                 return 0;
1448
1449         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1450 }
1451
1452 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1453         assert(u);
1454         assert(c);
1455
1456         if (!c->memory_deny_write_execute)
1457                 return 0;
1458
1459         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1460                 return 0;
1461
1462         return seccomp_memory_deny_write_execute();
1463 }
1464
1465 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1466         assert(u);
1467         assert(c);
1468
1469         if (!c->restrict_realtime)
1470                 return 0;
1471
1472         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1473                 return 0;
1474
1475         return seccomp_restrict_realtime();
1476 }
1477
1478 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1479         assert(u);
1480         assert(c);
1481
1482         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1483          * let's protect even those systems where this is left on in the kernel. */
1484
1485         if (!c->protect_kernel_tunables)
1486                 return 0;
1487
1488         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1489                 return 0;
1490
1491         return seccomp_protect_sysctl();
1492 }
1493
1494 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1495         assert(u);
1496         assert(c);
1497
1498         /* Turn off module syscalls on ProtectKernelModules=yes */
1499
1500         if (!c->protect_kernel_modules)
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1504                 return 0;
1505
1506         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1507 }
1508
1509 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1510         assert(u);
1511         assert(c);
1512
1513         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1514
1515         if (!c->private_devices)
1516                 return 0;
1517
1518         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1519                 return 0;
1520
1521         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1525         assert(u);
1526         assert(c);
1527
1528         if (!exec_context_restrict_namespaces_set(c))
1529                 return 0;
1530
1531         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1532                 return 0;
1533
1534         return seccomp_restrict_namespaces(c->restrict_namespaces);
1535 }
1536
1537 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1538         unsigned long personality;
1539         int r;
1540
1541         assert(u);
1542         assert(c);
1543
1544         if (!c->lock_personality)
1545                 return 0;
1546
1547         if (skip_seccomp_unavailable(u, "LockPersonality="))
1548                 return 0;
1549
1550         personality = c->personality;
1551
1552         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1553         if (personality == PERSONALITY_INVALID) {
1554
1555                 r = opinionated_personality(&personality);
1556                 if (r < 0)
1557                         return r;
1558         }
1559
1560         return seccomp_lock_personality(personality);
1561 }
1562
1563 #endif
1564
1565 static void do_idle_pipe_dance(int idle_pipe[4]) {
1566         assert(idle_pipe);
1567
1568         idle_pipe[1] = safe_close(idle_pipe[1]);
1569         idle_pipe[2] = safe_close(idle_pipe[2]);
1570
1571         if (idle_pipe[0] >= 0) {
1572                 int r;
1573
1574                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1575
1576                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1577                         ssize_t n;
1578
1579                         /* Signal systemd that we are bored and want to continue. */
1580                         n = write(idle_pipe[3], "x", 1);
1581                         if (n > 0)
1582                                 /* Wait for systemd to react to the signal above. */
1583                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1584                 }
1585
1586                 idle_pipe[0] = safe_close(idle_pipe[0]);
1587
1588         }
1589
1590         idle_pipe[3] = safe_close(idle_pipe[3]);
1591 }
1592
1593 static int build_environment(
1594                 const Unit *u,
1595                 const ExecContext *c,
1596                 const ExecParameters *p,
1597                 size_t n_fds,
1598                 const char *home,
1599                 const char *username,
1600                 const char *shell,
1601                 dev_t journal_stream_dev,
1602                 ino_t journal_stream_ino,
1603                 char ***ret) {
1604
1605         _cleanup_strv_free_ char **our_env = NULL;
1606         size_t n_env = 0;
1607         char *x;
1608
1609         assert(u);
1610         assert(c);
1611         assert(ret);
1612
1613         our_env = new0(char*, 14);
1614         if (!our_env)
1615                 return -ENOMEM;
1616
1617         if (n_fds > 0) {
1618                 _cleanup_free_ char *joined = NULL;
1619
1620                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1621                         return -ENOMEM;
1622                 our_env[n_env++] = x;
1623
1624                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1625                         return -ENOMEM;
1626                 our_env[n_env++] = x;
1627
1628                 joined = strv_join(p->fd_names, ":");
1629                 if (!joined)
1630                         return -ENOMEM;
1631
1632                 x = strjoin("LISTEN_FDNAMES=", joined);
1633                 if (!x)
1634                         return -ENOMEM;
1635                 our_env[n_env++] = x;
1636         }
1637
1638         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1639                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1640                         return -ENOMEM;
1641                 our_env[n_env++] = x;
1642
1643                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1644                         return -ENOMEM;
1645                 our_env[n_env++] = x;
1646         }
1647
1648         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1649          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1650          * check the database directly. */
1651         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1652                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1653                 if (!x)
1654                         return -ENOMEM;
1655                 our_env[n_env++] = x;
1656         }
1657
1658         if (home) {
1659                 x = strappend("HOME=", home);
1660                 if (!x)
1661                         return -ENOMEM;
1662                 our_env[n_env++] = x;
1663         }
1664
1665         if (username) {
1666                 x = strappend("LOGNAME=", username);
1667                 if (!x)
1668                         return -ENOMEM;
1669                 our_env[n_env++] = x;
1670
1671                 x = strappend("USER=", username);
1672                 if (!x)
1673                         return -ENOMEM;
1674                 our_env[n_env++] = x;
1675         }
1676
1677         if (shell) {
1678                 x = strappend("SHELL=", shell);
1679                 if (!x)
1680                         return -ENOMEM;
1681                 our_env[n_env++] = x;
1682         }
1683
1684         if (!sd_id128_is_null(u->invocation_id)) {
1685                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1686                         return -ENOMEM;
1687
1688                 our_env[n_env++] = x;
1689         }
1690
1691         if (exec_context_needs_term(c)) {
1692                 const char *tty_path, *term = NULL;
1693
1694                 tty_path = exec_context_tty_path(c);
1695
1696                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1697                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1698                  * passes to PID 1 ends up all the way in the console login shown. */
1699
1700                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1701                         term = getenv("TERM");
1702                 if (!term)
1703                         term = default_term_for_tty(tty_path);
1704
1705                 x = strappend("TERM=", term);
1706                 if (!x)
1707                         return -ENOMEM;
1708                 our_env[n_env++] = x;
1709         }
1710
1711         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1712                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1713                         return -ENOMEM;
1714
1715                 our_env[n_env++] = x;
1716         }
1717
1718         our_env[n_env++] = NULL;
1719         assert(n_env <= 12);
1720
1721         *ret = TAKE_PTR(our_env);
1722
1723         return 0;
1724 }
1725
1726 static int build_pass_environment(const ExecContext *c, char ***ret) {
1727         _cleanup_strv_free_ char **pass_env = NULL;
1728         size_t n_env = 0, n_bufsize = 0;
1729         char **i;
1730
1731         STRV_FOREACH(i, c->pass_environment) {
1732                 _cleanup_free_ char *x = NULL;
1733                 char *v;
1734
1735                 v = getenv(*i);
1736                 if (!v)
1737                         continue;
1738                 x = strjoin(*i, "=", v);
1739                 if (!x)
1740                         return -ENOMEM;
1741
1742                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1743                         return -ENOMEM;
1744
1745                 pass_env[n_env++] = TAKE_PTR(x);
1746                 pass_env[n_env] = NULL;
1747         }
1748
1749         *ret = TAKE_PTR(pass_env);
1750
1751         return 0;
1752 }
1753
1754 static bool exec_needs_mount_namespace(
1755                 const ExecContext *context,
1756                 const ExecParameters *params,
1757                 const ExecRuntime *runtime) {
1758
1759         assert(context);
1760         assert(params);
1761
1762         if (context->root_image)
1763                 return true;
1764
1765         if (!strv_isempty(context->read_write_paths) ||
1766             !strv_isempty(context->read_only_paths) ||
1767             !strv_isempty(context->inaccessible_paths))
1768                 return true;
1769
1770         if (context->n_bind_mounts > 0)
1771                 return true;
1772
1773         if (context->n_temporary_filesystems > 0)
1774                 return true;
1775
1776         if (context->mount_flags != 0)
1777                 return true;
1778
1779         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1780                 return true;
1781
1782         if (context->private_devices ||
1783             context->protect_system != PROTECT_SYSTEM_NO ||
1784             context->protect_home != PROTECT_HOME_NO ||
1785             context->protect_kernel_tunables ||
1786             context->protect_kernel_modules ||
1787             context->protect_control_groups)
1788                 return true;
1789
1790         if (context->mount_apivfs && (context->root_image || context->root_directory))
1791                 return true;
1792
1793         if (context->dynamic_user &&
1794             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1795              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1796              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1797                 return true;
1798
1799         return false;
1800 }
1801
1802 static int setup_private_users(uid_t uid, gid_t gid) {
1803         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1804         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1805         _cleanup_close_ int unshare_ready_fd = -1;
1806         _cleanup_(sigkill_waitp) pid_t pid = 0;
1807         uint64_t c = 1;
1808         ssize_t n;
1809         int r;
1810
1811         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1812          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1813          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1814          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1815          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1816          * continues execution normally. */
1817
1818         if (uid != 0 && uid_is_valid(uid)) {
1819                 r = asprintf(&uid_map,
1820                              "0 0 1\n"                      /* Map root → root */
1821                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1822                              uid, uid);
1823                 if (r < 0)
1824                         return -ENOMEM;
1825         } else {
1826                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1827                 if (!uid_map)
1828                         return -ENOMEM;
1829         }
1830
1831         if (gid != 0 && gid_is_valid(gid)) {
1832                 r = asprintf(&gid_map,
1833                              "0 0 1\n"                      /* Map root → root */
1834                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1835                              gid, gid);
1836                 if (r < 0)
1837                         return -ENOMEM;
1838         } else {
1839                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1840                 if (!gid_map)
1841                         return -ENOMEM;
1842         }
1843
1844         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1845          * namespace. */
1846         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1847         if (unshare_ready_fd < 0)
1848                 return -errno;
1849
1850         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1851          * failed. */
1852         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1853                 return -errno;
1854
1855         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1856         if (r < 0)
1857                 return r;
1858         if (r == 0) {
1859                 _cleanup_close_ int fd = -1;
1860                 const char *a;
1861                 pid_t ppid;
1862
1863                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1864                  * here, after the parent opened its own user namespace. */
1865
1866                 ppid = getppid();
1867                 errno_pipe[0] = safe_close(errno_pipe[0]);
1868
1869                 /* Wait until the parent unshared the user namespace */
1870                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1871                         r = -errno;
1872                         goto child_fail;
1873                 }
1874
1875                 /* Disable the setgroups() system call in the child user namespace, for good. */
1876                 a = procfs_file_alloca(ppid, "setgroups");
1877                 fd = open(a, O_WRONLY|O_CLOEXEC);
1878                 if (fd < 0) {
1879                         if (errno != ENOENT) {
1880                                 r = -errno;
1881                                 goto child_fail;
1882                         }
1883
1884                         /* If the file is missing the kernel is too old, let's continue anyway. */
1885                 } else {
1886                         if (write(fd, "deny\n", 5) < 0) {
1887                                 r = -errno;
1888                                 goto child_fail;
1889                         }
1890
1891                         fd = safe_close(fd);
1892                 }
1893
1894                 /* First write the GID map */
1895                 a = procfs_file_alloca(ppid, "gid_map");
1896                 fd = open(a, O_WRONLY|O_CLOEXEC);
1897                 if (fd < 0) {
1898                         r = -errno;
1899                         goto child_fail;
1900                 }
1901                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1902                         r = -errno;
1903                         goto child_fail;
1904                 }
1905                 fd = safe_close(fd);
1906
1907                 /* The write the UID map */
1908                 a = procfs_file_alloca(ppid, "uid_map");
1909                 fd = open(a, O_WRONLY|O_CLOEXEC);
1910                 if (fd < 0) {
1911                         r = -errno;
1912                         goto child_fail;
1913                 }
1914                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1915                         r = -errno;
1916                         goto child_fail;
1917                 }
1918
1919                 _exit(EXIT_SUCCESS);
1920
1921         child_fail:
1922                 (void) write(errno_pipe[1], &r, sizeof(r));
1923                 _exit(EXIT_FAILURE);
1924         }
1925
1926         errno_pipe[1] = safe_close(errno_pipe[1]);
1927
1928         if (unshare(CLONE_NEWUSER) < 0)
1929                 return -errno;
1930
1931         /* Let the child know that the namespace is ready now */
1932         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1933                 return -errno;
1934
1935         /* Try to read an error code from the child */
1936         n = read(errno_pipe[0], &r, sizeof(r));
1937         if (n < 0)
1938                 return -errno;
1939         if (n == sizeof(r)) { /* an error code was sent to us */
1940                 if (r < 0)
1941                         return r;
1942                 return -EIO;
1943         }
1944         if (n != 0) /* on success we should have read 0 bytes */
1945                 return -EIO;
1946
1947         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1948         pid = 0;
1949         if (r < 0)
1950                 return r;
1951         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1952                 return -EIO;
1953
1954         return 0;
1955 }
1956
1957 static int setup_exec_directory(
1958                 const ExecContext *context,
1959                 const ExecParameters *params,
1960                 uid_t uid,
1961                 gid_t gid,
1962                 ExecDirectoryType type,
1963                 int *exit_status) {
1964
1965         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1966                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1967                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1968                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1969                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1970                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1971         };
1972         char **rt;
1973         int r;
1974
1975         assert(context);
1976         assert(params);
1977         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1978         assert(exit_status);
1979
1980         if (!params->prefix[type])
1981                 return 0;
1982
1983         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1984                 if (!uid_is_valid(uid))
1985                         uid = 0;
1986                 if (!gid_is_valid(gid))
1987                         gid = 0;
1988         }
1989
1990         STRV_FOREACH(rt, context->directories[type].paths) {
1991                 _cleanup_free_ char *p = NULL, *pp = NULL;
1992
1993                 p = strjoin(params->prefix[type], "/", *rt);
1994                 if (!p) {
1995                         r = -ENOMEM;
1996                         goto fail;
1997                 }
1998
1999                 r = mkdir_parents_label(p, 0755);
2000                 if (r < 0)
2001                         goto fail;
2002
2003                 if (context->dynamic_user &&
2004                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2005                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2006
2007                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2008                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2009                          * whose UID is later on reused. To lock this down we use the same trick used by container
2010                          * managers to prohibit host users to get access to files of the same UID in containers: we
2011                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2012                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2013                          * to make this directory permeable for the service itself.
2014                          *
2015                          * Specifically: for a service which wants a special directory "foo/" we first create a
2016                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2017                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2018                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2019                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2020                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2021                          * disabling the access boundary for the service and making sure it only gets access to the
2022                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2023                          *
2024                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2025                          * owned by the service itself.
2026                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2027                          * files or sockets with other services. */
2028
2029                         private_root = strjoin(params->prefix[type], "/private");
2030                         if (!private_root) {
2031                                 r = -ENOMEM;
2032                                 goto fail;
2033                         }
2034
2035                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2036                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2037                         if (r < 0)
2038                                 goto fail;
2039
2040                         pp = strjoin(private_root, "/", *rt);
2041                         if (!pp) {
2042                                 r = -ENOMEM;
2043                                 goto fail;
2044                         }
2045
2046                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2047                         r = mkdir_parents_label(pp, 0755);
2048                         if (r < 0)
2049                                 goto fail;
2050
2051                         if (is_dir(p, false) > 0 &&
2052                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2053
2054                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2055                                  * it over. Most likely the service has been upgraded from one that didn't use
2056                                  * DynamicUser=1, to one that does. */
2057
2058                                 if (rename(p, pp) < 0) {
2059                                         r = -errno;
2060                                         goto fail;
2061                                 }
2062                         } else {
2063                                 /* Otherwise, create the actual directory for the service */
2064
2065                                 r = mkdir_label(pp, context->directories[type].mode);
2066                                 if (r < 0 && r != -EEXIST)
2067                                         goto fail;
2068                         }
2069
2070                         parent = dirname_malloc(p);
2071                         if (!parent) {
2072                                 r = -ENOMEM;
2073                                 goto fail;
2074                         }
2075
2076                         r = path_make_relative(parent, pp, &relative);
2077                         if (r < 0)
2078                                 goto fail;
2079
2080                         /* And link it up from the original place */
2081                         r = symlink_idempotent(relative, p);
2082                         if (r < 0)
2083                                 goto fail;
2084
2085                         /* Lock down the access mode */
2086                         if (chmod(pp, context->directories[type].mode) < 0) {
2087                                 r = -errno;
2088                                 goto fail;
2089                         }
2090                 } else {
2091                         r = mkdir_label(p, context->directories[type].mode);
2092                         if (r == -EEXIST)
2093                                 continue;
2094                         if (r < 0)
2095                                 goto fail;
2096                 }
2097
2098                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2099                  * a service, and shall not be writable. */
2100                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2101                         continue;
2102
2103                 /* Then, change the ownership of the whole tree, if necessary */
2104                 r = path_chown_recursive(pp ?: p, uid, gid);
2105                 if (r < 0)
2106                         goto fail;
2107         }
2108
2109         return 0;
2110
2111 fail:
2112         *exit_status = exit_status_table[type];
2113         return r;
2114 }
2115
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()). An explicitly
 * configured label wins. Otherwise, and only if SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the
 * SMACK_ATTR_EXEC label read from the command's binary is applied, falling back to the built-in default
 * label if none could be read. Returns 0 on success, a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* -ENODATA and -EOPNOTSUPP are not treated as errors here: we simply continue with the
                 * default label then. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2148
2149 static int compile_bind_mounts(
2150                 const ExecContext *context,
2151                 const ExecParameters *params,
2152                 BindMount **ret_bind_mounts,
2153                 size_t *ret_n_bind_mounts,
2154                 char ***ret_empty_directories) {
2155
2156         _cleanup_strv_free_ char **empty_directories = NULL;
2157         BindMount *bind_mounts;
2158         size_t n, h = 0, i;
2159         ExecDirectoryType t;
2160         int r;
2161
2162         assert(context);
2163         assert(params);
2164         assert(ret_bind_mounts);
2165         assert(ret_n_bind_mounts);
2166         assert(ret_empty_directories);
2167
2168         n = context->n_bind_mounts;
2169         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2170                 if (!params->prefix[t])
2171                         continue;
2172
2173                 n += strv_length(context->directories[t].paths);
2174         }
2175
2176         if (n <= 0) {
2177                 *ret_bind_mounts = NULL;
2178                 *ret_n_bind_mounts = 0;
2179                 *ret_empty_directories = NULL;
2180                 return 0;
2181         }
2182
2183         bind_mounts = new(BindMount, n);
2184         if (!bind_mounts)
2185                 return -ENOMEM;
2186
2187         for (i = 0; i < context->n_bind_mounts; i++) {
2188                 BindMount *item = context->bind_mounts + i;
2189                 char *s, *d;
2190
2191                 s = strdup(item->source);
2192                 if (!s) {
2193                         r = -ENOMEM;
2194                         goto finish;
2195                 }
2196
2197                 d = strdup(item->destination);
2198                 if (!d) {
2199                         free(s);
2200                         r = -ENOMEM;
2201                         goto finish;
2202                 }
2203
2204                 bind_mounts[h++] = (BindMount) {
2205                         .source = s,
2206                         .destination = d,
2207                         .read_only = item->read_only,
2208                         .recursive = item->recursive,
2209                         .ignore_enoent = item->ignore_enoent,
2210                 };
2211         }
2212
2213         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2214                 char **suffix;
2215
2216                 if (!params->prefix[t])
2217                         continue;
2218
2219                 if (strv_isempty(context->directories[t].paths))
2220                         continue;
2221
2222                 if (context->dynamic_user &&
2223                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2224                         char *private_root;
2225
2226                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2227                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2228                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2229
2230                         private_root = strjoin(params->prefix[t], "/private");
2231                         if (!private_root) {
2232                                 r = -ENOMEM;
2233                                 goto finish;
2234                         }
2235
2236                         r = strv_consume(&empty_directories, private_root);
2237                         if (r < 0)
2238                                 goto finish;
2239                 }
2240
2241                 STRV_FOREACH(suffix, context->directories[t].paths) {
2242                         char *s, *d;
2243
2244                         if (context->dynamic_user &&
2245                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2246                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2247                         else
2248                                 s = strjoin(params->prefix[t], "/", *suffix);
2249                         if (!s) {
2250                                 r = -ENOMEM;
2251                                 goto finish;
2252                         }
2253
2254                         d = strdup(s);
2255                         if (!d) {
2256                                 free(s);
2257                                 r = -ENOMEM;
2258                                 goto finish;
2259                         }
2260
2261                         bind_mounts[h++] = (BindMount) {
2262                                 .source = s,
2263                                 .destination = d,
2264                                 .read_only = false,
2265                                 .recursive = true,
2266                                 .ignore_enoent = false,
2267                         };
2268                 }
2269         }
2270
2271         assert(h == n);
2272
2273         *ret_bind_mounts = bind_mounts;
2274         *ret_n_bind_mounts = n;
2275         *ret_empty_directories = TAKE_PTR(empty_directories);
2276
2277         return (int) n;
2278
2279 finish:
2280         bind_mount_free_many(bind_mounts, h);
2281         return r;
2282 }
2283
2284 static int apply_mount_namespace(
2285                 const Unit *u,
2286                 const ExecCommand *command,
2287                 const ExecContext *context,
2288                 const ExecParameters *params,
2289                 const ExecRuntime *runtime) {
2290
2291         _cleanup_strv_free_ char **empty_directories = NULL;
2292         char *tmp = NULL, *var = NULL;
2293         const char *root_dir = NULL, *root_image = NULL;
2294         NamespaceInfo ns_info = {};
2295         bool needs_sandboxing;
2296         BindMount *bind_mounts = NULL;
2297         size_t n_bind_mounts = 0;
2298         int r;
2299
2300         assert(context);
2301
2302         /* The runtime struct only contains the parent of the private /tmp,
2303          * which is non-accessible to world users. Inside of it there's a /tmp
2304          * that is sticky, and that's the one we want to use here. */
2305
2306         if (context->private_tmp && runtime) {
2307                 if (runtime->tmp_dir)
2308                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2309                 if (runtime->var_tmp_dir)
2310                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2311         }
2312
2313         if (params->flags & EXEC_APPLY_CHROOT) {
2314                 root_image = context->root_image;
2315
2316                 if (!root_image)
2317                         root_dir = context->root_directory;
2318         }
2319
2320         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2321         if (r < 0)
2322                 return r;
2323
2324         /*
2325          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2326          * sandbox info, otherwise enforce it, don't ignore protected paths and
2327          * fail if we are enable to apply the sandbox inside the mount namespace.
2328          */
2329         if (!context->dynamic_user && root_dir)
2330                 ns_info.ignore_protect_paths = true;
2331
2332         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2333
2334         if (needs_sandboxing)
2335                 ns_info = (NamespaceInfo) {
2336                         .ignore_protect_paths = false,
2337                         .private_dev = context->private_devices,
2338                         .protect_control_groups = context->protect_control_groups,
2339                         .protect_kernel_tunables = context->protect_kernel_tunables,
2340                         .protect_kernel_modules = context->protect_kernel_modules,
2341                         .mount_apivfs = context->mount_apivfs,
2342                 };
2343
2344         r = setup_namespace(root_dir, root_image,
2345                             &ns_info, context->read_write_paths,
2346                             needs_sandboxing ? context->read_only_paths : NULL,
2347                             needs_sandboxing ? context->inaccessible_paths : NULL,
2348                             empty_directories,
2349                             bind_mounts,
2350                             n_bind_mounts,
2351                             context->temporary_filesystems,
2352                             context->n_temporary_filesystems,
2353                             tmp,
2354                             var,
2355                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2356                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2357                             context->mount_flags,
2358                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2359
2360         bind_mount_free_many(bind_mounts, n_bind_mounts);
2361
2362         /* If we couldn't set up the namespace this is probably due to a
2363          * missing capability. In this case, silently proceeed. */
2364         if (IN_SET(r, -EPERM, -EACCES)) {
2365                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2366                 return 0;
2367         }
2368
2369         return r;
2370 }
2371
2372 static int apply_working_directory(
2373                 const ExecContext *context,
2374                 const ExecParameters *params,
2375                 const char *home,
2376                 const bool needs_mount_ns,
2377                 int *exit_status) {
2378
2379         const char *d, *wd;
2380
2381         assert(context);
2382         assert(exit_status);
2383
2384         if (context->working_directory_home) {
2385
2386                 if (!home) {
2387                         *exit_status = EXIT_CHDIR;
2388                         return -ENXIO;
2389                 }
2390
2391                 wd = home;
2392
2393         } else if (context->working_directory)
2394                 wd = context->working_directory;
2395         else
2396                 wd = "/";
2397
2398         if (params->flags & EXEC_APPLY_CHROOT) {
2399                 if (!needs_mount_ns && context->root_directory)
2400                         if (chroot(context->root_directory) < 0) {
2401                                 *exit_status = EXIT_CHROOT;
2402                                 return -errno;
2403                         }
2404
2405                 d = wd;
2406         } else
2407                 d = prefix_roota(context->root_directory, wd);
2408
2409         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2410                 *exit_status = EXIT_CHDIR;
2411                 return -errno;
2412         }
2413
2414         return 0;
2415 }
2416
2417 static int setup_keyring(
2418                 const Unit *u,
2419                 const ExecContext *context,
2420                 const ExecParameters *p,
2421                 uid_t uid, gid_t gid) {
2422
2423         key_serial_t keyring;
2424         int r = 0;
2425         uid_t saved_uid;
2426         gid_t saved_gid;
2427
2428         assert(u);
2429         assert(context);
2430         assert(p);
2431
2432         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2433          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2434          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2435          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2436          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2437          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2438
2439         if (!(p->flags & EXEC_NEW_KEYRING))
2440                 return 0;
2441
2442         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2443                 return 0;
2444
2445         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2446          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2447          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2448          * & group is just as nasty as acquiring a reference to the user keyring. */
2449
2450         saved_uid = getuid();
2451         saved_gid = getgid();
2452
2453         if (gid_is_valid(gid) && gid != saved_gid) {
2454                 if (setregid(gid, -1) < 0)
2455                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2456         }
2457
2458         if (uid_is_valid(uid) && uid != saved_uid) {
2459                 if (setreuid(uid, -1) < 0) {
2460                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2461                         goto out;
2462                 }
2463         }
2464
2465         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2466         if (keyring == -1) {
2467                 if (errno == ENOSYS)
2468                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2469                 else if (IN_SET(errno, EACCES, EPERM))
2470                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2471                 else if (errno == EDQUOT)
2472                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2473                 else
2474                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2475
2476                 goto out;
2477         }
2478
2479         /* When requested link the user keyring into the session keyring. */
2480         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2481
2482                 if (keyctl(KEYCTL_LINK,
2483                            KEY_SPEC_USER_KEYRING,
2484                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2485                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2486                         goto out;
2487                 }
2488         }
2489
2490         /* Restore uid/gid back */
2491         if (uid_is_valid(uid) && uid != saved_uid) {
2492                 if (setreuid(saved_uid, -1) < 0) {
2493                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2494                         goto out;
2495                 }
2496         }
2497
2498         if (gid_is_valid(gid) && gid != saved_gid) {
2499                 if (setregid(saved_gid, -1) < 0)
2500                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2501         }
2502
2503         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2504         if (!sd_id128_is_null(u->invocation_id)) {
2505                 key_serial_t key;
2506
2507                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2508                 if (key == -1)
2509                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2510                 else {
2511                         if (keyctl(KEYCTL_SETPERM, key,
2512                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2513                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2514                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2515                 }
2516         }
2517
2518 out:
2519         /* Revert back uid & gid for the last time, and exit */
2520         /* no extra logging, as only the first already reported error matters */
2521         if (getuid() != saved_uid)
2522                 (void) setreuid(saved_uid, -1);
2523
2524         if (getgid() != saved_gid)
2525                 (void) setregid(saved_gid, -1);
2526
2527         return r;
2528 }
2529
/* Appends the valid (i.e. non-negative) fds of a socket pair to array[], advancing *n by one for every fd added.
 * A NULL pair is accepted and leaves array[] and *n untouched. The caller has to make sure array[] has room for
 * up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2542
2543 static int close_remaining_fds(
2544                 const ExecParameters *params,
2545                 const ExecRuntime *runtime,
2546                 const DynamicCreds *dcreds,
2547                 int user_lookup_fd,
2548                 int socket_fd,
2549                 int *fds, size_t n_fds) {
2550
2551         size_t n_dont_close = 0;
2552         int dont_close[n_fds + 12];
2553
2554         assert(params);
2555
2556         if (params->stdin_fd >= 0)
2557                 dont_close[n_dont_close++] = params->stdin_fd;
2558         if (params->stdout_fd >= 0)
2559                 dont_close[n_dont_close++] = params->stdout_fd;
2560         if (params->stderr_fd >= 0)
2561                 dont_close[n_dont_close++] = params->stderr_fd;
2562
2563         if (socket_fd >= 0)
2564                 dont_close[n_dont_close++] = socket_fd;
2565         if (n_fds > 0) {
2566                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2567                 n_dont_close += n_fds;
2568         }
2569
2570         if (runtime)
2571                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2572
2573         if (dcreds) {
2574                 if (dcreds->user)
2575                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2576                 if (dcreds->group)
2577                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2578         }
2579
2580         if (user_lookup_fd >= 0)
2581                 dont_close[n_dont_close++] = user_lookup_fd;
2582
2583         return close_all_fds(dont_close, n_dont_close);
2584 }
2585
2586 static int send_user_lookup(
2587                 Unit *unit,
2588                 int user_lookup_fd,
2589                 uid_t uid,
2590                 gid_t gid) {
2591
2592         assert(unit);
2593
2594         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2595          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2596          * specified. */
2597
2598         if (user_lookup_fd < 0)
2599                 return 0;
2600
2601         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2602                 return 0;
2603
2604         if (writev(user_lookup_fd,
2605                (struct iovec[]) {
2606                            IOVEC_INIT(&uid, sizeof(uid)),
2607                            IOVEC_INIT(&gid, sizeof(gid)),
2608                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2609                 return -errno;
2610
2611         return 0;
2612 }
2613
2614 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2615         int r;
2616
2617         assert(c);
2618         assert(home);
2619         assert(buf);
2620
2621         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2622
2623         if (*home)
2624                 return 0;
2625
2626         if (!c->working_directory_home)
2627                 return 0;
2628
2629         if (uid == 0) {
2630                 /* Hardcode /root as home directory for UID 0 */
2631                 *home = "/root";
2632                 return 1;
2633         }
2634
2635         r = get_home_dir(buf);
2636         if (r < 0)
2637                 return r;
2638
2639         *home = *buf;
2640         return 1;
2641 }
2642
2643 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2644         _cleanup_strv_free_ char ** list = NULL;
2645         ExecDirectoryType t;
2646         int r;
2647
2648         assert(c);
2649         assert(p);
2650         assert(ret);
2651
2652         assert(c->dynamic_user);
2653
2654         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2655          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2656          * directories. */
2657
2658         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2659                 char **i;
2660
2661                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2662                         continue;
2663
2664                 if (!p->prefix[t])
2665                         continue;
2666
2667                 STRV_FOREACH(i, c->directories[t].paths) {
2668                         char *e;
2669
2670                         if (t == EXEC_DIRECTORY_RUNTIME)
2671                                 e = strjoin(p->prefix[t], "/", *i);
2672                         else
2673                                 e = strjoin(p->prefix[t], "/private/", *i);
2674                         if (!e)
2675                                 return -ENOMEM;
2676
2677                         r = strv_consume(&list, e);
2678                         if (r < 0)
2679                                 return r;
2680                 }
2681         }
2682
2683         *ret = TAKE_PTR(list);
2684
2685         return 0;
2686 }
2687
2688 static char *exec_command_line(char **argv);
2689
2690 static int exec_child(
2691                 Unit *unit,
2692                 const ExecCommand *command,
2693                 const ExecContext *context,
2694                 const ExecParameters *params,
2695                 ExecRuntime *runtime,
2696                 DynamicCreds *dcreds,
2697                 char **argv,
2698                 int socket_fd,
2699                 int named_iofds[3],
2700                 int *fds,
2701                 size_t n_storage_fds,
2702                 size_t n_socket_fds,
2703                 char **files_env,
2704                 int user_lookup_fd,
2705                 int *exit_status) {
2706
2707         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2708         _cleanup_free_ char *home_buffer = NULL;
2709         _cleanup_free_ gid_t *supplementary_gids = NULL;
2710         const char *username = NULL, *groupname = NULL;
2711         const char *home = NULL, *shell = NULL;
2712         dev_t journal_stream_dev = 0;
2713         ino_t journal_stream_ino = 0;
2714         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2715                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2716                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2717                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2718 #if HAVE_SELINUX
2719         _cleanup_free_ char *mac_selinux_context_net = NULL;
2720         bool use_selinux = false;
2721 #endif
2722 #if ENABLE_SMACK
2723         bool use_smack = false;
2724 #endif
2725 #if HAVE_APPARMOR
2726         bool use_apparmor = false;
2727 #endif
2728         uid_t uid = UID_INVALID;
2729         gid_t gid = GID_INVALID;
2730         int i, r, ngids = 0;
2731         size_t n_fds;
2732         ExecDirectoryType dt;
2733         int secure_bits;
2734
2735         assert(unit);
2736         assert(command);
2737         assert(context);
2738         assert(params);
2739         assert(exit_status);
2740
2741         rename_process_from_path(command->path);
2742
2743         /* We reset exactly these signals, since they are the
2744          * only ones we set to SIG_IGN in the main daemon. All
2745          * others we leave untouched because we set them to
2746          * SIG_DFL or a valid handler initially, both of which
2747          * will be demoted to SIG_DFL. */
2748         (void) default_signals(SIGNALS_CRASH_HANDLER,
2749                                SIGNALS_IGNORE, -1);
2750
2751         if (context->ignore_sigpipe)
2752                 (void) ignore_signals(SIGPIPE, -1);
2753
2754         r = reset_signal_mask();
2755         if (r < 0) {
2756                 *exit_status = EXIT_SIGNAL_MASK;
2757                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2758         }
2759
2760         if (params->idle_pipe)
2761                 do_idle_pipe_dance(params->idle_pipe);
2762
2763         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2764          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2765          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2766          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2767
2768         log_forget_fds();
2769         log_set_open_when_needed(true);
2770
2771         /* In case anything used libc syslog(), close this here, too */
2772         closelog();
2773
2774         n_fds = n_storage_fds + n_socket_fds;
2775         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2776         if (r < 0) {
2777                 *exit_status = EXIT_FDS;
2778                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2779         }
2780
2781         if (!context->same_pgrp)
2782                 if (setsid() < 0) {
2783                         *exit_status = EXIT_SETSID;
2784                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2785                 }
2786
2787         exec_context_tty_reset(context, params);
2788
2789         if (unit_shall_confirm_spawn(unit)) {
2790                 const char *vc = params->confirm_spawn;
2791                 _cleanup_free_ char *cmdline = NULL;
2792
2793                 cmdline = exec_command_line(argv);
2794                 if (!cmdline) {
2795                         *exit_status = EXIT_MEMORY;
2796                         return log_oom();
2797                 }
2798
2799                 r = ask_for_confirmation(vc, unit, cmdline);
2800                 if (r != CONFIRM_EXECUTE) {
2801                         if (r == CONFIRM_PRETEND_SUCCESS) {
2802                                 *exit_status = EXIT_SUCCESS;
2803                                 return 0;
2804                         }
2805                         *exit_status = EXIT_CONFIRM;
2806                         log_unit_error(unit, "Execution cancelled by the user");
2807                         return -ECANCELED;
2808                 }
2809         }
2810
2811         if (context->dynamic_user && dcreds) {
2812                 _cleanup_strv_free_ char **suggested_paths = NULL;
2813
2814                 /* Make sure we bypass our own NSS module for any NSS checks */
2815                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2816                         *exit_status = EXIT_USER;
2817                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2818                 }
2819
2820                 r = compile_suggested_paths(context, params, &suggested_paths);
2821                 if (r < 0) {
2822                         *exit_status = EXIT_MEMORY;
2823                         return log_oom();
2824                 }
2825
2826                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2827                 if (r < 0) {
2828                         *exit_status = EXIT_USER;
2829                         if (r == -EILSEQ) {
2830                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2831                                 return -EOPNOTSUPP;
2832                         }
2833                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2834                 }
2835
2836                 if (!uid_is_valid(uid)) {
2837                         *exit_status = EXIT_USER;
2838                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2839                         return -ESRCH;
2840                 }
2841
2842                 if (!gid_is_valid(gid)) {
2843                         *exit_status = EXIT_USER;
2844                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2845                         return -ESRCH;
2846                 }
2847
2848                 if (dcreds->user)
2849                         username = dcreds->user->name;
2850
2851         } else {
2852                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2853                 if (r < 0) {
2854                         *exit_status = EXIT_USER;
2855                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2856                 }
2857
2858                 r = get_fixed_group(context, &groupname, &gid);
2859                 if (r < 0) {
2860                         *exit_status = EXIT_GROUP;
2861                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2862                 }
2863         }
2864
2865         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2866         r = get_supplementary_groups(context, username, groupname, gid,
2867                                      &supplementary_gids, &ngids);
2868         if (r < 0) {
2869                 *exit_status = EXIT_GROUP;
2870                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2871         }
2872
2873         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2874         if (r < 0) {
2875                 *exit_status = EXIT_USER;
2876                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2877         }
2878
2879         user_lookup_fd = safe_close(user_lookup_fd);
2880
2881         r = acquire_home(context, uid, &home, &home_buffer);
2882         if (r < 0) {
2883                 *exit_status = EXIT_CHDIR;
2884                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2885         }
2886
2887         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2888          * must make sure to drop O_NONBLOCK */
2889         if (socket_fd >= 0)
2890                 (void) fd_nonblock(socket_fd, false);
2891
2892         r = setup_input(context, params, socket_fd, named_iofds);
2893         if (r < 0) {
2894                 *exit_status = EXIT_STDIN;
2895                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2896         }
2897
2898         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2899         if (r < 0) {
2900                 *exit_status = EXIT_STDOUT;
2901                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2902         }
2903
2904         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2905         if (r < 0) {
2906                 *exit_status = EXIT_STDERR;
2907                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2908         }
2909
2910         if (params->cgroup_path) {
2911                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2912                 if (r < 0) {
2913                         *exit_status = EXIT_CGROUP;
2914                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2915                 }
2916         }
2917
2918         if (context->oom_score_adjust_set) {
2919                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2920
2921                 /* When we can't make this change due to EPERM, then
2922                  * let's silently skip over it. User namespaces
2923                  * prohibit write access to this file, and we
2924                  * shouldn't trip up over that. */
2925
2926                 sprintf(t, "%i", context->oom_score_adjust);
2927                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2928                 if (IN_SET(r, -EPERM, -EACCES))
2929                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2930                 else if (r < 0) {
2931                         *exit_status = EXIT_OOM_ADJUST;
2932                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2933                 }
2934         }
2935
2936         if (context->nice_set)
2937                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2938                         *exit_status = EXIT_NICE;
2939                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2940                 }
2941
2942         if (context->cpu_sched_set) {
2943                 struct sched_param param = {
2944                         .sched_priority = context->cpu_sched_priority,
2945                 };
2946
2947                 r = sched_setscheduler(0,
2948                                        context->cpu_sched_policy |
2949                                        (context->cpu_sched_reset_on_fork ?
2950                                         SCHED_RESET_ON_FORK : 0),
2951                                        &param);
2952                 if (r < 0) {
2953                         *exit_status = EXIT_SETSCHEDULER;
2954                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2955                 }
2956         }
2957
2958         if (context->cpuset)
2959                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2960                         *exit_status = EXIT_CPUAFFINITY;
2961                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2962                 }
2963
2964         if (context->ioprio_set)
2965                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2966                         *exit_status = EXIT_IOPRIO;
2967                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2968                 }
2969
2970         if (context->timer_slack_nsec != NSEC_INFINITY)
2971                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2972                         *exit_status = EXIT_TIMERSLACK;
2973                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2974                 }
2975
2976         if (context->personality != PERSONALITY_INVALID) {
2977                 r = safe_personality(context->personality);
2978                 if (r < 0) {
2979                         *exit_status = EXIT_PERSONALITY;
2980                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2981                 }
2982         }
2983
2984         if (context->utmp_id)
2985                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2986                                       context->tty_path,
2987                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
2988                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2989                                       USER_PROCESS,
2990                                       username);
2991
2992         if (context->user) {
2993                 r = chown_terminal(STDIN_FILENO, uid);
2994                 if (r < 0) {
2995                         *exit_status = EXIT_STDIN;
2996                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2997                 }
2998         }
2999
3000         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3001          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3002          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3003          * touch a single hierarchy too. */
3004         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3005                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3006                 if (r < 0) {
3007                         *exit_status = EXIT_CGROUP;
3008                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3009                 }
3010         }
3011
3012         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3013                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3014                 if (r < 0)
3015                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3016         }
3017
3018         r = build_environment(
3019                         unit,
3020                         context,
3021                         params,
3022                         n_fds,
3023                         home,
3024                         username,
3025                         shell,
3026                         journal_stream_dev,
3027                         journal_stream_ino,
3028                         &our_env);
3029         if (r < 0) {
3030                 *exit_status = EXIT_MEMORY;
3031                 return log_oom();
3032         }
3033
3034         r = build_pass_environment(context, &pass_env);
3035         if (r < 0) {
3036                 *exit_status = EXIT_MEMORY;
3037                 return log_oom();
3038         }
3039
3040         accum_env = strv_env_merge(5,
3041                                    params->environment,
3042                                    our_env,
3043                                    pass_env,
3044                                    context->environment,
3045                                    files_env,
3046                                    NULL);
3047         if (!accum_env) {
3048                 *exit_status = EXIT_MEMORY;
3049                 return log_oom();
3050         }
3051         accum_env = strv_env_clean(accum_env);
3052
3053         (void) umask(context->umask);
3054
3055         r = setup_keyring(unit, context, params, uid, gid);
3056         if (r < 0) {
3057                 *exit_status = EXIT_KEYRING;
3058                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3059         }
3060
3061         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3062         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3063
3064         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3065         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3066
3067         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3068         if (needs_ambient_hack)
3069                 needs_setuid = false;
3070         else
3071                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3072
3073         if (needs_sandboxing) {
3074                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3075                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3076                  * impacting our own code paths. */
3077
3078 #if HAVE_SELINUX
3079                 use_selinux = mac_selinux_use();
3080 #endif
3081 #if ENABLE_SMACK
3082                 use_smack = mac_smack_use();
3083 #endif
3084 #if HAVE_APPARMOR
3085                 use_apparmor = mac_apparmor_use();
3086 #endif
3087         }
3088
3089         if (needs_setuid) {
3090                 if (context->pam_name && username) {
3091                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3092                         if (r < 0) {
3093                                 *exit_status = EXIT_PAM;
3094                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3095                         }
3096                 }
3097         }
3098
3099         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3100                 if (ns_type_supported(NAMESPACE_NET)) {
3101                         r = setup_netns(runtime->netns_storage_socket);
3102                         if (r < 0) {
3103                                 *exit_status = EXIT_NETWORK;
3104                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3105                         }
3106                 } else
3107                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3108         }
3109
3110         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3111         if (needs_mount_namespace) {
3112                 r = apply_mount_namespace(unit, command, context, params, runtime);
3113                 if (r < 0) {
3114                         *exit_status = EXIT_NAMESPACE;
3115                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3116                 }
3117         }
3118
3119         /* Apply just after mount namespace setup */
3120         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3121         if (r < 0)
3122                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3123
3124         /* Drop groups as early as possible */
3125         if (needs_setuid) {
3126                 r = enforce_groups(gid, supplementary_gids, ngids);
3127                 if (r < 0) {
3128                         *exit_status = EXIT_GROUP;
3129                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3130                 }
3131         }
3132
3133         if (needs_sandboxing) {
3134 #if HAVE_SELINUX
3135                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3136                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3137                         if (r < 0) {
3138                                 *exit_status = EXIT_SELINUX_CONTEXT;
3139                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3140                         }
3141                 }
3142 #endif
3143
3144                 if (context->private_users) {
3145                         r = setup_private_users(uid, gid);
3146                         if (r < 0) {
3147                                 *exit_status = EXIT_USER;
3148                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3149                         }
3150                 }
3151         }
3152
3153         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3154          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3155          * was needed to upload the policy and can now be closed as well. */
3156         r = close_all_fds(fds, n_fds);
3157         if (r >= 0)
3158                 r = shift_fds(fds, n_fds);
3159         if (r >= 0)
3160                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3161         if (r < 0) {
3162                 *exit_status = EXIT_FDS;
3163                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3164         }
3165
3166         secure_bits = context->secure_bits;
3167
3168         if (needs_sandboxing) {
3169                 uint64_t bset;
3170
3171                 for (i = 0; i < _RLIMIT_MAX; i++) {
3172
3173                         if (!context->rlimit[i])
3174                                 continue;
3175
3176                         r = setrlimit_closest(i, context->rlimit[i]);
3177                         if (r < 0) {
3178                                 *exit_status = EXIT_LIMITS;
3179                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(i));
3180                         }
3181                 }
3182
3183                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3184                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3185                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3186                                 *exit_status = EXIT_LIMITS;
3187                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3188                         }
3189                 }
3190
3191 #if ENABLE_SMACK
3192                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3193                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3194                 if (use_smack) {
3195                         r = setup_smack(context, command);
3196                         if (r < 0) {
3197                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3198                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3199                         }
3200                 }
3201 #endif
3202
3203                 bset = context->capability_bounding_set;
3204                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3205                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3206                  * instead of us doing that */
3207                 if (needs_ambient_hack)
3208                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3209                                 (UINT64_C(1) << CAP_SETUID) |
3210                                 (UINT64_C(1) << CAP_SETGID);
3211
3212                 if (!cap_test_all(bset)) {
3213                         r = capability_bounding_set_drop(bset, false);
3214                         if (r < 0) {
3215                                 *exit_status = EXIT_CAPABILITIES;
3216                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3217                         }
3218                 }
3219
3220                 /* This is done before enforce_user, but ambient set
3221                  * does not survive over setresuid() if keep_caps is not set. */
3222                 if (!needs_ambient_hack &&
3223                     context->capability_ambient_set != 0) {
3224                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3225                         if (r < 0) {
3226                                 *exit_status = EXIT_CAPABILITIES;
3227                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3228                         }
3229                 }
3230         }
3231
3232         if (needs_setuid) {
3233                 if (context->user) {
3234                         r = enforce_user(context, uid);
3235                         if (r < 0) {
3236                                 *exit_status = EXIT_USER;
3237                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3238                         }
3239
3240                         if (!needs_ambient_hack &&
3241                             context->capability_ambient_set != 0) {
3242
3243                                 /* Fix the ambient capabilities after user change. */
3244                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3245                                 if (r < 0) {
3246                                         *exit_status = EXIT_CAPABILITIES;
3247                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3248                                 }
3249
3250                                 /* If we were asked to change user and ambient capabilities
3251                                  * were requested, we had to add keep-caps to the securebits
3252                                  * so that we would maintain the inherited capability set
3253                                  * through the setresuid(). Make sure that the bit is added
3254                                  * also to the context secure_bits so that we don't try to
3255                                  * drop the bit away next. */
3256
3257                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3258                         }
3259                 }
3260         }
3261
3262         if (needs_sandboxing) {
3263                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3264                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3265                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3266                  * are restricted. */
3267
3268 #if HAVE_SELINUX
3269                 if (use_selinux) {
3270                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3271
3272                         if (exec_context) {
3273                                 r = setexeccon(exec_context);
3274                                 if (r < 0) {
3275                                         *exit_status = EXIT_SELINUX_CONTEXT;
3276                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3277                                 }
3278                         }
3279                 }
3280 #endif
3281
3282 #if HAVE_APPARMOR
3283                 if (use_apparmor && context->apparmor_profile) {
3284                         r = aa_change_onexec(context->apparmor_profile);
3285                         if (r < 0 && !context->apparmor_profile_ignore) {
3286                                 *exit_status = EXIT_APPARMOR_PROFILE;
3287                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3288                         }
3289                 }
3290 #endif
3291
3292                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3293                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3294                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3295                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3296                                 *exit_status = EXIT_SECUREBITS;
3297                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3298                         }
3299
3300                 if (context_has_no_new_privileges(context))
3301                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3302                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3303                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3304                         }
3305
3306 #if HAVE_SECCOMP
3307                 r = apply_address_families(unit, context);
3308                 if (r < 0) {
3309                         *exit_status = EXIT_ADDRESS_FAMILIES;
3310                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3311                 }
3312
3313                 r = apply_memory_deny_write_execute(unit, context);
3314                 if (r < 0) {
3315                         *exit_status = EXIT_SECCOMP;
3316                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3317                 }
3318
3319                 r = apply_restrict_realtime(unit, context);
3320                 if (r < 0) {
3321                         *exit_status = EXIT_SECCOMP;
3322                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3323                 }
3324
3325                 r = apply_restrict_namespaces(unit, context);
3326                 if (r < 0) {
3327                         *exit_status = EXIT_SECCOMP;
3328                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3329                 }
3330
3331                 r = apply_protect_sysctl(unit, context);
3332                 if (r < 0) {
3333                         *exit_status = EXIT_SECCOMP;
3334                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3335                 }
3336
3337                 r = apply_protect_kernel_modules(unit, context);
3338                 if (r < 0) {
3339                         *exit_status = EXIT_SECCOMP;
3340                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3341                 }
3342
3343                 r = apply_private_devices(unit, context);
3344                 if (r < 0) {
3345                         *exit_status = EXIT_SECCOMP;
3346                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3347                 }
3348
3349                 r = apply_syscall_archs(unit, context);
3350                 if (r < 0) {
3351                         *exit_status = EXIT_SECCOMP;
3352                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3353                 }
3354
3355                 r = apply_lock_personality(unit, context);
3356                 if (r < 0) {
3357                         *exit_status = EXIT_SECCOMP;
3358                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3359                 }
3360
3361                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3362                  * by the filter as little as possible. */
3363                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3364                 if (r < 0) {
3365                         *exit_status = EXIT_SECCOMP;
3366                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3367                 }
3368 #endif
3369         }
3370
3371         if (!strv_isempty(context->unset_environment)) {
3372                 char **ee = NULL;
3373
3374                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3375                 if (!ee) {
3376                         *exit_status = EXIT_MEMORY;
3377                         return log_oom();
3378                 }
3379
3380                 strv_free_and_replace(accum_env, ee);
3381         }
3382
3383         final_argv = replace_env_argv(argv, accum_env);
3384         if (!final_argv) {
3385                 *exit_status = EXIT_MEMORY;
3386                 return log_oom();
3387         }
3388
3389         if (DEBUG_LOGGING) {
3390                 _cleanup_free_ char *line;
3391
3392                 line = exec_command_line(final_argv);
3393                 if (line) {
3394                         log_struct(LOG_DEBUG,
3395                                    "EXECUTABLE=%s", command->path,
3396                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3397                                    LOG_UNIT_ID(unit),
3398                                    LOG_UNIT_INVOCATION_ID(unit),
3399                                    NULL);
3400                 }
3401         }
3402
3403         execve(command->path, final_argv, accum_env);
3404
3405         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3406
3407                 log_struct_errno(LOG_INFO, errno,
3408                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3409                                  LOG_UNIT_ID(unit),
3410                                  LOG_UNIT_INVOCATION_ID(unit),
3411                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3412                                                   command->path),
3413                                  "EXECUTABLE=%s", command->path,
3414                                  NULL);
3415
3416                 return 0;
3417         }
3418
3419         *exit_status = EXIT_EXEC;
3420         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3421 }
3422
3423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3424 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3425
3426 int exec_spawn(Unit *unit,
3427                ExecCommand *command,
3428                const ExecContext *context,
3429                const ExecParameters *params,
3430                ExecRuntime *runtime,
3431                DynamicCreds *dcreds,
3432                pid_t *ret) {
3433
3434         _cleanup_strv_free_ char **files_env = NULL;
3435         int *fds = NULL;
3436         size_t n_storage_fds = 0, n_socket_fds = 0;
3437         _cleanup_free_ char *line = NULL;
3438         int socket_fd, r;
3439         int named_iofds[3] = { -1, -1, -1 };
3440         char **argv;
3441         pid_t pid;
3442
3443         assert(unit);
3444         assert(command);
3445         assert(context);
3446         assert(ret);
3447         assert(params);
3448         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3449
3450         if (context->std_input == EXEC_INPUT_SOCKET ||
3451             context->std_output == EXEC_OUTPUT_SOCKET ||
3452             context->std_error == EXEC_OUTPUT_SOCKET) {
3453
3454                 if (params->n_socket_fds > 1) {
3455                         log_unit_error(unit, "Got more than one socket.");
3456                         return -EINVAL;
3457                 }
3458
3459                 if (params->n_socket_fds == 0) {
3460                         log_unit_error(unit, "Got no socket.");
3461                         return -EINVAL;
3462                 }
3463
3464                 socket_fd = params->fds[0];
3465         } else {
3466                 socket_fd = -1;
3467                 fds = params->fds;
3468                 n_storage_fds = params->n_storage_fds;
3469                 n_socket_fds = params->n_socket_fds;
3470         }
3471
3472         r = exec_context_named_iofds(context, params, named_iofds);
3473         if (r < 0)
3474                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3475
3476         r = exec_context_load_environment(unit, context, &files_env);
3477         if (r < 0)
3478                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3479
3480         argv = params->argv ?: command->argv;
3481         line = exec_command_line(argv);
3482         if (!line)
3483                 return log_oom();
3484
3485         log_struct(LOG_DEBUG,
3486                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3487                    "EXECUTABLE=%s", command->path,
3488                    LOG_UNIT_ID(unit),
3489                    LOG_UNIT_INVOCATION_ID(unit),
3490                    NULL);
3491
3492         pid = fork();
3493         if (pid < 0)
3494                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3495
3496         if (pid == 0) {
3497                 int exit_status = EXIT_SUCCESS;
3498
3499                 r = exec_child(unit,
3500                                command,
3501                                context,
3502                                params,
3503                                runtime,
3504                                dcreds,
3505                                argv,
3506                                socket_fd,
3507                                named_iofds,
3508                                fds,
3509                                n_storage_fds,
3510                                n_socket_fds,
3511                                files_env,
3512                                unit->manager->user_lookup_fds[1],
3513                                &exit_status);
3514
3515                 if (r < 0) {
3516                         log_struct_errno(LOG_ERR, r,
3517                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3518                                          LOG_UNIT_ID(unit),
3519                                          LOG_UNIT_INVOCATION_ID(unit),
3520                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3521                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3522                                                           command->path),
3523                                          "EXECUTABLE=%s", command->path,
3524                                          NULL);
3525                 }
3526
3527                 _exit(exit_status);
3528         }
3529
3530         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3531
3532         /* We add the new process to the cgroup both in the child (so
3533          * that we can be sure that no user code is ever executed
3534          * outside of the cgroup) and in the parent (so that we can be
3535          * sure that when we kill the cgroup the process will be
3536          * killed too). */
3537         if (params->cgroup_path)
3538                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3539
3540         exec_status_start(&command->exec_status, pid);
3541
3542         *ret = pid;
3543         return 0;
3544 }
3545
3546 void exec_context_init(ExecContext *c) {
3547         ExecDirectoryType i;
3548
3549         assert(c);
3550
3551         c->umask = 0022;
3552         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3553         c->cpu_sched_policy = SCHED_OTHER;
3554         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3555         c->syslog_level_prefix = true;
3556         c->ignore_sigpipe = true;
3557         c->timer_slack_nsec = NSEC_INFINITY;
3558         c->personality = PERSONALITY_INVALID;
3559         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3560                 c->directories[i].mode = 0755;
3561         c->capability_bounding_set = CAP_ALL;
3562         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3563         c->log_level_max = -1;
3564 }
3565
3566 void exec_context_done(ExecContext *c) {
3567         ExecDirectoryType i;
3568         size_t l;
3569
3570         assert(c);
3571
3572         c->environment = strv_free(c->environment);
3573         c->environment_files = strv_free(c->environment_files);
3574         c->pass_environment = strv_free(c->pass_environment);
3575         c->unset_environment = strv_free(c->unset_environment);
3576
3577         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3578                 c->rlimit[l] = mfree(c->rlimit[l]);
3579
3580         for (l = 0; l < 3; l++) {
3581                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3582                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3583         }
3584
3585         c->working_directory = mfree(c->working_directory);
3586         c->root_directory = mfree(c->root_directory);
3587         c->root_image = mfree(c->root_image);
3588         c->tty_path = mfree(c->tty_path);
3589         c->syslog_identifier = mfree(c->syslog_identifier);
3590         c->user = mfree(c->user);
3591         c->group = mfree(c->group);
3592
3593         c->supplementary_groups = strv_free(c->supplementary_groups);
3594
3595         c->pam_name = mfree(c->pam_name);
3596
3597         c->read_only_paths = strv_free(c->read_only_paths);
3598         c->read_write_paths = strv_free(c->read_write_paths);
3599         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3600
3601         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3602         c->bind_mounts = NULL;
3603         c->n_bind_mounts = 0;
3604         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3605         c->temporary_filesystems = NULL;
3606         c->n_temporary_filesystems = 0;
3607
3608         c->cpuset = cpu_set_mfree(c->cpuset);
3609
3610         c->utmp_id = mfree(c->utmp_id);
3611         c->selinux_context = mfree(c->selinux_context);
3612         c->apparmor_profile = mfree(c->apparmor_profile);
3613         c->smack_process_label = mfree(c->smack_process_label);
3614
3615         c->syscall_filter = hashmap_free(c->syscall_filter);
3616         c->syscall_archs = set_free(c->syscall_archs);
3617         c->address_families = set_free(c->address_families);
3618
3619         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3620                 c->directories[i].paths = strv_free(c->directories[i].paths);
3621
3622         c->log_level_max = -1;
3623
3624         exec_context_free_log_extra_fields(c);
3625
3626         c->stdin_data = mfree(c->stdin_data);
3627         c->stdin_data_size = 0;
3628 }
3629
3630 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3631         char **i;
3632
3633         assert(c);
3634
3635         if (!runtime_prefix)
3636                 return 0;
3637
3638         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3639                 _cleanup_free_ char *p;
3640
3641                 p = strjoin(runtime_prefix, "/", *i);
3642                 if (!p)
3643                         return -ENOMEM;
3644
3645                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3646                  * next. */
3647                 (void) rm_rf(p, REMOVE_ROOT);
3648         }
3649
3650         return 0;
3651 }
3652
3653 static void exec_command_done(ExecCommand *c) {
3654         assert(c);
3655
3656         c->path = mfree(c->path);
3657
3658         c->argv = strv_free(c->argv);
3659 }
3660
3661 void exec_command_done_array(ExecCommand *c, size_t n) {
3662         size_t i;
3663
3664         for (i = 0; i < n; i++)
3665                 exec_command_done(c+i);
3666 }
3667
3668 ExecCommand* exec_command_free_list(ExecCommand *c) {
3669         ExecCommand *i;
3670
3671         while ((i = c)) {
3672                 LIST_REMOVE(command, c, i);
3673                 exec_command_done(i);
3674                 free(i);
3675         }
3676
3677         return NULL;
3678 }
3679
3680 void exec_command_free_array(ExecCommand **c, size_t n) {
3681         size_t i;
3682
3683         for (i = 0; i < n; i++)
3684                 c[i] = exec_command_free_list(c[i]);
3685 }
3686
3687 typedef struct InvalidEnvInfo {
3688         const Unit *unit;
3689         const char *path;
3690 } InvalidEnvInfo;
3691
3692 static void invalid_env(const char *p, void *userdata) {
3693         InvalidEnvInfo *info = userdata;
3694
3695         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3696 }
3697
3698 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3699         assert(c);
3700
3701         switch (fd_index) {
3702
3703         case STDIN_FILENO:
3704                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3705                         return NULL;
3706
3707                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3708
3709         case STDOUT_FILENO:
3710                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3711                         return NULL;
3712
3713                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3714
3715         case STDERR_FILENO:
3716                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3717                         return NULL;
3718
3719                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3720
3721         default:
3722                 return NULL;
3723         }
3724 }
3725
3726 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3727         size_t i, targets;
3728         const char* stdio_fdname[3];
3729         size_t n_fds;
3730
3731         assert(c);
3732         assert(p);
3733
3734         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3735                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3736                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3737
3738         for (i = 0; i < 3; i++)
3739                 stdio_fdname[i] = exec_context_fdname(c, i);
3740
3741         n_fds = p->n_storage_fds + p->n_socket_fds;
3742
3743         for (i = 0; i < n_fds  && targets > 0; i++)
3744                 if (named_iofds[STDIN_FILENO] < 0 &&
3745                     c->std_input == EXEC_INPUT_NAMED_FD &&
3746                     stdio_fdname[STDIN_FILENO] &&
3747                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3748
3749                         named_iofds[STDIN_FILENO] = p->fds[i];
3750                         targets--;
3751
3752                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3753                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3754                            stdio_fdname[STDOUT_FILENO] &&
3755                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3756
3757                         named_iofds[STDOUT_FILENO] = p->fds[i];
3758                         targets--;
3759
3760                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3761                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3762                            stdio_fdname[STDERR_FILENO] &&
3763                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3764
3765                         named_iofds[STDERR_FILENO] = p->fds[i];
3766                         targets--;
3767                 }
3768
3769         return targets == 0 ? 0 : -ENOENT;
3770 }
3771
3772 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3773         char **i, **r = NULL;
3774
3775         assert(c);
3776         assert(l);
3777
3778         STRV_FOREACH(i, c->environment_files) {
3779                 char *fn;
3780                 int k;
3781                 unsigned n;
3782                 bool ignore = false;
3783                 char **p;
3784                 _cleanup_globfree_ glob_t pglob = {};
3785
3786                 fn = *i;
3787
3788                 if (fn[0] == '-') {
3789                         ignore = true;
3790                         fn++;
3791                 }
3792
3793                 if (!path_is_absolute(fn)) {
3794                         if (ignore)
3795                                 continue;
3796
3797                         strv_free(r);
3798                         return -EINVAL;
3799                 }
3800
3801                 /* Filename supports globbing, take all matching files */
3802                 k = safe_glob(fn, 0, &pglob);
3803                 if (k < 0) {
3804                         if (ignore)
3805                                 continue;
3806
3807                         strv_free(r);
3808                         return k;
3809                 }
3810
3811                 /* When we don't match anything, -ENOENT should be returned */
3812                 assert(pglob.gl_pathc > 0);
3813
3814                 for (n = 0; n < pglob.gl_pathc; n++) {
3815                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3816                         if (k < 0) {
3817                                 if (ignore)
3818                                         continue;
3819
3820                                 strv_free(r);
3821                                 return k;
3822                         }
3823                         /* Log invalid environment variables with filename */
3824                         if (p) {
3825                                 InvalidEnvInfo info = {
3826                                         .unit = unit,
3827                                         .path = pglob.gl_pathv[n]
3828                                 };
3829
3830                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3831                         }
3832
3833                         if (!r)
3834                                 r = p;
3835                         else {
3836                                 char **m;
3837
3838                                 m = strv_env_merge(2, r, p);
3839                                 strv_free(r);
3840                                 strv_free(p);
3841                                 if (!m)
3842                                         return -ENOMEM;
3843
3844                                 r = m;
3845                         }
3846                 }
3847         }
3848
3849         *l = r;
3850
3851         return 0;
3852 }
3853
3854 static bool tty_may_match_dev_console(const char *tty) {
3855         _cleanup_free_ char *resolved = NULL;
3856
3857         if (!tty)
3858                 return true;
3859
3860         tty = skip_dev_prefix(tty);
3861
3862         /* trivial identity? */
3863         if (streq(tty, "console"))
3864                 return true;
3865
3866         if (resolve_dev_console(&resolved) < 0)
3867                 return true; /* if we could not resolve, assume it may */
3868
3869         /* "tty0" means the active VC, so it may be the same sometimes */
3870         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3871 }
3872
3873 bool exec_context_may_touch_console(const ExecContext *ec) {
3874
3875         return (ec->tty_reset ||
3876                 ec->tty_vhangup ||
3877                 ec->tty_vt_disallocate ||
3878                 is_terminal_input(ec->std_input) ||
3879                 is_terminal_output(ec->std_output) ||
3880                 is_terminal_output(ec->std_error)) &&
3881                tty_may_match_dev_console(exec_context_tty_path(ec));
3882 }
3883
3884 static void strv_fprintf(FILE *f, char **l) {
3885         char **g;
3886
3887         assert(f);
3888
3889         STRV_FOREACH(g, l)
3890                 fprintf(f, " %s", *g);
3891 }
3892
3893 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3894         ExecDirectoryType dt;
3895         char **e, **d;
3896         unsigned i;
3897         int r;
3898
3899         assert(c);
3900         assert(f);
3901
3902         prefix = strempty(prefix);
3903
3904         fprintf(f,
3905                 "%sUMask: %04o\n"
3906                 "%sWorkingDirectory: %s\n"
3907                 "%sRootDirectory: %s\n"
3908                 "%sNonBlocking: %s\n"
3909                 "%sPrivateTmp: %s\n"
3910                 "%sPrivateDevices: %s\n"
3911                 "%sProtectKernelTunables: %s\n"
3912                 "%sProtectKernelModules: %s\n"
3913                 "%sProtectControlGroups: %s\n"
3914                 "%sPrivateNetwork: %s\n"
3915                 "%sPrivateUsers: %s\n"
3916                 "%sProtectHome: %s\n"
3917                 "%sProtectSystem: %s\n"
3918                 "%sMountAPIVFS: %s\n"
3919                 "%sIgnoreSIGPIPE: %s\n"
3920                 "%sMemoryDenyWriteExecute: %s\n"
3921                 "%sRestrictRealtime: %s\n"
3922                 "%sKeyringMode: %s\n",
3923                 prefix, c->umask,
3924                 prefix, c->working_directory ? c->working_directory : "/",
3925                 prefix, c->root_directory ? c->root_directory : "/",
3926                 prefix, yes_no(c->non_blocking),
3927                 prefix, yes_no(c->private_tmp),
3928                 prefix, yes_no(c->private_devices),
3929                 prefix, yes_no(c->protect_kernel_tunables),
3930                 prefix, yes_no(c->protect_kernel_modules),
3931                 prefix, yes_no(c->protect_control_groups),
3932                 prefix, yes_no(c->private_network),
3933                 prefix, yes_no(c->private_users),
3934                 prefix, protect_home_to_string(c->protect_home),
3935                 prefix, protect_system_to_string(c->protect_system),
3936                 prefix, yes_no(c->mount_apivfs),
3937                 prefix, yes_no(c->ignore_sigpipe),
3938                 prefix, yes_no(c->memory_deny_write_execute),
3939                 prefix, yes_no(c->restrict_realtime),
3940                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3941
3942         if (c->root_image)
3943                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3944
3945         STRV_FOREACH(e, c->environment)
3946                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3947
3948         STRV_FOREACH(e, c->environment_files)
3949                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3950
3951         STRV_FOREACH(e, c->pass_environment)
3952                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3953
3954         STRV_FOREACH(e, c->unset_environment)
3955                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3956
3957         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3958
3959         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3960                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3961
3962                 STRV_FOREACH(d, c->directories[dt].paths)
3963                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3964         }
3965
3966         if (c->nice_set)
3967                 fprintf(f,
3968                         "%sNice: %i\n",
3969                         prefix, c->nice);
3970
3971         if (c->oom_score_adjust_set)
3972                 fprintf(f,
3973                         "%sOOMScoreAdjust: %i\n",
3974                         prefix, c->oom_score_adjust);
3975
3976         for (i = 0; i < RLIM_NLIMITS; i++)
3977                 if (c->rlimit[i]) {
3978                         fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
3979                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3980                         fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
3981                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3982                 }
3983
3984         if (c->ioprio_set) {
3985                 _cleanup_free_ char *class_str = NULL;
3986
3987                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3988                 if (r >= 0)
3989                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3990
3991                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3992         }
3993
3994         if (c->cpu_sched_set) {
3995                 _cleanup_free_ char *policy_str = NULL;
3996
3997                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3998                 if (r >= 0)
3999                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4000
4001                 fprintf(f,
4002                         "%sCPUSchedulingPriority: %i\n"
4003                         "%sCPUSchedulingResetOnFork: %s\n",
4004                         prefix, c->cpu_sched_priority,
4005                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4006         }
4007
4008         if (c->cpuset) {
4009                 fprintf(f, "%sCPUAffinity:", prefix);
4010                 for (i = 0; i < c->cpuset_ncpus; i++)
4011                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4012                                 fprintf(f, " %u", i);
4013                 fputs("\n", f);
4014         }
4015
4016         if (c->timer_slack_nsec != NSEC_INFINITY)
4017                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4018
4019         fprintf(f,
4020                 "%sStandardInput: %s\n"
4021                 "%sStandardOutput: %s\n"
4022                 "%sStandardError: %s\n",
4023                 prefix, exec_input_to_string(c->std_input),
4024                 prefix, exec_output_to_string(c->std_output),
4025                 prefix, exec_output_to_string(c->std_error));
4026
4027         if (c->std_input == EXEC_INPUT_NAMED_FD)
4028                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4029         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4030                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4031         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4032                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4033
4034         if (c->std_input == EXEC_INPUT_FILE)
4035                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4036         if (c->std_output == EXEC_OUTPUT_FILE)
4037                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4038         if (c->std_error == EXEC_OUTPUT_FILE)
4039                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4040
4041         if (c->tty_path)
4042                 fprintf(f,
4043                         "%sTTYPath: %s\n"
4044                         "%sTTYReset: %s\n"
4045                         "%sTTYVHangup: %s\n"
4046                         "%sTTYVTDisallocate: %s\n",
4047                         prefix, c->tty_path,
4048                         prefix, yes_no(c->tty_reset),
4049                         prefix, yes_no(c->tty_vhangup),
4050                         prefix, yes_no(c->tty_vt_disallocate));
4051
4052         if (IN_SET(c->std_output,
4053                    EXEC_OUTPUT_SYSLOG,
4054                    EXEC_OUTPUT_KMSG,
4055                    EXEC_OUTPUT_JOURNAL,
4056                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4057                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4058                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4059             IN_SET(c->std_error,
4060                    EXEC_OUTPUT_SYSLOG,
4061                    EXEC_OUTPUT_KMSG,
4062                    EXEC_OUTPUT_JOURNAL,
4063                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4064                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4065                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4066
4067                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4068
4069                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4070                 if (r >= 0)
4071                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4072
4073                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4074                 if (r >= 0)
4075                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4076         }
4077
4078         if (c->log_level_max >= 0) {
4079                 _cleanup_free_ char *t = NULL;
4080
4081                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4082
4083                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4084         }
4085
4086         if (c->n_log_extra_fields > 0) {
4087                 size_t j;
4088
4089                 for (j = 0; j < c->n_log_extra_fields; j++) {
4090                         fprintf(f, "%sLogExtraFields: ", prefix);
4091                         fwrite(c->log_extra_fields[j].iov_base,
4092                                1, c->log_extra_fields[j].iov_len,
4093                                f);
4094                         fputc('\n', f);
4095                 }
4096         }
4097
4098         if (c->secure_bits) {
4099                 _cleanup_free_ char *str = NULL;
4100
4101                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4102                 if (r >= 0)
4103                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4104         }
4105
4106         if (c->capability_bounding_set != CAP_ALL) {
4107                 _cleanup_free_ char *str = NULL;
4108
4109                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4110                 if (r >= 0)
4111                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4112         }
4113
4114         if (c->capability_ambient_set != 0) {
4115                 _cleanup_free_ char *str = NULL;
4116
4117                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4118                 if (r >= 0)
4119                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4120         }
4121
4122         if (c->user)
4123                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4124         if (c->group)
4125                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4126
4127         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4128
4129         if (!strv_isempty(c->supplementary_groups)) {
4130                 fprintf(f, "%sSupplementaryGroups:", prefix);
4131                 strv_fprintf(f, c->supplementary_groups);
4132                 fputs("\n", f);
4133         }
4134
4135         if (c->pam_name)
4136                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4137
4138         if (!strv_isempty(c->read_write_paths)) {
4139                 fprintf(f, "%sReadWritePaths:", prefix);
4140                 strv_fprintf(f, c->read_write_paths);
4141                 fputs("\n", f);
4142         }
4143
4144         if (!strv_isempty(c->read_only_paths)) {
4145                 fprintf(f, "%sReadOnlyPaths:", prefix);
4146                 strv_fprintf(f, c->read_only_paths);
4147                 fputs("\n", f);
4148         }
4149
4150         if (!strv_isempty(c->inaccessible_paths)) {
4151                 fprintf(f, "%sInaccessiblePaths:", prefix);
4152                 strv_fprintf(f, c->inaccessible_paths);
4153                 fputs("\n", f);
4154         }
4155
4156         if (c->n_bind_mounts > 0)
4157                 for (i = 0; i < c->n_bind_mounts; i++)
4158                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4159                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4160                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4161                                 c->bind_mounts[i].source,
4162                                 c->bind_mounts[i].destination,
4163                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4164
4165         if (c->n_temporary_filesystems > 0)
4166                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4167                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4168
4169                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4170                                 t->path,
4171                                 isempty(t->options) ? "" : ":",
4172                                 strempty(t->options));
4173                 }
4174
4175         if (c->utmp_id)
4176                 fprintf(f,
4177                         "%sUtmpIdentifier: %s\n",
4178                         prefix, c->utmp_id);
4179
4180         if (c->selinux_context)
4181                 fprintf(f,
4182                         "%sSELinuxContext: %s%s\n",
4183                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4184
4185         if (c->apparmor_profile)
4186                 fprintf(f,
4187                         "%sAppArmorProfile: %s%s\n",
4188                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4189
4190         if (c->smack_process_label)
4191                 fprintf(f,
4192                         "%sSmackProcessLabel: %s%s\n",
4193                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4194
4195         if (c->personality != PERSONALITY_INVALID)
4196                 fprintf(f,
4197                         "%sPersonality: %s\n",
4198                         prefix, strna(personality_to_string(c->personality)));
4199
4200         fprintf(f,
4201                 "%sLockPersonality: %s\n",
4202                 prefix, yes_no(c->lock_personality));
4203
4204         if (c->syscall_filter) {
4205 #if HAVE_SECCOMP
4206                 Iterator j;
4207                 void *id, *val;
4208                 bool first = true;
4209 #endif
4210
4211                 fprintf(f,
4212                         "%sSystemCallFilter: ",
4213                         prefix);
4214
4215                 if (!c->syscall_whitelist)
4216                         fputc('~', f);
4217
4218 #if HAVE_SECCOMP
4219                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4220                         _cleanup_free_ char *name = NULL;
4221                         const char *errno_name = NULL;
4222                         int num = PTR_TO_INT(val);
4223
4224                         if (first)
4225                                 first = false;
4226                         else
4227                                 fputc(' ', f);
4228
4229                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4230                         fputs(strna(name), f);
4231
4232                         if (num >= 0) {
4233                                 errno_name = errno_to_name(num);
4234                                 if (errno_name)
4235                                         fprintf(f, ":%s", errno_name);
4236                                 else
4237                                         fprintf(f, ":%d", num);
4238                         }
4239                 }
4240 #endif
4241
4242                 fputc('\n', f);
4243         }
4244
4245         if (c->syscall_archs) {
4246 #if HAVE_SECCOMP
4247                 Iterator j;
4248                 void *id;
4249 #endif
4250
4251                 fprintf(f,
4252                         "%sSystemCallArchitectures:",
4253                         prefix);
4254
4255 #if HAVE_SECCOMP
4256                 SET_FOREACH(id, c->syscall_archs, j)
4257                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4258 #endif
4259                 fputc('\n', f);
4260         }
4261
4262         if (exec_context_restrict_namespaces_set(c)) {
4263                 _cleanup_free_ char *s = NULL;
4264
4265                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4266                 if (r >= 0)
4267                         fprintf(f, "%sRestrictNamespaces: %s\n",
4268                                 prefix, s);
4269         }
4270
4271         if (c->syscall_errno > 0) {
4272                 const char *errno_name;
4273
4274                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4275
4276                 errno_name = errno_to_name(c->syscall_errno);
4277                 if (errno_name)
4278                         fprintf(f, "%s\n", errno_name);
4279                 else
4280                         fprintf(f, "%d\n", c->syscall_errno);
4281         }
4282
4283         if (c->apparmor_profile)
4284                 fprintf(f,
4285                         "%sAppArmorProfile: %s%s\n",
4286                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4287 }
4288
4289 bool exec_context_maintains_privileges(const ExecContext *c) {
4290         assert(c);
4291
4292         /* Returns true if the process forked off would run under
4293          * an unchanged UID or as root. */
4294
4295         if (!c->user)
4296                 return true;
4297
4298         if (streq(c->user, "root") || streq(c->user, "0"))
4299                 return true;
4300
4301         return false;
4302 }
4303
4304 int exec_context_get_effective_ioprio(const ExecContext *c) {
4305         int p;
4306
4307         assert(c);
4308
4309         if (c->ioprio_set)
4310                 return c->ioprio;
4311
4312         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4313         if (p < 0)
4314                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4315
4316         return p;
4317 }
4318
4319 void exec_context_free_log_extra_fields(ExecContext *c) {
4320         size_t l;
4321
4322         assert(c);
4323
4324         for (l = 0; l < c->n_log_extra_fields; l++)
4325                 free(c->log_extra_fields[l].iov_base);
4326         c->log_extra_fields = mfree(c->log_extra_fields);
4327         c->n_log_extra_fields = 0;
4328 }
4329
4330 void exec_status_start(ExecStatus *s, pid_t pid) {
4331         assert(s);
4332
4333         zero(*s);
4334         s->pid = pid;
4335         dual_timestamp_get(&s->start_timestamp);
4336 }
4337
4338 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4339         assert(s);
4340
4341         if (s->pid && s->pid != pid)
4342                 zero(*s);
4343
4344         s->pid = pid;
4345         dual_timestamp_get(&s->exit_timestamp);
4346
4347         s->code = code;
4348         s->status = status;
4349
4350         if (context) {
4351                 if (context->utmp_id)
4352                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4353
4354                 exec_context_tty_reset(context, NULL);
4355         }
4356 }
4357
4358 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4359         char buf[FORMAT_TIMESTAMP_MAX];
4360
4361         assert(s);
4362         assert(f);
4363
4364         if (s->pid <= 0)
4365                 return;
4366
4367         prefix = strempty(prefix);
4368
4369         fprintf(f,
4370                 "%sPID: "PID_FMT"\n",
4371                 prefix, s->pid);
4372
4373         if (dual_timestamp_is_set(&s->start_timestamp))
4374                 fprintf(f,
4375                         "%sStart Timestamp: %s\n",
4376                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4377
4378         if (dual_timestamp_is_set(&s->exit_timestamp))
4379                 fprintf(f,
4380                         "%sExit Timestamp: %s\n"
4381                         "%sExit Code: %s\n"
4382                         "%sExit Status: %i\n",
4383                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4384                         prefix, sigchld_code_to_string(s->code),
4385                         prefix, s->status);
4386 }
4387
4388 static char *exec_command_line(char **argv) {
4389         size_t k;
4390         char *n, *p, **a;
4391         bool first = true;
4392
4393         assert(argv);
4394
4395         k = 1;
4396         STRV_FOREACH(a, argv)
4397                 k += strlen(*a)+3;
4398
4399         n = new(char, k);
4400         if (!n)
4401                 return NULL;
4402
4403         p = n;
4404         STRV_FOREACH(a, argv) {
4405
4406                 if (!first)
4407                         *(p++) = ' ';
4408                 else
4409                         first = false;
4410
4411                 if (strpbrk(*a, WHITESPACE)) {
4412                         *(p++) = '\'';
4413                         p = stpcpy(p, *a);
4414                         *(p++) = '\'';
4415                 } else
4416                         p = stpcpy(p, *a);
4417
4418         }
4419
4420         *p = 0;
4421
4422         /* FIXME: this doesn't really handle arguments that have
4423          * spaces and ticks in them */
4424
4425         return n;
4426 }
4427
4428 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4429         _cleanup_free_ char *cmd = NULL;
4430         const char *prefix2;
4431
4432         assert(c);
4433         assert(f);
4434
4435         prefix = strempty(prefix);
4436         prefix2 = strjoina(prefix, "\t");
4437
4438         cmd = exec_command_line(c->argv);
4439         fprintf(f,
4440                 "%sCommand Line: %s\n",
4441                 prefix, cmd ? cmd : strerror(ENOMEM));
4442
4443         exec_status_dump(&c->exec_status, f, prefix2);
4444 }
4445
4446 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4447         assert(f);
4448
4449         prefix = strempty(prefix);
4450
4451         LIST_FOREACH(command, c, c)
4452                 exec_command_dump(c, f, prefix);
4453 }
4454
4455 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4456         ExecCommand *end;
4457
4458         assert(l);
4459         assert(e);
4460
4461         if (*l) {
4462                 /* It's kind of important, that we keep the order here */
4463                 LIST_FIND_TAIL(command, *l, end);
4464                 LIST_INSERT_AFTER(command, *l, end, e);
4465         } else
4466               *l = e;
4467 }
4468
4469 int exec_command_set(ExecCommand *c, const char *path, ...) {
4470         va_list ap;
4471         char **l, *p;
4472
4473         assert(c);
4474         assert(path);
4475
4476         va_start(ap, path);
4477         l = strv_new_ap(path, ap);
4478         va_end(ap);
4479
4480         if (!l)
4481                 return -ENOMEM;
4482
4483         p = strdup(path);
4484         if (!p) {
4485                 strv_free(l);
4486                 return -ENOMEM;
4487         }
4488
4489         free(c->path);
4490         c->path = p;
4491
4492         return strv_free_and_replace(c->argv, l);
4493 }
4494
4495 int exec_command_append(ExecCommand *c, const char *path, ...) {
4496         _cleanup_strv_free_ char **l = NULL;
4497         va_list ap;
4498         int r;
4499
4500         assert(c);
4501         assert(path);
4502
4503         va_start(ap, path);
4504         l = strv_new_ap(path, ap);
4505         va_end(ap);
4506
4507         if (!l)
4508                 return -ENOMEM;
4509
4510         r = strv_extend_strv(&c->argv, l, false);
4511         if (r < 0)
4512                 return r;
4513
4514         return 0;
4515 }
4516
4517 static void *remove_tmpdir_thread(void *p) {
4518         _cleanup_free_ char *path = p;
4519
4520         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4521         return NULL;
4522 }
4523
4524 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4525         int r;
4526
4527         if (!rt)
4528                 return NULL;
4529
4530         if (rt->manager)
4531                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4532
4533         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4534         if (destroy && rt->tmp_dir) {
4535                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4536
4537                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4538                 if (r < 0) {
4539                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4540                         free(rt->tmp_dir);
4541                 }
4542
4543                 rt->tmp_dir = NULL;
4544         }
4545
4546         if (destroy && rt->var_tmp_dir) {
4547                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4548
4549                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4550                 if (r < 0) {
4551                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4552                         free(rt->var_tmp_dir);
4553                 }
4554
4555                 rt->var_tmp_dir = NULL;
4556         }
4557
4558         rt->id = mfree(rt->id);
4559         rt->tmp_dir = mfree(rt->tmp_dir);
4560         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4561         safe_close_pair(rt->netns_storage_socket);
4562         return mfree(rt);
4563 }
4564
4565 static void exec_runtime_freep(ExecRuntime **rt) {
4566         if (*rt)
4567                 (void) exec_runtime_free(*rt, false);
4568 }
4569
4570 static int exec_runtime_allocate(ExecRuntime **rt) {
4571         assert(rt);
4572
4573         *rt = new0(ExecRuntime, 1);
4574         if (!*rt)
4575                 return -ENOMEM;
4576
4577         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4578         return 0;
4579 }
4580
4581 static int exec_runtime_add(
4582                 Manager *m,
4583                 const char *id,
4584                 const char *tmp_dir,
4585                 const char *var_tmp_dir,
4586                 const int netns_storage_socket[2],
4587                 ExecRuntime **ret) {
4588
4589         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4590         int r;
4591
4592         assert(m);
4593         assert(id);
4594
4595         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4596         if (r < 0)
4597                 return r;
4598
4599         r = exec_runtime_allocate(&rt);
4600         if (r < 0)
4601                 return r;
4602
4603         rt->id = strdup(id);
4604         if (!rt->id)
4605                 return -ENOMEM;
4606
4607         if (tmp_dir) {
4608                 rt->tmp_dir = strdup(tmp_dir);
4609                 if (!rt->tmp_dir)
4610                         return -ENOMEM;
4611
4612                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4613                 assert(var_tmp_dir);
4614                 rt->var_tmp_dir = strdup(var_tmp_dir);
4615                 if (!rt->var_tmp_dir)
4616                         return -ENOMEM;
4617         }
4618
4619         if (netns_storage_socket) {
4620                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4621                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4622         }
4623
4624         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4625         if (r < 0)
4626                 return r;
4627
4628         rt->manager = m;
4629
4630         if (ret)
4631                 *ret = rt;
4632
4633         /* do not remove created ExecRuntime object when the operation succeeds. */
4634         rt = NULL;
4635         return 0;
4636 }
4637
4638 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4639         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4640         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4641         int r;
4642
4643         assert(m);
4644         assert(c);
4645         assert(id);
4646
4647         /* It is not necessary to create ExecRuntime object. */
4648         if (!c->private_network && !c->private_tmp)
4649                 return 0;
4650
4651         if (c->private_tmp) {
4652                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4653                 if (r < 0)
4654                         return r;
4655         }
4656
4657         if (c->private_network) {
4658                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4659                         return -errno;
4660         }
4661
4662         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4663         if (r < 0)
4664                 return r;
4665
4666         /* Avoid cleanup */
4667         netns_storage_socket[0] = -1;
4668         netns_storage_socket[1] = -1;
4669         return 1;
4670 }
4671
4672 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4673         ExecRuntime *rt;
4674         int r;
4675
4676         assert(m);
4677         assert(id);
4678         assert(ret);
4679
4680         rt = hashmap_get(m->exec_runtime_by_id, id);
4681         if (rt)
4682                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4683                 goto ref;
4684
4685         if (!create)
4686                 return 0;
4687
4688         /* If not found, then create a new object. */
4689         r = exec_runtime_make(m, c, id, &rt);
4690         if (r <= 0)
4691                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4692                 return r;
4693
4694 ref:
4695         /* increment reference counter. */
4696         rt->n_ref++;
4697         *ret = rt;
4698         return 1;
4699 }
4700
4701 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4702         if (!rt)
4703                 return NULL;
4704
4705         assert(rt->n_ref > 0);
4706
4707         rt->n_ref--;
4708         if (rt->n_ref > 0)
4709                 return NULL;
4710
4711         return exec_runtime_free(rt, destroy);
4712 }
4713
4714 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4715         ExecRuntime *rt;
4716         Iterator i;
4717
4718         assert(m);
4719         assert(f);
4720         assert(fds);
4721
4722         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4723                 fprintf(f, "exec-runtime=%s", rt->id);
4724
4725                 if (rt->tmp_dir)
4726                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4727
4728                 if (rt->var_tmp_dir)
4729                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4730
4731                 if (rt->netns_storage_socket[0] >= 0) {
4732                         int copy;
4733
4734                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4735                         if (copy < 0)
4736                                 return copy;
4737
4738                         fprintf(f, " netns-socket-0=%i", copy);
4739                 }
4740
4741                 if (rt->netns_storage_socket[1] >= 0) {
4742                         int copy;
4743
4744                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4745                         if (copy < 0)
4746                                 return copy;
4747
4748                         fprintf(f, " netns-socket-1=%i", copy);
4749                 }
4750
4751                 fputc('\n', f);
4752         }
4753
4754         return 0;
4755 }
4756
4757 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4758         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4759         ExecRuntime *rt;
4760         int r;
4761
4762         /* This is for the migration from old (v237 or earlier) deserialization text.
4763          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4764          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4765          * so or not from the serialized text, then we always creates a new object owned by this. */
4766
4767         assert(u);
4768         assert(key);
4769         assert(value);
4770
4771         /* Manager manages ExecRuntime objects by the unit id.
4772          * So, we omit the serialized text when the unit does not have id (yet?)... */
4773         if (isempty(u->id)) {
4774                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4775                 return 0;
4776         }
4777
4778         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4779         if (r < 0) {
4780                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4781                 return 0;
4782         }
4783
4784         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4785         if (!rt) {
4786                 r = exec_runtime_allocate(&rt_create);
4787                 if (r < 0)
4788                         return log_oom();
4789
4790                 rt_create->id = strdup(u->id);
4791                 if (!rt_create->id)
4792                         return log_oom();
4793
4794                 rt = rt_create;
4795         }
4796
4797         if (streq(key, "tmp-dir")) {
4798                 char *copy;
4799
4800                 copy = strdup(value);
4801                 if (!copy)
4802                         return log_oom();
4803
4804                 free_and_replace(rt->tmp_dir, copy);
4805
4806         } else if (streq(key, "var-tmp-dir")) {
4807                 char *copy;
4808
4809                 copy = strdup(value);
4810                 if (!copy)
4811                         return log_oom();
4812
4813                 free_and_replace(rt->var_tmp_dir, copy);
4814
4815         } else if (streq(key, "netns-socket-0")) {
4816                 int fd;
4817
4818                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4819                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4820                         return 0;
4821                 }
4822
4823                 safe_close(rt->netns_storage_socket[0]);
4824                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4825
4826         } else if (streq(key, "netns-socket-1")) {
4827                 int fd;
4828
4829                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4830                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4831                         return 0;
4832                 }
4833
4834                 safe_close(rt->netns_storage_socket[1]);
4835                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4836         } else
4837                 return 0;
4838
4839         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4840         if (rt_create) {
4841                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4842                 if (r < 0) {
4843                         log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4844                         return 0;
4845                 }
4846
4847                 rt_create->manager = u->manager;
4848
4849                 /* Avoid cleanup */
4850                 rt_create = NULL;
4851         }
4852
4853         return 1;
4854 }
4855
4856 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4857         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4858         int r, fd0 = -1, fd1 = -1;
4859         const char *p, *v = value;
4860         size_t n;
4861
4862         assert(m);
4863         assert(value);
4864         assert(fds);
4865
4866         n = strcspn(v, " ");
4867         id = strndupa(v, n);
4868         if (v[n] != ' ')
4869                 goto finalize;
4870         p = v + n + 1;
4871
4872         v = startswith(p, "tmp-dir=");
4873         if (v) {
4874                 n = strcspn(v, " ");
4875                 tmp_dir = strndupa(v, n);
4876                 if (v[n] != ' ')
4877                         goto finalize;
4878                 p = v + n + 1;
4879         }
4880
4881         v = startswith(p, "var-tmp-dir=");
4882         if (v) {
4883                 n = strcspn(v, " ");
4884                 var_tmp_dir = strndupa(v, n);
4885                 if (v[n] != ' ')
4886                         goto finalize;
4887                 p = v + n + 1;
4888         }
4889
4890         v = startswith(p, "netns-socket-0=");
4891         if (v) {
4892                 char *buf;
4893
4894                 n = strcspn(v, " ");
4895                 buf = strndupa(v, n);
4896                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4897                         log_debug("Unable to process exec-runtime netns fd specification.");
4898                         return;
4899                 }
4900                 fd0 = fdset_remove(fds, fd0);
4901                 if (v[n] != ' ')
4902                         goto finalize;
4903                 p = v + n + 1;
4904         }
4905
4906         v = startswith(p, "netns-socket-1=");
4907         if (v) {
4908                 char *buf;
4909
4910                 n = strcspn(v, " ");
4911                 buf = strndupa(v, n);
4912                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4913                         log_debug("Unable to process exec-runtime netns fd specification.");
4914                         return;
4915                 }
4916                 fd1 = fdset_remove(fds, fd1);
4917         }
4918
4919 finalize:
4920
4921         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4922         if (r < 0) {
4923                 log_debug_errno(r, "Failed to add exec-runtime: %m");
4924                 return;
4925         }
4926 }
4927
4928 void exec_runtime_vacuum(Manager *m) {
4929         ExecRuntime *rt;
4930         Iterator i;
4931
4932         assert(m);
4933
4934         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4935
4936         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4937                 if (rt->n_ref > 0)
4938                         continue;
4939
4940                 (void) exec_runtime_free(rt, false);
4941         }
4942 }
4943
/* String names of the ExecInput enum values, indexed by enum value. Any slot below _EXEC_INPUT_MAX that is
 * not listed here is left NULL by the designated initializers. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

/* NOTE(review): presumably instantiates the ExecInput <-> string lookup helpers on top of the table above;
 * see the macro definition for the exact functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4956
/* String names of the ExecOutput enum values, indexed by enum value. The "+console" entries name the
 * *_AND_CONSOLE variants of the syslog, kmsg and journal outputs. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
};

/* NOTE(review): presumably instantiates the ExecOutput <-> string lookup helpers on top of the table above;
 * see the macro definition for the exact functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4973
/* String names of the ExecUtmpMode enum values, indexed by enum value. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

/* NOTE(review): presumably instantiates the ExecUtmpMode <-> string lookup helpers on top of the table above;
 * see the macro definition for the exact functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4981
/* String names of the ExecPreserveMode enum values, indexed by enum value. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* Unlike the neighbouring tables this one uses the _WITH_BOOLEAN flavour of the lookup macro and passes
 * EXEC_PRESERVE_YES as extra argument. NOTE(review): presumably this makes generic boolean strings parse to
 * EXEC_PRESERVE_YES or EXEC_PRESERVE_NO — confirm against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4989
/* String names of the ExecDirectoryType enum values, indexed by enum value. Note that these strings are
 * CamelCase, in contrast to the lower-case strings used by the other tables in this file section. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* NOTE(review): presumably instantiates the ExecDirectoryType <-> string lookup helpers on top of the table
 * above; see the macro definition for the exact functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4999
/* String names of the ExecKeyringMode enum values, indexed by enum value. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

/* NOTE(review): presumably instantiates the ExecKeyringMode <-> string lookup helpers on top of the table
 * above; see the macro definition for the exact functions it generates. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);