src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-file.h"
  54 #include "env-util.h"
  55 #include "errno-list.h"
  56 #include "execute.h"
  57 #include "exit-status.h"
  58 #include "fd-util.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "memory-util.h"
  69 #include "missing.h"
  70 #include "mkdir.h"
  71 #include "namespace.h"
  72 #include "parse-util.h"
  73 #include "path-util.h"
  74 #include "process-util.h"
  75 #include "rlimit-util.h"
  76 #include "rm-rf.h"
  77 #if HAVE_SECCOMP
  78 #include "seccomp-util.h"
  79 #endif
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "utmp-wtmp.h"
  96
  97 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  98 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  99
 100 #define SNDBUF_SIZE (8*1024*1024)
 101
 102 static int shift_fds(int fds[], size_t n_fds) {
 103         int start, restart_from;
 104
 105         if (n_fds <= 0)
 106                 return 0;
 107
 108         /* Modifies the fds array! (sorts it) */
 109
 110         assert(fds);
 111
 112         start = 0;
 113         for (;;) {
 114                 int i;
 115
 116                 restart_from = -1;
 117
 118                 for (i = start; i < (int) n_fds; i++) {
 119                         int nfd;
 120
 121                         /* Already at right index? */
 122                         if (fds[i] == i+3)
 123                                 continue;
 124
 125                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 126                         if (nfd < 0)
 127                                 return -errno;
 128
 129                         safe_close(fds[i]);
 130                         fds[i] = nfd;
 131
 132                         /* Hmm, the fd we wanted isn't free? Then
 133                          * let's remember that and try again from here */
 134                         if (nfd != i+3 && restart_from < 0)
 135                                 restart_from = i;
 136                 }
 137
 138                 if (restart_from < 0)
 139                         break;
 140
 141                 start = restart_from;
 142         }
 143
 144         return 0;
 145 }
 146
 147 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 148         size_t i, n_fds;
 149         int r;
 150
 151         n_fds = n_socket_fds + n_storage_fds;
 152         if (n_fds <= 0)
 153                 return 0;
 154
 155         assert(fds);
 156
 157         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 158          * O_NONBLOCK only applies to socket activation though. */
 159
 160         for (i = 0; i < n_fds; i++) {
 161
 162                 if (i < n_socket_fds) {
 163                         r = fd_nonblock(fds[i], nonblock);
 164                         if (r < 0)
 165                                 return r;
 166                 }
 167
 168                 /* We unconditionally drop FD_CLOEXEC from the fds,
 169                  * since after all we want to pass these fds to our
 170                  * children */
 171
 172                 r = fd_cloexec(fds[i], false);
 173                 if (r < 0)
 174                         return r;
 175         }
 176
 177         return 0;
 178 }
 179
 180 static const char *exec_context_tty_path(const ExecContext *context) {
 181         assert(context);
 182
 183         if (context->stdio_as_fds)
 184                 return NULL;
 185
 186         if (context->tty_path)
 187                 return context->tty_path;
 188
 189         return "/dev/console";
 190 }
 191
 192 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 193         const char *path;
 194
 195         assert(context);
 196
 197         path = exec_context_tty_path(context);
 198
 199         if (context->tty_vhangup) {
 200                 if (p && p->stdin_fd >= 0)
 201                         (void) terminal_vhangup_fd(p->stdin_fd);
 202                 else if (path)
 203                         (void) terminal_vhangup(path);
 204         }
 205
 206         if (context->tty_reset) {
 207                 if (p && p->stdin_fd >= 0)
 208                         (void) reset_terminal_fd(p->stdin_fd, true);
 209                 else if (path)
 210                         (void) reset_terminal(path);
 211         }
 212
 213         if (context->tty_vt_disallocate && path)
 214                 (void) vt_disallocate(path);
 215 }
 216
 217 static bool is_terminal_input(ExecInput i) {
 218         return IN_SET(i,
 219                       EXEC_INPUT_TTY,
 220                       EXEC_INPUT_TTY_FORCE,
 221                       EXEC_INPUT_TTY_FAIL);
 222 }
 223
 224 static bool is_terminal_output(ExecOutput o) {
 225         return IN_SET(o,
 226                       EXEC_OUTPUT_TTY,
 227                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 228                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 229                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 230 }
 231
 232 static bool is_syslog_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_SYSLOG,
 235                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 236 }
 237
 238 static bool is_kmsg_output(ExecOutput o) {
 239         return IN_SET(o,
 240                       EXEC_OUTPUT_KMSG,
 241                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 242 }
 243
 244 static bool exec_context_needs_term(const ExecContext *c) {
 245         assert(c);
 246
 247         /* Return true if the execution context suggests we should set $TERM to something useful. */
 248
 249         if (is_terminal_input(c->std_input))
 250                 return true;
 251
 252         if (is_terminal_output(c->std_output))
 253                 return true;
 254
 255         if (is_terminal_output(c->std_error))
 256                 return true;
 257
 258         return !!c->tty_path;
 259 }
 260
 261 static int open_null_as(int flags, int nfd) {
 262         int fd;
 263
 264         assert(nfd >= 0);
 265
 266         fd = open("/dev/null", flags|O_NOCTTY);
 267         if (fd < 0)
 268                 return -errno;
 269
 270         return move_fd(fd, nfd, false);
 271 }
 272
 273 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 274         static const union sockaddr_union sa = {
 275                 .un.sun_family = AF_UNIX,
 276                 .un.sun_path = "/run/systemd/journal/stdout",
 277         };
 278         uid_t olduid = UID_INVALID;
 279         gid_t oldgid = GID_INVALID;
 280         int r;
 281
 282         if (gid_is_valid(gid)) {
 283                 oldgid = getgid();
 284
 285                 if (setegid(gid) < 0)
 286                         return -errno;
 287         }
 288
 289         if (uid_is_valid(uid)) {
 290                 olduid = getuid();
 291
 292                 if (seteuid(uid) < 0) {
 293                         r = -errno;
 294                         goto restore_gid;
 295                 }
 296         }
 297
 298         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 299
 300         /* If we fail to restore the uid or gid, things will likely
 301            fail later on. This should only happen if an LSM interferes. */
 302
 303         if (uid_is_valid(uid))
 304                 (void) seteuid(olduid);
 305
 306  restore_gid:
 307         if (gid_is_valid(gid))
 308                 (void) setegid(oldgid);
 309
 310         return r;
 311 }
 312
 313 static int connect_logger_as(
 314                 const Unit *unit,
 315                 const ExecContext *context,
 316                 const ExecParameters *params,
 317                 ExecOutput output,
 318                 const char *ident,
 319                 int nfd,
 320                 uid_t uid,
 321                 gid_t gid) {
 322
 323         _cleanup_close_ int fd = -1;
 324         int r;
 325
 326         assert(context);
 327         assert(params);
 328         assert(output < _EXEC_OUTPUT_MAX);
 329         assert(ident);
 330         assert(nfd >= 0);
 331
 332         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 333         if (fd < 0)
 334                 return -errno;
 335
 336         r = connect_journal_socket(fd, uid, gid);
 337         if (r < 0)
 338                 return r;
 339
 340         if (shutdown(fd, SHUT_RD) < 0)
 341                 return -errno;
 342
 343         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 344
 345         if (dprintf(fd,
 346                 "%s\n"
 347                 "%s\n"
 348                 "%i\n"
 349                 "%i\n"
 350                 "%i\n"
 351                 "%i\n"
 352                 "%i\n",
 353                 context->syslog_identifier ?: ident,
 354                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 355                 context->syslog_priority,
 356                 !!context->syslog_level_prefix,
 357                 is_syslog_output(output),
 358                 is_kmsg_output(output),
 359                 is_terminal_output(output)) < 0)
 360                 return -errno;
 361
 362         return move_fd(TAKE_FD(fd), nfd, false);
 363 }
 364
 365 static int open_terminal_as(const char *path, int flags, int nfd) {
 366         int fd;
 367
 368         assert(path);
 369         assert(nfd >= 0);
 370
 371         fd = open_terminal(path, flags | O_NOCTTY);
 372         if (fd < 0)
 373                 return fd;
 374
 375         return move_fd(fd, nfd, false);
 376 }
 377
 378 static int acquire_path(const char *path, int flags, mode_t mode) {
 379         union sockaddr_union sa = {};
 380         _cleanup_close_ int fd = -1;
 381         int r, salen;
 382
 383         assert(path);
 384
 385         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 386                 flags |= O_CREAT;
 387
 388         fd = open(path, flags|O_NOCTTY, mode);
 389         if (fd >= 0)
 390                 return TAKE_FD(fd);
 391
 392         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 393                 return -errno;
 394         if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 395                 return -ENXIO;
 396
 397         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 398
 399         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 400         if (fd < 0)
 401                 return -errno;
 402
 403         salen = sockaddr_un_set_path(&sa.un, path);
 404         if (salen < 0)
 405                 return salen;
 406
 407         if (connect(fd, &sa.sa, salen) < 0)
 408                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 409                                                            * indication that his wasn't an AF_UNIX socket after all */
 410
 411         if ((flags & O_ACCMODE) == O_RDONLY)
 412                 r = shutdown(fd, SHUT_WR);
 413         else if ((flags & O_ACCMODE) == O_WRONLY)
 414                 r = shutdown(fd, SHUT_RD);
 415         else
 416                 return TAKE_FD(fd);
 417         if (r < 0)
 418                 return -errno;
 419
 420         return TAKE_FD(fd);
 421 }
 422
 423 static int fixup_input(
 424                 const ExecContext *context,
 425                 int socket_fd,
 426                 bool apply_tty_stdin) {
 427
 428         ExecInput std_input;
 429
 430         assert(context);
 431
 432         std_input = context->std_input;
 433
 434         if (is_terminal_input(std_input) && !apply_tty_stdin)
 435                 return EXEC_INPUT_NULL;
 436
 437         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 438                 return EXEC_INPUT_NULL;
 439
 440         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 441                 return EXEC_INPUT_NULL;
 442
 443         return std_input;
 444 }
 445
 446 static int fixup_output(ExecOutput std_output, int socket_fd) {
 447
 448         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 449                 return EXEC_OUTPUT_INHERIT;
 450
 451         return std_output;
 452 }
 453
 454 static int setup_input(
 455                 const ExecContext *context,
 456                 const ExecParameters *params,
 457                 int socket_fd,
 458                 int named_iofds[3]) {
 459
 460         ExecInput i;
 461
 462         assert(context);
 463         assert(params);
 464
 465         if (params->stdin_fd >= 0) {
 466                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 467                         return -errno;
 468
 469                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 470                 if (isatty(STDIN_FILENO)) {
 471                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 472                         (void) reset_terminal_fd(STDIN_FILENO, true);
 473                 }
 474
 475                 return STDIN_FILENO;
 476         }
 477
 478         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 479
 480         switch (i) {
 481
 482         case EXEC_INPUT_NULL:
 483                 return open_null_as(O_RDONLY, STDIN_FILENO);
 484
 485         case EXEC_INPUT_TTY:
 486         case EXEC_INPUT_TTY_FORCE:
 487         case EXEC_INPUT_TTY_FAIL: {
 488                 int fd;
 489
 490                 fd = acquire_terminal(exec_context_tty_path(context),
 491                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 492                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 493                                                                   ACQUIRE_TERMINAL_WAIT,
 494                                       USEC_INFINITY);
 495                 if (fd < 0)
 496                         return fd;
 497
 498                 return move_fd(fd, STDIN_FILENO, false);
 499         }
 500
 501         case EXEC_INPUT_SOCKET:
 502                 assert(socket_fd >= 0);
 503
 504                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 505
 506         case EXEC_INPUT_NAMED_FD:
 507                 assert(named_iofds[STDIN_FILENO] >= 0);
 508
 509                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 510                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 511
 512         case EXEC_INPUT_DATA: {
 513                 int fd;
 514
 515                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 516                 if (fd < 0)
 517                         return fd;
 518
 519                 return move_fd(fd, STDIN_FILENO, false);
 520         }
 521
 522         case EXEC_INPUT_FILE: {
 523                 bool rw;
 524                 int fd;
 525
 526                 assert(context->stdio_file[STDIN_FILENO]);
 527
 528                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 529                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 530
 531                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 532                 if (fd < 0)
 533                         return fd;
 534
 535                 return move_fd(fd, STDIN_FILENO, false);
 536         }
 537
 538         default:
 539                 assert_not_reached("Unknown input type");
 540         }
 541 }
 542
 543 static bool can_inherit_stderr_from_stdout(
 544                 const ExecContext *context,
 545                 ExecOutput o,
 546                 ExecOutput e) {
 547
 548         assert(context);
 549
 550         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 551          * stderr fd */
 552
 553         if (e == EXEC_OUTPUT_INHERIT)
 554                 return true;
 555         if (e != o)
 556                 return false;
 557
 558         if (e == EXEC_OUTPUT_NAMED_FD)
 559                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 560
 561         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 562                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 563
 564         return true;
 565 }
 566
 567 static int setup_output(
 568                 const Unit *unit,
 569                 const ExecContext *context,
 570                 const ExecParameters *params,
 571                 int fileno,
 572                 int socket_fd,
 573                 int named_iofds[3],
 574                 const char *ident,
 575                 uid_t uid,
 576                 gid_t gid,
 577                 dev_t *journal_stream_dev,
 578                 ino_t *journal_stream_ino) {
 579
 580         ExecOutput o;
 581         ExecInput i;
 582         int r;
 583
 584         assert(unit);
 585         assert(context);
 586         assert(params);
 587         assert(ident);
 588         assert(journal_stream_dev);
 589         assert(journal_stream_ino);
 590
 591         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 592
 593                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 594                         return -errno;
 595
 596                 return STDOUT_FILENO;
 597         }
 598
 599         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 600                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 601                         return -errno;
 602
 603                 return STDERR_FILENO;
 604         }
 605
 606         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 607         o = fixup_output(context->std_output, socket_fd);
 608
 609         if (fileno == STDERR_FILENO) {
 610                 ExecOutput e;
 611                 e = fixup_output(context->std_error, socket_fd);
 612
 613                 /* This expects the input and output are already set up */
 614
 615                 /* Don't change the stderr file descriptor if we inherit all
 616                  * the way and are not on a tty */
 617                 if (e == EXEC_OUTPUT_INHERIT &&
 618                     o == EXEC_OUTPUT_INHERIT &&
 619                     i == EXEC_INPUT_NULL &&
 620                     !is_terminal_input(context->std_input) &&
 621                     getppid () != 1)
 622                         return fileno;
 623
 624                 /* Duplicate from stdout if possible */
 625                 if (can_inherit_stderr_from_stdout(context, o, e))
 626                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 627
 628                 o = e;
 629
 630         } else if (o == EXEC_OUTPUT_INHERIT) {
 631                 /* If input got downgraded, inherit the original value */
 632                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 633                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 634
 635                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 636                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 637                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 638
 639                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 640                 if (getppid() != 1)
 641                         return fileno;
 642
 643                 /* We need to open /dev/null here anew, to get the right access mode. */
 644                 return open_null_as(O_WRONLY, fileno);
 645         }
 646
 647         switch (o) {
 648
 649         case EXEC_OUTPUT_NULL:
 650                 return open_null_as(O_WRONLY, fileno);
 651
 652         case EXEC_OUTPUT_TTY:
 653                 if (is_terminal_input(i))
 654                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 655
 656                 /* We don't reset the terminal if this is just about output */
 657                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 658
 659         case EXEC_OUTPUT_SYSLOG:
 660         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 661         case EXEC_OUTPUT_KMSG:
 662         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 663         case EXEC_OUTPUT_JOURNAL:
 664         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 665                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 666                 if (r < 0) {
 667                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 668                         r = open_null_as(O_WRONLY, fileno);
 669                 } else {
 670                         struct stat st;
 671
 672                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 673                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 674                          * services to detect whether they are connected to the journal or not.
 675                          *
 676                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 677                          * about STDERR as that's usually the best way to do logging. */
 678
 679                         if (fstat(fileno, &st) >= 0 &&
 680                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 681                                 *journal_stream_dev = st.st_dev;
 682                                 *journal_stream_ino = st.st_ino;
 683                         }
 684                 }
 685                 return r;
 686
 687         case EXEC_OUTPUT_SOCKET:
 688                 assert(socket_fd >= 0);
 689
 690                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 691
 692         case EXEC_OUTPUT_NAMED_FD:
 693                 assert(named_iofds[fileno] >= 0);
 694
 695                 (void) fd_nonblock(named_iofds[fileno], false);
 696                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 697
 698         case EXEC_OUTPUT_FILE:
 699         case EXEC_OUTPUT_FILE_APPEND: {
 700                 bool rw;
 701                 int fd, flags;
 702
 703                 assert(context->stdio_file[fileno]);
 704
 705                 rw = context->std_input == EXEC_INPUT_FILE &&
 706                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 707
 708                 if (rw)
 709                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 710
 711                 flags = O_WRONLY;
 712                 if (o == EXEC_OUTPUT_FILE_APPEND)
 713                         flags |= O_APPEND;
 714
 715                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 716                 if (fd < 0)
 717                         return fd;
 718
 719                 return move_fd(fd, fileno, 0);
 720         }
 721
 722         default:
 723                 assert_not_reached("Unknown error type");
 724         }
 725 }
 726
 727 static int chown_terminal(int fd, uid_t uid) {
 728         int r;
 729
 730         assert(fd >= 0);
 731
 732         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 733         if (isatty(fd) < 1) {
 734                 if (IN_SET(errno, EINVAL, ENOTTY))
 735                         return 0; /* not a tty */
 736
 737                 return -errno;
 738         }
 739
 740         /* This might fail. What matters are the results. */
 741         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 742         if (r < 0)
 743                 return r;
 744
 745         return 1;
 746 }
 747
 748 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 749         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 750         int r;
 751
 752         assert(_saved_stdin);
 753         assert(_saved_stdout);
 754
 755         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 756         if (saved_stdin < 0)
 757                 return -errno;
 758
 759         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 760         if (saved_stdout < 0)
 761                 return -errno;
 762
 763         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 764         if (fd < 0)
 765                 return fd;
 766
 767         r = chown_terminal(fd, getuid());
 768         if (r < 0)
 769                 return r;
 770
 771         r = reset_terminal_fd(fd, true);
 772         if (r < 0)
 773                 return r;
 774
 775         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 776         fd = -1;
 777         if (r < 0)
 778                 return r;
 779
 780         *_saved_stdin = saved_stdin;
 781         *_saved_stdout = saved_stdout;
 782
 783         saved_stdin = saved_stdout = -1;
 784
 785         return 0;
 786 }
 787
 788 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 789         assert(err < 0);
 790
 791         if (err == -ETIMEDOUT)
 792                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 793         else {
 794                 errno = -err;
 795                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 796         }
 797 }
 798
 799 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 800         _cleanup_close_ int fd = -1;
 801
 802         assert(vc);
 803
 804         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 805         if (fd < 0)
 806                 return;
 807
 808         write_confirm_error_fd(err, fd, u);
 809 }
 810
 811 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 812         int r = 0;
 813
 814         assert(saved_stdin);
 815         assert(saved_stdout);
 816
 817         release_terminal();
 818
 819         if (*saved_stdin >= 0)
 820                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 821                         r = -errno;
 822
 823         if (*saved_stdout >= 0)
 824                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 825                         r = -errno;
 826
 827         *saved_stdin = safe_close(*saved_stdin);
 828         *saved_stdout = safe_close(*saved_stdout);
 829
 830         return r;
 831 }
 832
 833 enum {
 834         CONFIRM_PRETEND_FAILURE = -1,
 835         CONFIRM_PRETEND_SUCCESS =  0,
 836         CONFIRM_EXECUTE = 1,
 837 };
 838
 839 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 840         int saved_stdout = -1, saved_stdin = -1, r;
 841         _cleanup_free_ char *e = NULL;
 842         char c;
 843
 844         /* For any internal errors, assume a positive response. */
 845         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 846         if (r < 0) {
 847                 write_confirm_error(r, vc, u);
 848                 return CONFIRM_EXECUTE;
 849         }
 850
 851         /* confirm_spawn might have been disabled while we were sleeping. */
 852         if (manager_is_confirm_spawn_disabled(u->manager)) {
 853                 r = 1;
 854                 goto restore_stdio;
 855         }
 856
 857         e = ellipsize(cmdline, 60, 100);
 858         if (!e) {
 859                 log_oom();
 860                 r = CONFIRM_EXECUTE;
 861                 goto restore_stdio;
 862         }
 863
 864         for (;;) {
 865                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 866                 if (r < 0) {
 867                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 868                         r = CONFIRM_EXECUTE;
 869                         goto restore_stdio;
 870                 }
 871
 872                 switch (c) {
 873                 case 'c':
 874                         printf("Resuming normal execution.\n");
 875                         manager_disable_confirm_spawn();
 876                         r = 1;
 877                         break;
 878                 case 'D':
 879                         unit_dump(u, stdout, "  ");
 880                         continue; /* ask again */
 881                 case 'f':
 882                         printf("Failing execution.\n");
 883                         r = CONFIRM_PRETEND_FAILURE;
 884                         break;
 885                 case 'h':
 886                         printf("  c - continue, proceed without asking anymore\n"
 887                                "  D - dump, show the state of the unit\n"
 888                                "  f - fail, don't execute the command and pretend it failed\n"
 889                                "  h - help\n"
 890                                "  i - info, show a short summary of the unit\n"
 891                                "  j - jobs, show jobs that are in progress\n"
 892                                "  s - skip, don't execute the command and pretend it succeeded\n"
 893                                "  y - yes, execute the command\n");
 894                         continue; /* ask again */
 895                 case 'i':
 896                         printf("  Description: %s\n"
 897                                "  Unit:        %s\n"
 898                                "  Command:     %s\n",
 899                                u->id, u->description, cmdline);
 900                         continue; /* ask again */
 901                 case 'j':
 902                         manager_dump_jobs(u->manager, stdout, "  ");
 903                         continue; /* ask again */
 904                 case 'n':
 905                         /* 'n' was removed in favor of 'f'. */
 906                         printf("Didn't understand 'n', did you mean 'f'?\n");
 907                         continue; /* ask again */
 908                 case 's':
 909                         printf("Skipping execution.\n");
 910                         r = CONFIRM_PRETEND_SUCCESS;
 911                         break;
 912                 case 'y':
 913                         r = CONFIRM_EXECUTE;
 914                         break;
 915                 default:
 916                         assert_not_reached("Unhandled choice");
 917                 }
 918                 break;
 919         }
 920
 921 restore_stdio:
 922         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 923         return r;
 924 }
 925
 926 static int get_fixed_user(const ExecContext *c, const char **user,
 927                           uid_t *uid, gid_t *gid,
 928                           const char **home, const char **shell) {
 929         int r;
 930         const char *name;
 931
 932         assert(c);
 933
 934         if (!c->user)
 935                 return 0;
 936
 937         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 938          * (i.e. are "/" or "/bin/nologin"). */
 939
 940         name = c->user;
 941         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 942         if (r < 0)
 943                 return r;
 944
 945         *user = name;
 946         return 0;
 947 }
 948
 949 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 950         int r;
 951         const char *name;
 952
 953         assert(c);
 954
 955         if (!c->group)
 956                 return 0;
 957
 958         name = c->group;
 959         r = get_group_creds(&name, gid, 0);
 960         if (r < 0)
 961                 return r;
 962
 963         *group = name;
 964         return 0;
 965 }
 966
 967 static int get_supplementary_groups(const ExecContext *c, const char *user,
 968                                     const char *group, gid_t gid,
 969                                     gid_t **supplementary_gids, int *ngids) {
 970         char **i;
 971         int r, k = 0;
 972         int ngroups_max;
 973         bool keep_groups = false;
 974         gid_t *groups = NULL;
 975         _cleanup_free_ gid_t *l_gids = NULL;
 976
 977         assert(c);
 978
 979         /*
 980          * If user is given, then lookup GID and supplementary groups list.
 981          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 982          * here and as early as possible so we keep the list of supplementary
 983          * groups of the caller.
 984          */
 985         if (user && gid_is_valid(gid) && gid != 0) {
 986                 /* First step, initialize groups from /etc/groups */
 987                 if (initgroups(user, gid) < 0)
 988                         return -errno;
 989
 990                 keep_groups = true;
 991         }
 992
 993         if (strv_isempty(c->supplementary_groups))
 994                 return 0;
 995
 996         /*
 997          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 998          * be positive, otherwise fail.
 999          */
1000         errno = 0;
1001         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1002         if (ngroups_max <= 0) {
1003                 if (errno > 0)
1004                         return -errno;
1005                 else
1006                         return -EOPNOTSUPP; /* For all other values */
1007         }
1008
1009         l_gids = new(gid_t, ngroups_max);
1010         if (!l_gids)
1011                 return -ENOMEM;
1012
1013         if (keep_groups) {
1014                 /*
1015                  * Lookup the list of groups that the user belongs to, we
1016                  * avoid NSS lookups here too for gid=0.
1017                  */
1018                 k = ngroups_max;
1019                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1020                         return -EINVAL;
1021         } else
1022                 k = 0;
1023
1024         STRV_FOREACH(i, c->supplementary_groups) {
1025                 const char *g;
1026
1027                 if (k >= ngroups_max)
1028                         return -E2BIG;
1029
1030                 g = *i;
1031                 r = get_group_creds(&g, l_gids+k, 0);
1032                 if (r < 0)
1033                         return r;
1034
1035                 k++;
1036         }
1037
1038         /*
1039          * Sets ngids to zero to drop all supplementary groups, happens
1040          * when we are under root and SupplementaryGroups= is empty.
1041          */
1042         if (k == 0) {
1043                 *ngids = 0;
1044                 return 0;
1045         }
1046
1047         /* Otherwise get the final list of supplementary groups */
1048         groups = memdup(l_gids, sizeof(gid_t) * k);
1049         if (!groups)
1050                 return -ENOMEM;
1051
1052         *supplementary_gids = groups;
1053         *ngids = k;
1054
1055         groups = NULL;
1056
1057         return 0;
1058 }
1059
/* Applies the group identity of the process: first the supplementary group list (only if ngids is
 * positive), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative error from maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1078
1079 static int enforce_user(const ExecContext *context, uid_t uid) {
1080         assert(context);
1081
1082         if (!uid_is_valid(uid))
1083                 return 0;
1084
1085         /* Sets (but doesn't look up) the uid and make sure we keep the
1086          * capabilities while doing so. */
1087
1088         if (context->capability_ambient_set != 0) {
1089
1090                 /* First step: If we need to keep capabilities but
1091                  * drop privileges we need to make sure we keep our
1092                  * caps, while we drop privileges. */
1093                 if (uid != 0) {
1094                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1095
1096                         if (prctl(PR_GET_SECUREBITS) != sb)
1097                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1098                                         return -errno;
1099                 }
1100         }
1101
1102         /* Second step: actually set the uids */
1103         if (setresuid(uid, uid, uid) < 0)
1104                 return -errno;
1105
1106         /* At this point we should have all necessary capabilities but
1107            are otherwise a normal user. However, the caps might got
1108            corrupted due to the setresuid() so we need clean them up
1109            later. This is done outside of this call. */
1110
1111         return 0;
1112 }
1113
1114 #if HAVE_PAM
1115
1116 static int null_conv(
1117                 int num_msg,
1118                 const struct pam_message **msg,
1119                 struct pam_response **resp,
1120                 void *appdata_ptr) {
1121
1122         /* We don't support conversations */
1123
1124         return PAM_CONV_ERR;
1125 }
1126
1127 #endif
1128
/* Opens a PAM session for a service and forks off a helper process, "(sd-pam)", that closes the
 * session again once the calling process has gone away.
 *
 *   name   - PAM service name passed to pam_start()
 *   user   - user name passed to pam_start()
 *   uid    - UID the helper process switches to
 *   gid    - GID the helper process switches to
 *   tty    - TTY to register as PAM_TTY; if NULL the TTY name of stdin is used when it has one
 *   env    - in: environment exported to PAM; out (on success): replaced by the PAM environment list
 *   fds    - file descriptors that are closed in the helper process
 *   n_fds  - number of entries in fds
 *
 * On success the return value is that of strv_free_and_replace(). On failure the error is logged and
 * a negative errno-style value is returned; PAM-level failures are reported as -EPERM. When built
 * without PAM support the function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                /* Make sure the failure path does not try to end a handle that never got started */
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() returns 0 on success and a positive error number on failure;
                                 * it never returns a negative value and does not set errno, hence the
                                 * return value itself has to be inspected. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the PAM environment list to the caller in place of the environment passed in */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1344
/* Renames the current process to "(<name>)", where <name> is taken from the last component of path.
 * At most the final 8 characters of that component are used; if the component is empty the process is
 * renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps: '(' + up to 8 characters + ')', plus the terminating NUL. */
        char buf[11];
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1375
1376 static bool context_has_address_families(const ExecContext *c) {
1377         assert(c);
1378
1379         return c->address_families_whitelist ||
1380                 !set_isempty(c->address_families);
1381 }
1382
1383 static bool context_has_syscall_filters(const ExecContext *c) {
1384         assert(c);
1385
1386         return c->syscall_whitelist ||
1387                 !hashmap_isempty(c->syscall_filter);
1388 }
1389
1390 static bool context_has_no_new_privileges(const ExecContext *c) {
1391         assert(c);
1392
1393         if (c->no_new_privileges)
1394                 return true;
1395
1396         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1397                 return false;
1398
1399         /* We need NNP if we have any form of seccomp and are unprivileged */
1400         return context_has_address_families(c) ||
1401                 c->memory_deny_write_execute ||
1402                 c->restrict_realtime ||
1403                 c->restrict_suid_sgid ||
1404                 exec_context_restrict_namespaces_set(c) ||
1405                 c->protect_kernel_tunables ||
1406                 c->protect_kernel_modules ||
1407                 c->private_devices ||
1408                 context_has_syscall_filters(c) ||
1409                 !set_isempty(c->syscall_archs) ||
1410                 c->lock_personality ||
1411                 c->protect_hostname;
1412 }
1413
1414 #if HAVE_SECCOMP
1415
1416 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1417
1418         if (is_seccomp_available())
1419                 return false;
1420
1421         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1422         return true;
1423 }
1424
1425 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1426         uint32_t negative_action, default_action, action;
1427         int r;
1428
1429         assert(u);
1430         assert(c);
1431
1432         if (!context_has_syscall_filters(c))
1433                 return 0;
1434
1435         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1436                 return 0;
1437
1438         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1439
1440         if (c->syscall_whitelist) {
1441                 default_action = negative_action;
1442                 action = SCMP_ACT_ALLOW;
1443         } else {
1444                 default_action = SCMP_ACT_ALLOW;
1445                 action = negative_action;
1446         }
1447
1448         if (needs_ambient_hack) {
1449                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1450                 if (r < 0)
1451                         return r;
1452         }
1453
1454         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1455 }
1456
1457 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1458         assert(u);
1459         assert(c);
1460
1461         if (set_isempty(c->syscall_archs))
1462                 return 0;
1463
1464         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1465                 return 0;
1466
1467         return seccomp_restrict_archs(c->syscall_archs);
1468 }
1469
1470 static int apply_address_families(const Unit* u, const ExecContext *c) {
1471         assert(u);
1472         assert(c);
1473
1474         if (!context_has_address_families(c))
1475                 return 0;
1476
1477         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1478                 return 0;
1479
1480         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1481 }
1482
1483 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1484         assert(u);
1485         assert(c);
1486
1487         if (!c->memory_deny_write_execute)
1488                 return 0;
1489
1490         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1491                 return 0;
1492
1493         return seccomp_memory_deny_write_execute();
1494 }
1495
1496 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1497         assert(u);
1498         assert(c);
1499
1500         if (!c->restrict_realtime)
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1504                 return 0;
1505
1506         return seccomp_restrict_realtime();
1507 }
1508
1509 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1510         assert(u);
1511         assert(c);
1512
1513         if (!c->restrict_suid_sgid)
1514                 return 0;
1515
1516         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1517                 return 0;
1518
1519         return seccomp_restrict_suid_sgid();
1520 }
1521
1522 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1523         assert(u);
1524         assert(c);
1525
1526         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1527          * let's protect even those systems where this is left on in the kernel. */
1528
1529         if (!c->protect_kernel_tunables)
1530                 return 0;
1531
1532         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1533                 return 0;
1534
1535         return seccomp_protect_sysctl();
1536 }
1537
1538 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1539         assert(u);
1540         assert(c);
1541
1542         /* Turn off module syscalls on ProtectKernelModules=yes */
1543
1544         if (!c->protect_kernel_modules)
1545                 return 0;
1546
1547         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1548                 return 0;
1549
1550         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1551 }
1552
1553 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1554         assert(u);
1555         assert(c);
1556
1557         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1558
1559         if (!c->private_devices)
1560                 return 0;
1561
1562         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1563                 return 0;
1564
1565         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1566 }
1567
1568 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1569         assert(u);
1570         assert(c);
1571
1572         if (!exec_context_restrict_namespaces_set(c))
1573                 return 0;
1574
1575         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1576                 return 0;
1577
1578         return seccomp_restrict_namespaces(c->restrict_namespaces);
1579 }
1580
1581 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1582         unsigned long personality;
1583         int r;
1584
1585         assert(u);
1586         assert(c);
1587
1588         if (!c->lock_personality)
1589                 return 0;
1590
1591         if (skip_seccomp_unavailable(u, "LockPersonality="))
1592                 return 0;
1593
1594         personality = c->personality;
1595
1596         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1597         if (personality == PERSONALITY_INVALID) {
1598
1599                 r = opinionated_personality(&personality);
1600                 if (r < 0)
1601                         return r;
1602         }
1603
1604         return seccomp_lock_personality(personality);
1605 }
1606
1607 #endif
1608
1609 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1610         assert(idle_pipe);
1611
1612         idle_pipe[1] = safe_close(idle_pipe[1]);
1613         idle_pipe[2] = safe_close(idle_pipe[2]);
1614
1615         if (idle_pipe[0] >= 0) {
1616                 int r;
1617
1618                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1619
1620                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1621                         ssize_t n;
1622
1623                         /* Signal systemd that we are bored and want to continue. */
1624                         n = write(idle_pipe[3], "x", 1);
1625                         if (n > 0)
1626                                 /* Wait for systemd to react to the signal above. */
1627                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1628                 }
1629
1630                 idle_pipe[0] = safe_close(idle_pipe[0]);
1631
1632         }
1633
1634         idle_pipe[3] = safe_close(idle_pipe[3]);
1635 }
1636
1637 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1638
1639 static int build_environment(
1640                 const Unit *u,
1641                 const ExecContext *c,
1642                 const ExecParameters *p,
1643                 size_t n_fds,
1644                 const char *home,
1645                 const char *username,
1646                 const char *shell,
1647                 dev_t journal_stream_dev,
1648                 ino_t journal_stream_ino,
1649                 char ***ret) {
1650
1651         _cleanup_strv_free_ char **our_env = NULL;
1652         ExecDirectoryType t;
1653         size_t n_env = 0;
1654         char *x;
1655
1656         assert(u);
1657         assert(c);
1658         assert(p);
1659         assert(ret);
1660
1661         our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
1662         if (!our_env)
1663                 return -ENOMEM;
1664
1665         if (n_fds > 0) {
1666                 _cleanup_free_ char *joined = NULL;
1667
1668                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1669                         return -ENOMEM;
1670                 our_env[n_env++] = x;
1671
1672                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1673                         return -ENOMEM;
1674                 our_env[n_env++] = x;
1675
1676                 joined = strv_join(p->fd_names, ":");
1677                 if (!joined)
1678                         return -ENOMEM;
1679
1680                 x = strjoin("LISTEN_FDNAMES=", joined);
1681                 if (!x)
1682                         return -ENOMEM;
1683                 our_env[n_env++] = x;
1684         }
1685
1686         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1687                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1688                         return -ENOMEM;
1689                 our_env[n_env++] = x;
1690
1691                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1692                         return -ENOMEM;
1693                 our_env[n_env++] = x;
1694         }
1695
1696         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1697          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1698          * check the database directly. */
1699         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1700                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1701                 if (!x)
1702                         return -ENOMEM;
1703                 our_env[n_env++] = x;
1704         }
1705
1706         if (home) {
1707                 x = strappend("HOME=", home);
1708                 if (!x)
1709                         return -ENOMEM;
1710
1711                 path_simplify(x + 5, true);
1712                 our_env[n_env++] = x;
1713         }
1714
1715         if (username) {
1716                 x = strappend("LOGNAME=", username);
1717                 if (!x)
1718                         return -ENOMEM;
1719                 our_env[n_env++] = x;
1720
1721                 x = strappend("USER=", username);
1722                 if (!x)
1723                         return -ENOMEM;
1724                 our_env[n_env++] = x;
1725         }
1726
1727         if (shell) {
1728                 x = strappend("SHELL=", shell);
1729                 if (!x)
1730                         return -ENOMEM;
1731
1732                 path_simplify(x + 6, true);
1733                 our_env[n_env++] = x;
1734         }
1735
1736         if (!sd_id128_is_null(u->invocation_id)) {
1737                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1738                         return -ENOMEM;
1739
1740                 our_env[n_env++] = x;
1741         }
1742
1743         if (exec_context_needs_term(c)) {
1744                 const char *tty_path, *term = NULL;
1745
1746                 tty_path = exec_context_tty_path(c);
1747
1748                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1749                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1750                  * passes to PID 1 ends up all the way in the console login shown. */
1751
1752                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1753                         term = getenv("TERM");
1754                 if (!term)
1755                         term = default_term_for_tty(tty_path);
1756
1757                 x = strappend("TERM=", term);
1758                 if (!x)
1759                         return -ENOMEM;
1760                 our_env[n_env++] = x;
1761         }
1762
1763         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1764                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1765                         return -ENOMEM;
1766
1767                 our_env[n_env++] = x;
1768         }
1769
1770         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1771                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1772                 const char *n;
1773
1774                 if (!p->prefix[t])
1775                         continue;
1776
1777                 if (strv_isempty(c->directories[t].paths))
1778                         continue;
1779
1780                 n = exec_directory_env_name_to_string(t);
1781                 if (!n)
1782                         continue;
1783
1784                 pre = strjoin(p->prefix[t], "/");
1785                 if (!pre)
1786                         return -ENOMEM;
1787
1788                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1789                 if (!joined)
1790                         return -ENOMEM;
1791
1792                 x = strjoin(n, "=", joined);
1793                 if (!x)
1794                         return -ENOMEM;
1795
1796                 our_env[n_env++] = x;
1797         }
1798
1799         our_env[n_env++] = NULL;
1800         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1801
1802         *ret = TAKE_PTR(our_env);
1803
1804         return 0;
1805 }
1806
1807 static int build_pass_environment(const ExecContext *c, char ***ret) {
1808         _cleanup_strv_free_ char **pass_env = NULL;
1809         size_t n_env = 0, n_bufsize = 0;
1810         char **i;
1811
1812         STRV_FOREACH(i, c->pass_environment) {
1813                 _cleanup_free_ char *x = NULL;
1814                 char *v;
1815
1816                 v = getenv(*i);
1817                 if (!v)
1818                         continue;
1819                 x = strjoin(*i, "=", v);
1820                 if (!x)
1821                         return -ENOMEM;
1822
1823                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1824                         return -ENOMEM;
1825
1826                 pass_env[n_env++] = TAKE_PTR(x);
1827                 pass_env[n_env] = NULL;
1828         }
1829
1830         *ret = TAKE_PTR(pass_env);
1831
1832         return 0;
1833 }
1834
1835 static bool exec_needs_mount_namespace(
1836                 const ExecContext *context,
1837                 const ExecParameters *params,
1838                 const ExecRuntime *runtime) {
1839
1840         assert(context);
1841         assert(params);
1842
1843         if (context->root_image)
1844                 return true;
1845
1846         if (!strv_isempty(context->read_write_paths) ||
1847             !strv_isempty(context->read_only_paths) ||
1848             !strv_isempty(context->inaccessible_paths))
1849                 return true;
1850
1851         if (context->n_bind_mounts > 0)
1852                 return true;
1853
1854         if (context->n_temporary_filesystems > 0)
1855                 return true;
1856
1857         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1858                 return true;
1859
1860         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1861                 return true;
1862
1863         if (context->private_devices ||
1864             context->private_mounts ||
1865             context->protect_system != PROTECT_SYSTEM_NO ||
1866             context->protect_home != PROTECT_HOME_NO ||
1867             context->protect_kernel_tunables ||
1868             context->protect_kernel_modules ||
1869             context->protect_control_groups)
1870                 return true;
1871
1872         if (context->root_directory) {
1873                 ExecDirectoryType t;
1874
1875                 if (context->mount_apivfs)
1876                         return true;
1877
1878                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1879                         if (!params->prefix[t])
1880                                 continue;
1881
1882                         if (!strv_isempty(context->directories[t].paths))
1883                                 return true;
1884                 }
1885         }
1886
1887         if (context->dynamic_user &&
1888             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1889              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1890              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1891                 return true;
1892
1893         return false;
1894 }
1895
1896 static int setup_private_users(uid_t uid, gid_t gid) {
1897         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1898         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1899         _cleanup_close_ int unshare_ready_fd = -1;
1900         _cleanup_(sigkill_waitp) pid_t pid = 0;
1901         uint64_t c = 1;
1902         ssize_t n;
1903         int r;
1904
1905         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1906          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1907          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1908          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1909          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1910          * continues execution normally. */
1911
1912         if (uid != 0 && uid_is_valid(uid)) {
1913                 r = asprintf(&uid_map,
1914                              "0 0 1\n"                      /* Map root → root */
1915                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1916                              uid, uid);
1917                 if (r < 0)
1918                         return -ENOMEM;
1919         } else {
1920                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1921                 if (!uid_map)
1922                         return -ENOMEM;
1923         }
1924
1925         if (gid != 0 && gid_is_valid(gid)) {
1926                 r = asprintf(&gid_map,
1927                              "0 0 1\n"                      /* Map root → root */
1928                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1929                              gid, gid);
1930                 if (r < 0)
1931                         return -ENOMEM;
1932         } else {
1933                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1934                 if (!gid_map)
1935                         return -ENOMEM;
1936         }
1937
1938         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1939          * namespace. */
1940         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1941         if (unshare_ready_fd < 0)
1942                 return -errno;
1943
1944         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1945          * failed. */
1946         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1947                 return -errno;
1948
1949         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1950         if (r < 0)
1951                 return r;
1952         if (r == 0) {
1953                 _cleanup_close_ int fd = -1;
1954                 const char *a;
1955                 pid_t ppid;
1956
1957                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1958                  * here, after the parent opened its own user namespace. */
1959
1960                 ppid = getppid();
1961                 errno_pipe[0] = safe_close(errno_pipe[0]);
1962
1963                 /* Wait until the parent unshared the user namespace */
1964                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1965                         r = -errno;
1966                         goto child_fail;
1967                 }
1968
1969                 /* Disable the setgroups() system call in the child user namespace, for good. */
1970                 a = procfs_file_alloca(ppid, "setgroups");
1971                 fd = open(a, O_WRONLY|O_CLOEXEC);
1972                 if (fd < 0) {
1973                         if (errno != ENOENT) {
1974                                 r = -errno;
1975                                 goto child_fail;
1976                         }
1977
1978                         /* If the file is missing the kernel is too old, let's continue anyway. */
1979                 } else {
1980                         if (write(fd, "deny\n", 5) < 0) {
1981                                 r = -errno;
1982                                 goto child_fail;
1983                         }
1984
1985                         fd = safe_close(fd);
1986                 }
1987
1988                 /* First write the GID map */
1989                 a = procfs_file_alloca(ppid, "gid_map");
1990                 fd = open(a, O_WRONLY|O_CLOEXEC);
1991                 if (fd < 0) {
1992                         r = -errno;
1993                         goto child_fail;
1994                 }
1995                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1996                         r = -errno;
1997                         goto child_fail;
1998                 }
1999                 fd = safe_close(fd);
2000
2001                 /* The write the UID map */
2002                 a = procfs_file_alloca(ppid, "uid_map");
2003                 fd = open(a, O_WRONLY|O_CLOEXEC);
2004                 if (fd < 0) {
2005                         r = -errno;
2006                         goto child_fail;
2007                 }
2008                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2009                         r = -errno;
2010                         goto child_fail;
2011                 }
2012
2013                 _exit(EXIT_SUCCESS);
2014
2015         child_fail:
2016                 (void) write(errno_pipe[1], &r, sizeof(r));
2017                 _exit(EXIT_FAILURE);
2018         }
2019
2020         errno_pipe[1] = safe_close(errno_pipe[1]);
2021
2022         if (unshare(CLONE_NEWUSER) < 0)
2023                 return -errno;
2024
2025         /* Let the child know that the namespace is ready now */
2026         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2027                 return -errno;
2028
2029         /* Try to read an error code from the child */
2030         n = read(errno_pipe[0], &r, sizeof(r));
2031         if (n < 0)
2032                 return -errno;
2033         if (n == sizeof(r)) { /* an error code was sent to us */
2034                 if (r < 0)
2035                         return r;
2036                 return -EIO;
2037         }
2038         if (n != 0) /* on success we should have read 0 bytes */
2039                 return -EIO;
2040
2041         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2042         pid = 0;
2043         if (r < 0)
2044                 return r;
2045         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2046                 return -EIO;
2047
2048         return 0;
2049 }
2050
2051 static int setup_exec_directory(
2052                 const ExecContext *context,
2053                 const ExecParameters *params,
2054                 uid_t uid,
2055                 gid_t gid,
2056                 ExecDirectoryType type,
2057                 int *exit_status) {
2058
2059         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2060                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2061                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2062                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2063                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2064                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2065         };
2066         char **rt;
2067         int r;
2068
2069         assert(context);
2070         assert(params);
2071         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2072         assert(exit_status);
2073
2074         if (!params->prefix[type])
2075                 return 0;
2076
2077         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2078                 if (!uid_is_valid(uid))
2079                         uid = 0;
2080                 if (!gid_is_valid(gid))
2081                         gid = 0;
2082         }
2083
2084         STRV_FOREACH(rt, context->directories[type].paths) {
2085                 _cleanup_free_ char *p = NULL, *pp = NULL;
2086
2087                 p = path_join(params->prefix[type], *rt);
2088                 if (!p) {
2089                         r = -ENOMEM;
2090                         goto fail;
2091                 }
2092
2093                 r = mkdir_parents_label(p, 0755);
2094                 if (r < 0)
2095                         goto fail;
2096
2097                 if (context->dynamic_user &&
2098                     (!IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) ||
2099                      (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode != EXEC_PRESERVE_NO))) {
2100                         _cleanup_free_ char *private_root = NULL;
2101
2102                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2103                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2104                          * whose UID is later on reused. To lock this down we use the same trick used by container
2105                          * managers to prohibit host users to get access to files of the same UID in containers: we
2106                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2107                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2108                          * to make this directory permeable for the service itself.
2109                          *
2110                          * Specifically: for a service which wants a special directory "foo/" we first create a
2111                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2112                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2113                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2114                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2115                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2116                          * disabling the access boundary for the service and making sure it only gets access to the
2117                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2118                          *
2119                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2120                          * owned by the service itself.
2121                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2122                          * files or sockets with other services. */
2123
2124                         private_root = path_join(params->prefix[type], "private");
2125                         if (!private_root) {
2126                                 r = -ENOMEM;
2127                                 goto fail;
2128                         }
2129
2130                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2131                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2132                         if (r < 0)
2133                                 goto fail;
2134
2135                         pp = path_join(private_root, *rt);
2136                         if (!pp) {
2137                                 r = -ENOMEM;
2138                                 goto fail;
2139                         }
2140
2141                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2142                         r = mkdir_parents_label(pp, 0755);
2143                         if (r < 0)
2144                                 goto fail;
2145
2146                         if (is_dir(p, false) > 0 &&
2147                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2148
2149                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2150                                  * it over. Most likely the service has been upgraded from one that didn't use
2151                                  * DynamicUser=1, to one that does. */
2152
2153                                 if (rename(p, pp) < 0) {
2154                                         r = -errno;
2155                                         goto fail;
2156                                 }
2157                         } else {
2158                                 /* Otherwise, create the actual directory for the service */
2159
2160                                 r = mkdir_label(pp, context->directories[type].mode);
2161                                 if (r < 0 && r != -EEXIST)
2162                                         goto fail;
2163                         }
2164
2165                         /* And link it up from the original place */
2166                         r = symlink_idempotent(pp, p, true);
2167                         if (r < 0)
2168                                 goto fail;
2169
2170                 } else {
2171                         r = mkdir_label(p, context->directories[type].mode);
2172                         if (r < 0) {
2173                                 if (r != -EEXIST)
2174                                         goto fail;
2175
2176                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2177                                         struct stat st;
2178
2179                                         /* Don't change the owner/access mode of the configuration directory,
2180                                          * as in the common case it is not written to by a service, and shall
2181                                          * not be writable. */
2182
2183                                         if (stat(p, &st) < 0) {
2184                                                 r = -errno;
2185                                                 goto fail;
2186                                         }
2187
2188                                         /* Still complain if the access mode doesn't match */
2189                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2190                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2191                                                             "(File system: %o %sMode: %o)",
2192                                                             exec_directory_type_to_string(type), *rt,
2193                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2194
2195                                         continue;
2196                                 }
2197                         }
2198                 }
2199
2200                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2201                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2202                  * current UID/GID ownership.) */
2203                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2204                 if (r < 0)
2205                         goto fail;
2206
2207                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2208                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2209                  * assignments to exist.*/
2210                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2211                 if (r < 0)
2212                         goto fail;
2213         }
2214
2215         return 0;
2216
2217 fail:
2218         *exit_status = exit_status_table[type];
2219         return r;
2220 }
2221
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed.
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec label read from the command's binary is
 * applied, falling back to SMACK_DEFAULT_PROCESS_LABEL when none could be read.
 *
 * Returns 0 on success, a negative errno-style error on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (!context->smack_process_label) {
#ifdef SMACK_DEFAULT_PROCESS_LABEL
                _cleanup_free_ char *exec_label = NULL;

                /* -ENODATA and -EOPNOTSUPP are tolerated here, the default label is used then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
#endif
                return 0;
        }

        r = mac_smack_apply_pid(0, context->smack_process_label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2254
2255 static int compile_bind_mounts(
2256                 const ExecContext *context,
2257                 const ExecParameters *params,
2258                 BindMount **ret_bind_mounts,
2259                 size_t *ret_n_bind_mounts,
2260                 char ***ret_empty_directories) {
2261
2262         _cleanup_strv_free_ char **empty_directories = NULL;
2263         BindMount *bind_mounts;
2264         size_t n, h = 0, i;
2265         ExecDirectoryType t;
2266         int r;
2267
2268         assert(context);
2269         assert(params);
2270         assert(ret_bind_mounts);
2271         assert(ret_n_bind_mounts);
2272         assert(ret_empty_directories);
2273
2274         n = context->n_bind_mounts;
2275         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2276                 if (!params->prefix[t])
2277                         continue;
2278
2279                 n += strv_length(context->directories[t].paths);
2280         }
2281
2282         if (n <= 0) {
2283                 *ret_bind_mounts = NULL;
2284                 *ret_n_bind_mounts = 0;
2285                 *ret_empty_directories = NULL;
2286                 return 0;
2287         }
2288
2289         bind_mounts = new(BindMount, n);
2290         if (!bind_mounts)
2291                 return -ENOMEM;
2292
2293         for (i = 0; i < context->n_bind_mounts; i++) {
2294                 BindMount *item = context->bind_mounts + i;
2295                 char *s, *d;
2296
2297                 s = strdup(item->source);
2298                 if (!s) {
2299                         r = -ENOMEM;
2300                         goto finish;
2301                 }
2302
2303                 d = strdup(item->destination);
2304                 if (!d) {
2305                         free(s);
2306                         r = -ENOMEM;
2307                         goto finish;
2308                 }
2309
2310                 bind_mounts[h++] = (BindMount) {
2311                         .source = s,
2312                         .destination = d,
2313                         .read_only = item->read_only,
2314                         .recursive = item->recursive,
2315                         .ignore_enoent = item->ignore_enoent,
2316                 };
2317         }
2318
2319         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2320                 char **suffix;
2321
2322                 if (!params->prefix[t])
2323                         continue;
2324
2325                 if (strv_isempty(context->directories[t].paths))
2326                         continue;
2327
2328                 if (context->dynamic_user &&
2329                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2330                     !(context->root_directory || context->root_image)) {
2331                         char *private_root;
2332
2333                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2334                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2335                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2336
2337                         private_root = strjoin(params->prefix[t], "/private");
2338                         if (!private_root) {
2339                                 r = -ENOMEM;
2340                                 goto finish;
2341                         }
2342
2343                         r = strv_consume(&empty_directories, private_root);
2344                         if (r < 0)
2345                                 goto finish;
2346                 }
2347
2348                 STRV_FOREACH(suffix, context->directories[t].paths) {
2349                         char *s, *d;
2350
2351                         if (context->dynamic_user &&
2352                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2353                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2354                         else
2355                                 s = strjoin(params->prefix[t], "/", *suffix);
2356                         if (!s) {
2357                                 r = -ENOMEM;
2358                                 goto finish;
2359                         }
2360
2361                         if (context->dynamic_user &&
2362                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2363                             (context->root_directory || context->root_image))
2364                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2365                                  * directory is not created on the root directory. So, let's bind-mount the directory
2366                                  * on the 'non-private' place. */
2367                                 d = strjoin(params->prefix[t], "/", *suffix);
2368                         else
2369                                 d = strdup(s);
2370                         if (!d) {
2371                                 free(s);
2372                                 r = -ENOMEM;
2373                                 goto finish;
2374                         }
2375
2376                         bind_mounts[h++] = (BindMount) {
2377                                 .source = s,
2378                                 .destination = d,
2379                                 .read_only = false,
2380                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2381                                 .recursive = true,
2382                                 .ignore_enoent = false,
2383                         };
2384                 }
2385         }
2386
2387         assert(h == n);
2388
2389         *ret_bind_mounts = bind_mounts;
2390         *ret_n_bind_mounts = n;
2391         *ret_empty_directories = TAKE_PTR(empty_directories);
2392
2393         return (int) n;
2394
2395 finish:
2396         bind_mount_free_many(bind_mounts, h);
2397         return r;
2398 }
2399
2400 static int apply_mount_namespace(
2401                 const Unit *u,
2402                 const ExecCommand *command,
2403                 const ExecContext *context,
2404                 const ExecParameters *params,
2405                 const ExecRuntime *runtime,
2406                 char **error_path) {
2407
2408         _cleanup_strv_free_ char **empty_directories = NULL;
2409         char *tmp = NULL, *var = NULL;
2410         const char *root_dir = NULL, *root_image = NULL;
2411         NamespaceInfo ns_info;
2412         bool needs_sandboxing;
2413         BindMount *bind_mounts = NULL;
2414         size_t n_bind_mounts = 0;
2415         int r;
2416
2417         assert(context);
2418
2419         /* The runtime struct only contains the parent of the private /tmp,
2420          * which is non-accessible to world users. Inside of it there's a /tmp
2421          * that is sticky, and that's the one we want to use here. */
2422
2423         if (context->private_tmp && runtime) {
2424                 if (runtime->tmp_dir)
2425                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2426                 if (runtime->var_tmp_dir)
2427                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2428         }
2429
2430         if (params->flags & EXEC_APPLY_CHROOT) {
2431                 root_image = context->root_image;
2432
2433                 if (!root_image)
2434                         root_dir = context->root_directory;
2435         }
2436
2437         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2438         if (r < 0)
2439                 return r;
2440
2441         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2442         if (needs_sandboxing)
2443                 ns_info = (NamespaceInfo) {
2444                         .ignore_protect_paths = false,
2445                         .private_dev = context->private_devices,
2446                         .protect_control_groups = context->protect_control_groups,
2447                         .protect_kernel_tunables = context->protect_kernel_tunables,
2448                         .protect_kernel_modules = context->protect_kernel_modules,
2449                         .protect_hostname = context->protect_hostname,
2450                         .mount_apivfs = context->mount_apivfs,
2451                         .private_mounts = context->private_mounts,
2452                 };
2453         else if (!context->dynamic_user && root_dir)
2454                 /*
2455                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2456                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2457                  * fail if we are enable to apply the sandbox inside the mount namespace.
2458                  */
2459                 ns_info = (NamespaceInfo) {
2460                         .ignore_protect_paths = true,
2461                 };
2462         else
2463                 ns_info = (NamespaceInfo) {};
2464
2465         if (context->mount_flags == MS_SHARED)
2466                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2467
2468         r = setup_namespace(root_dir, root_image,
2469                             &ns_info, context->read_write_paths,
2470                             needs_sandboxing ? context->read_only_paths : NULL,
2471                             needs_sandboxing ? context->inaccessible_paths : NULL,
2472                             empty_directories,
2473                             bind_mounts,
2474                             n_bind_mounts,
2475                             context->temporary_filesystems,
2476                             context->n_temporary_filesystems,
2477                             tmp,
2478                             var,
2479                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2480                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2481                             context->mount_flags,
2482                             DISSECT_IMAGE_DISCARD_ON_LOOP,
2483                             error_path);
2484
2485         bind_mount_free_many(bind_mounts, n_bind_mounts);
2486
2487         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2488          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2489          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2490          * completely different execution environment. */
2491         if (r == -ENOANO) {
2492                 if (n_bind_mounts == 0 &&
2493                     context->n_temporary_filesystems == 0 &&
2494                     !root_dir && !root_image &&
2495                     !context->dynamic_user) {
2496                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2497                         return 0;
2498                 }
2499
2500                 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2501                                "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2502                                n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2503
2504                 return -EOPNOTSUPP;
2505         }
2506
2507         return r;
2508 }
2509
2510 static int apply_working_directory(
2511                 const ExecContext *context,
2512                 const ExecParameters *params,
2513                 const char *home,
2514                 const bool needs_mount_ns,
2515                 int *exit_status) {
2516
2517         const char *d, *wd;
2518
2519         assert(context);
2520         assert(exit_status);
2521
2522         if (context->working_directory_home) {
2523
2524                 if (!home) {
2525                         *exit_status = EXIT_CHDIR;
2526                         return -ENXIO;
2527                 }
2528
2529                 wd = home;
2530
2531         } else if (context->working_directory)
2532                 wd = context->working_directory;
2533         else
2534                 wd = "/";
2535
2536         if (params->flags & EXEC_APPLY_CHROOT) {
2537                 if (!needs_mount_ns && context->root_directory)
2538                         if (chroot(context->root_directory) < 0) {
2539                                 *exit_status = EXIT_CHROOT;
2540                                 return -errno;
2541                         }
2542
2543                 d = wd;
2544         } else
2545                 d = prefix_roota(context->root_directory, wd);
2546
2547         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2548                 *exit_status = EXIT_CHDIR;
2549                 return -errno;
2550         }
2551
2552         return 0;
2553 }
2554
2555 static int setup_keyring(
2556                 const Unit *u,
2557                 const ExecContext *context,
2558                 const ExecParameters *p,
2559                 uid_t uid, gid_t gid) {
2560
2561         key_serial_t keyring;
2562         int r = 0;
2563         uid_t saved_uid;
2564         gid_t saved_gid;
2565
2566         assert(u);
2567         assert(context);
2568         assert(p);
2569
2570         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2571          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2572          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2573          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2574          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2575          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2576
2577         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2578                 return 0;
2579
2580         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2581          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2582          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2583          * & group is just as nasty as acquiring a reference to the user keyring. */
2584
2585         saved_uid = getuid();
2586         saved_gid = getgid();
2587
2588         if (gid_is_valid(gid) && gid != saved_gid) {
2589                 if (setregid(gid, -1) < 0)
2590                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2591         }
2592
2593         if (uid_is_valid(uid) && uid != saved_uid) {
2594                 if (setreuid(uid, -1) < 0) {
2595                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2596                         goto out;
2597                 }
2598         }
2599
2600         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2601         if (keyring == -1) {
2602                 if (errno == ENOSYS)
2603                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2604                 else if (IN_SET(errno, EACCES, EPERM))
2605                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2606                 else if (errno == EDQUOT)
2607                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2608                 else
2609                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2610
2611                 goto out;
2612         }
2613
2614         /* When requested link the user keyring into the session keyring. */
2615         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2616
2617                 if (keyctl(KEYCTL_LINK,
2618                            KEY_SPEC_USER_KEYRING,
2619                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2620                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2621                         goto out;
2622                 }
2623         }
2624
2625         /* Restore uid/gid back */
2626         if (uid_is_valid(uid) && uid != saved_uid) {
2627                 if (setreuid(saved_uid, -1) < 0) {
2628                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2629                         goto out;
2630                 }
2631         }
2632
2633         if (gid_is_valid(gid) && gid != saved_gid) {
2634                 if (setregid(saved_gid, -1) < 0)
2635                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2636         }
2637
2638         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2639         if (!sd_id128_is_null(u->invocation_id)) {
2640                 key_serial_t key;
2641
2642                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2643                 if (key == -1)
2644                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2645                 else {
2646                         if (keyctl(KEYCTL_SETPERM, key,
2647                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2648                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2649                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2650                 }
2651         }
2652
2653 out:
2654         /* Revert back uid & gid for the the last time, and exit */
2655         /* no extra logging, as only the first already reported error matters */
2656         if (getuid() != saved_uid)
2657                 (void) setreuid(saved_uid, -1);
2658
2659         if (getgid() != saved_gid)
2660                 (void) setregid(saved_gid, -1);
2661
2662         return r;
2663 }
2664
2665 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
2666         assert(array);
2667         assert(n);
2668
2669         if (!pair)
2670                 return;
2671
2672         if (pair[0] >= 0)
2673                 array[(*n)++] = pair[0];
2674         if (pair[1] >= 0)
2675                 array[(*n)++] = pair[1];
2676 }
2677
2678 static int close_remaining_fds(
2679                 const ExecParameters *params,
2680                 const ExecRuntime *runtime,
2681                 const DynamicCreds *dcreds,
2682                 int user_lookup_fd,
2683                 int socket_fd,
2684                 int exec_fd,
2685                 int *fds, size_t n_fds) {
2686
2687         size_t n_dont_close = 0;
2688         int dont_close[n_fds + 12];
2689
2690         assert(params);
2691
2692         if (params->stdin_fd >= 0)
2693                 dont_close[n_dont_close++] = params->stdin_fd;
2694         if (params->stdout_fd >= 0)
2695                 dont_close[n_dont_close++] = params->stdout_fd;
2696         if (params->stderr_fd >= 0)
2697                 dont_close[n_dont_close++] = params->stderr_fd;
2698
2699         if (socket_fd >= 0)
2700                 dont_close[n_dont_close++] = socket_fd;
2701         if (exec_fd >= 0)
2702                 dont_close[n_dont_close++] = exec_fd;
2703         if (n_fds > 0) {
2704                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2705                 n_dont_close += n_fds;
2706         }
2707
2708         if (runtime)
2709                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2710
2711         if (dcreds) {
2712                 if (dcreds->user)
2713                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2714                 if (dcreds->group)
2715                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2716         }
2717
2718         if (user_lookup_fd >= 0)
2719                 dont_close[n_dont_close++] = user_lookup_fd;
2720
2721         return close_all_fds(dont_close, n_dont_close);
2722 }
2723
2724 static int send_user_lookup(
2725                 Unit *unit,
2726                 int user_lookup_fd,
2727                 uid_t uid,
2728                 gid_t gid) {
2729
2730         assert(unit);
2731
2732         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2733          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2734          * specified. */
2735
2736         if (user_lookup_fd < 0)
2737                 return 0;
2738
2739         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2740                 return 0;
2741
2742         if (writev(user_lookup_fd,
2743                (struct iovec[]) {
2744                            IOVEC_INIT(&uid, sizeof(uid)),
2745                            IOVEC_INIT(&gid, sizeof(gid)),
2746                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2747                 return -errno;
2748
2749         return 0;
2750 }
2751
2752 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2753         int r;
2754
2755         assert(c);
2756         assert(home);
2757         assert(buf);
2758
2759         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2760
2761         if (*home)
2762                 return 0;
2763
2764         if (!c->working_directory_home)
2765                 return 0;
2766
2767         r = get_home_dir(buf);
2768         if (r < 0)
2769                 return r;
2770
2771         *home = *buf;
2772         return 1;
2773 }
2774
2775 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2776         _cleanup_strv_free_ char ** list = NULL;
2777         ExecDirectoryType t;
2778         int r;
2779
2780         assert(c);
2781         assert(p);
2782         assert(ret);
2783
2784         assert(c->dynamic_user);
2785
2786         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2787          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2788          * directories. */
2789
2790         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2791                 char **i;
2792
2793                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2794                         continue;
2795
2796                 if (!p->prefix[t])
2797                         continue;
2798
2799                 STRV_FOREACH(i, c->directories[t].paths) {
2800                         char *e;
2801
2802                         if (t == EXEC_DIRECTORY_RUNTIME)
2803                                 e = strjoin(p->prefix[t], "/", *i);
2804                         else
2805                                 e = strjoin(p->prefix[t], "/private/", *i);
2806                         if (!e)
2807                                 return -ENOMEM;
2808
2809                         r = strv_consume(&list, e);
2810                         if (r < 0)
2811                                 return r;
2812                 }
2813         }
2814
2815         *ret = TAKE_PTR(list);
2816
2817         return 0;
2818 }
2819
2820 static char *exec_command_line(char **argv);
2821
2822 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2823         bool using_subcgroup;
2824         char *p;
2825
2826         assert(params);
2827         assert(ret);
2828
2829         if (!params->cgroup_path)
2830                 return -EINVAL;
2831
2832         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2833          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2834          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2835          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2836          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2837          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2838          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2839          * flag, which is only passed for the former statements, not for the latter. */
2840
2841         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2842         if (using_subcgroup)
2843                 p = strjoin(params->cgroup_path, "/.control");
2844         else
2845                 p = strdup(params->cgroup_path);
2846         if (!p)
2847                 return -ENOMEM;
2848
2849         *ret = p;
2850         return using_subcgroup;
2851 }
2852
2853 static int exec_child(
2854                 Unit *unit,
2855                 const ExecCommand *command,
2856                 const ExecContext *context,
2857                 const ExecParameters *params,
2858                 ExecRuntime *runtime,
2859                 DynamicCreds *dcreds,
2860                 int socket_fd,
2861                 int named_iofds[3],
2862                 int *fds,
2863                 size_t n_socket_fds,
2864                 size_t n_storage_fds,
2865                 char **files_env,
2866                 int user_lookup_fd,
2867                 int *exit_status) {
2868
2869         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
2870         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2871         _cleanup_free_ gid_t *supplementary_gids = NULL;
2872         const char *username = NULL, *groupname = NULL;
2873         _cleanup_free_ char *home_buffer = NULL;
2874         const char *home = NULL, *shell = NULL;
2875         char **final_argv = NULL;
2876         dev_t journal_stream_dev = 0;
2877         ino_t journal_stream_ino = 0;
2878         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2879                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2880                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2881                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2882 #if HAVE_SELINUX
2883         _cleanup_free_ char *mac_selinux_context_net = NULL;
2884         bool use_selinux = false;
2885 #endif
2886 #if ENABLE_SMACK
2887         bool use_smack = false;
2888 #endif
2889 #if HAVE_APPARMOR
2890         bool use_apparmor = false;
2891 #endif
2892         uid_t uid = UID_INVALID;
2893         gid_t gid = GID_INVALID;
2894         size_t n_fds;
2895         ExecDirectoryType dt;
2896         int secure_bits;
2897
2898         assert(unit);
2899         assert(command);
2900         assert(context);
2901         assert(params);
2902         assert(exit_status);
2903
2904         rename_process_from_path(command->path);
2905
2906         /* We reset exactly these signals, since they are the
2907          * only ones we set to SIG_IGN in the main daemon. All
2908          * others we leave untouched because we set them to
2909          * SIG_DFL or a valid handler initially, both of which
2910          * will be demoted to SIG_DFL. */
2911         (void) default_signals(SIGNALS_CRASH_HANDLER,
2912                                SIGNALS_IGNORE, -1);
2913
2914         if (context->ignore_sigpipe)
2915                 (void) ignore_signals(SIGPIPE, -1);
2916
2917         r = reset_signal_mask();
2918         if (r < 0) {
2919                 *exit_status = EXIT_SIGNAL_MASK;
2920                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2921         }
2922
2923         if (params->idle_pipe)
2924                 do_idle_pipe_dance(params->idle_pipe);
2925
2926         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2927          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2928          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2929          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2930
2931         log_forget_fds();
2932         log_set_open_when_needed(true);
2933
2934         /* In case anything used libc syslog(), close this here, too */
2935         closelog();
2936
2937         n_fds = n_socket_fds + n_storage_fds;
2938         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2939         if (r < 0) {
2940                 *exit_status = EXIT_FDS;
2941                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2942         }
2943
2944         if (!context->same_pgrp)
2945                 if (setsid() < 0) {
2946                         *exit_status = EXIT_SETSID;
2947                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2948                 }
2949
2950         exec_context_tty_reset(context, params);
2951
2952         if (unit_shall_confirm_spawn(unit)) {
2953                 const char *vc = params->confirm_spawn;
2954                 _cleanup_free_ char *cmdline = NULL;
2955
2956                 cmdline = exec_command_line(command->argv);
2957                 if (!cmdline) {
2958                         *exit_status = EXIT_MEMORY;
2959                         return log_oom();
2960                 }
2961
2962                 r = ask_for_confirmation(vc, unit, cmdline);
2963                 if (r != CONFIRM_EXECUTE) {
2964                         if (r == CONFIRM_PRETEND_SUCCESS) {
2965                                 *exit_status = EXIT_SUCCESS;
2966                                 return 0;
2967                         }
2968                         *exit_status = EXIT_CONFIRM;
2969                         log_unit_error(unit, "Execution cancelled by the user");
2970                         return -ECANCELED;
2971                 }
2972         }
2973
2974         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2975          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2976          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2977          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2978          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2979         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2980             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2981                 *exit_status = EXIT_MEMORY;
2982                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2983         }
2984
2985         if (context->dynamic_user && dcreds) {
2986                 _cleanup_strv_free_ char **suggested_paths = NULL;
2987
2988                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2989                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2990                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2991                         *exit_status = EXIT_USER;
2992                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2993                 }
2994
2995                 r = compile_suggested_paths(context, params, &suggested_paths);
2996                 if (r < 0) {
2997                         *exit_status = EXIT_MEMORY;
2998                         return log_oom();
2999                 }
3000
3001                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3002                 if (r < 0) {
3003                         *exit_status = EXIT_USER;
3004                         if (r == -EILSEQ) {
3005                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3006                                 return -EOPNOTSUPP;
3007                         }
3008                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3009                 }
3010
3011                 if (!uid_is_valid(uid)) {
3012                         *exit_status = EXIT_USER;
3013                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3014                         return -ESRCH;
3015                 }
3016
3017                 if (!gid_is_valid(gid)) {
3018                         *exit_status = EXIT_USER;
3019                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3020                         return -ESRCH;
3021                 }
3022
3023                 if (dcreds->user)
3024                         username = dcreds->user->name;
3025
3026         } else {
3027                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3028                 if (r < 0) {
3029                         *exit_status = EXIT_USER;
3030                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3031                 }
3032
3033                 r = get_fixed_group(context, &groupname, &gid);
3034                 if (r < 0) {
3035                         *exit_status = EXIT_GROUP;
3036                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3037                 }
3038         }
3039
3040         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3041         r = get_supplementary_groups(context, username, groupname, gid,
3042                                      &supplementary_gids, &ngids);
3043         if (r < 0) {
3044                 *exit_status = EXIT_GROUP;
3045                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3046         }
3047
3048         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3049         if (r < 0) {
3050                 *exit_status = EXIT_USER;
3051                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3052         }
3053
3054         user_lookup_fd = safe_close(user_lookup_fd);
3055
3056         r = acquire_home(context, uid, &home, &home_buffer);
3057         if (r < 0) {
3058                 *exit_status = EXIT_CHDIR;
3059                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3060         }
3061
3062         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3063          * must sure to drop O_NONBLOCK */
3064         if (socket_fd >= 0)
3065                 (void) fd_nonblock(socket_fd, false);
3066
3067         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3068          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3069         if (params->cgroup_path) {
3070                 _cleanup_free_ char *p = NULL;
3071
3072                 r = exec_parameters_get_cgroup_path(params, &p);
3073                 if (r < 0) {
3074                         *exit_status = EXIT_CGROUP;
3075                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3076                 }
3077
3078                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3079                 if (r < 0) {
3080                         *exit_status = EXIT_CGROUP;
3081                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3082                 }
3083         }
3084
3085         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3086                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3087                 if (r < 0) {
3088                         *exit_status = EXIT_NETWORK;
3089                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3090                 }
3091         }
3092
3093         r = setup_input(context, params, socket_fd, named_iofds);
3094         if (r < 0) {
3095                 *exit_status = EXIT_STDIN;
3096                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3097         }
3098
3099         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3100         if (r < 0) {
3101                 *exit_status = EXIT_STDOUT;
3102                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3103         }
3104
3105         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3106         if (r < 0) {
3107                 *exit_status = EXIT_STDERR;
3108                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3109         }
3110
3111         if (context->oom_score_adjust_set) {
3112                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3113                  * prohibit write access to this file, and we shouldn't trip up over that. */
3114                 r = set_oom_score_adjust(context->oom_score_adjust);
3115                 if (IN_SET(r, -EPERM, -EACCES))
3116                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3117                 else if (r < 0) {
3118                         *exit_status = EXIT_OOM_ADJUST;
3119                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3120                 }
3121         }
3122
3123         if (context->nice_set)
3124                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
3125                         *exit_status = EXIT_NICE;
3126                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
3127                 }
3128
3129         if (context->cpu_sched_set) {
3130                 struct sched_param param = {
3131                         .sched_priority = context->cpu_sched_priority,
3132                 };
3133
3134                 r = sched_setscheduler(0,
3135                                        context->cpu_sched_policy |
3136                                        (context->cpu_sched_reset_on_fork ?
3137                                         SCHED_RESET_ON_FORK : 0),
3138                                        &param);
3139                 if (r < 0) {
3140                         *exit_status = EXIT_SETSCHEDULER;
3141                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3142                 }
3143         }
3144
3145         if (context->cpu_set.set)
3146                 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
3147                         *exit_status = EXIT_CPUAFFINITY;
3148                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3149                 }
3150
3151         if (context->ioprio_set)
3152                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3153                         *exit_status = EXIT_IOPRIO;
3154                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3155                 }
3156
3157         if (context->timer_slack_nsec != NSEC_INFINITY)
3158                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3159                         *exit_status = EXIT_TIMERSLACK;
3160                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3161                 }
3162
3163         if (context->personality != PERSONALITY_INVALID) {
3164                 r = safe_personality(context->personality);
3165                 if (r < 0) {
3166                         *exit_status = EXIT_PERSONALITY;
3167                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3168                 }
3169         }
3170
3171         if (context->utmp_id)
3172                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3173                                       context->tty_path,
3174                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3175                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3176                                       USER_PROCESS,
3177                                       username);
3178
3179         if (uid_is_valid(uid)) {
3180                 r = chown_terminal(STDIN_FILENO, uid);
3181                 if (r < 0) {
3182                         *exit_status = EXIT_STDIN;
3183                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3184                 }
3185         }
3186
3187         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3188          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3189          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3190          * touch a single hierarchy too. */
3191         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3192                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3193                 if (r < 0) {
3194                         *exit_status = EXIT_CGROUP;
3195                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3196                 }
3197         }
3198
3199         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3200                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3201                 if (r < 0)
3202                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3203         }
3204
3205         r = build_environment(
3206                         unit,
3207                         context,
3208                         params,
3209                         n_fds,
3210                         home,
3211                         username,
3212                         shell,
3213                         journal_stream_dev,
3214                         journal_stream_ino,
3215                         &our_env);
3216         if (r < 0) {
3217                 *exit_status = EXIT_MEMORY;
3218                 return log_oom();
3219         }
3220
3221         r = build_pass_environment(context, &pass_env);
3222         if (r < 0) {
3223                 *exit_status = EXIT_MEMORY;
3224                 return log_oom();
3225         }
3226
3227         accum_env = strv_env_merge(5,
3228                                    params->environment,
3229                                    our_env,
3230                                    pass_env,
3231                                    context->environment,
3232                                    files_env,
3233                                    NULL);
3234         if (!accum_env) {
3235                 *exit_status = EXIT_MEMORY;
3236                 return log_oom();
3237         }
3238         accum_env = strv_env_clean(accum_env);
3239
3240         (void) umask(context->umask);
3241
3242         r = setup_keyring(unit, context, params, uid, gid);
3243         if (r < 0) {
3244                 *exit_status = EXIT_KEYRING;
3245                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3246         }
3247
3248         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3249         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3250
3251         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3252         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3253
3254         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3255         if (needs_ambient_hack)
3256                 needs_setuid = false;
3257         else
3258                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3259
3260         if (needs_sandboxing) {
3261                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3262                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3263                  * impacting our own code paths. */
3264
3265 #if HAVE_SELINUX
3266                 use_selinux = mac_selinux_use();
3267 #endif
3268 #if ENABLE_SMACK
3269                 use_smack = mac_smack_use();
3270 #endif
3271 #if HAVE_APPARMOR
3272                 use_apparmor = mac_apparmor_use();
3273 #endif
3274         }
3275
3276         if (needs_sandboxing) {
3277                 int which_failed;
3278
3279                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3280                  * is set here. (See below.) */
3281
3282                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3283                 if (r < 0) {
3284                         *exit_status = EXIT_LIMITS;
3285                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3286                 }
3287         }
3288
3289         if (needs_setuid) {
3290
3291                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3292                  * wins here. (See above.) */
3293
3294                 if (context->pam_name && username) {
3295                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3296                         if (r < 0) {
3297                                 *exit_status = EXIT_PAM;
3298                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3299                         }
3300                 }
3301         }
3302
3303         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3304
3305                 if (ns_type_supported(NAMESPACE_NET)) {
3306                         r = setup_netns(runtime->netns_storage_socket);
3307                         if (r < 0) {
3308                                 *exit_status = EXIT_NETWORK;
3309                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3310                         }
3311                 } else if (context->network_namespace_path) {
3312                         *exit_status = EXIT_NETWORK;
3313                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
3314                 } else
3315                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3316         }
3317
3318         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3319         if (needs_mount_namespace) {
3320                 _cleanup_free_ char *error_path = NULL;
3321
3322                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3323                 if (r < 0) {
3324                         *exit_status = EXIT_NAMESPACE;
3325                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3326                                                     error_path ? ": " : "", strempty(error_path));
3327                 }
3328         }
3329
3330         if (context->protect_hostname) {
3331                 if (ns_type_supported(NAMESPACE_UTS)) {
3332                         if (unshare(CLONE_NEWUTS) < 0) {
3333                                 *exit_status = EXIT_NAMESPACE;
3334                                 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3335                         }
3336                 } else
3337                         log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3338 #if HAVE_SECCOMP
3339                 r = seccomp_protect_hostname();
3340                 if (r < 0) {
3341                         *exit_status = EXIT_SECCOMP;
3342                         return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3343                 }
3344 #endif
3345         }
3346
3347         /* Drop groups as early as possible */
3348         if (needs_setuid) {
3349                 r = enforce_groups(gid, supplementary_gids, ngids);
3350                 if (r < 0) {
3351                         *exit_status = EXIT_GROUP;
3352                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3353                 }
3354         }
3355
3356         if (needs_sandboxing) {
3357 #if HAVE_SELINUX
3358                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3359                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3360                         if (r < 0) {
3361                                 *exit_status = EXIT_SELINUX_CONTEXT;
3362                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3363                         }
3364                 }
3365 #endif
3366
3367                 if (context->private_users) {
3368                         r = setup_private_users(uid, gid);
3369                         if (r < 0) {
3370                                 *exit_status = EXIT_USER;
3371                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3372                         }
3373                 }
3374         }
3375
3376         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3377          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3378          * however if we have it as we want to keep it open until the final execve(). */
3379
3380         if (params->exec_fd >= 0) {
3381                 exec_fd = params->exec_fd;
3382
3383                 if (exec_fd < 3 + (int) n_fds) {
3384                         int moved_fd;
3385
3386                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3387                          * process we are about to execute. */
3388
3389                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3390                         if (moved_fd < 0) {
3391                                 *exit_status = EXIT_FDS;
3392                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3393                         }
3394
3395                         safe_close(exec_fd);
3396                         exec_fd = moved_fd;
3397                 } else {
3398                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3399                         r = fd_cloexec(exec_fd, true);
3400                         if (r < 0) {
3401                                 *exit_status = EXIT_FDS;
3402                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3403                         }
3404                 }
3405
3406                 fds_with_exec_fd = newa(int, n_fds + 1);
3407                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3408                 fds_with_exec_fd[n_fds] = exec_fd;
3409                 n_fds_with_exec_fd = n_fds + 1;
3410         } else {
3411                 fds_with_exec_fd = fds;
3412                 n_fds_with_exec_fd = n_fds;
3413         }
3414
3415         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3416         if (r >= 0)
3417                 r = shift_fds(fds, n_fds);
3418         if (r >= 0)
3419                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3420         if (r < 0) {
3421                 *exit_status = EXIT_FDS;
3422                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3423         }
3424
3425         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3426          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3427          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3428          * came this far. */
3429
3430         secure_bits = context->secure_bits;
3431
3432         if (needs_sandboxing) {
3433                 uint64_t bset;
3434
3435                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3436                  * requested. (Note this is placed after the general resource limit initialization, see
3437                  * above, in order to take precedence.) */
3438                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3439                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3440                                 *exit_status = EXIT_LIMITS;
3441                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3442                         }
3443                 }
3444
3445 #if ENABLE_SMACK
3446                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3447                  * process. This is the latest place before dropping capabilities. Other MAC contexts are set later. */
3448                 if (use_smack) {
3449                         r = setup_smack(context, command);
3450                         if (r < 0) {
3451                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3452                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3453                         }
3454                 }
3455 #endif
3456
3457                 bset = context->capability_bounding_set;
3458                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3459                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3460                  * instead of us doing that */
3461                 if (needs_ambient_hack)
3462                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3463                                 (UINT64_C(1) << CAP_SETUID) |
3464                                 (UINT64_C(1) << CAP_SETGID);
3465
3466                 if (!cap_test_all(bset)) {
3467                         r = capability_bounding_set_drop(bset, false);
3468                         if (r < 0) {
3469                                 *exit_status = EXIT_CAPABILITIES;
3470                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3471                         }
3472                 }
3473
3474                 /* This is done before enforce_user, but ambient set
3475                  * does not survive over setresuid() if keep_caps is not set. */
3476                 if (!needs_ambient_hack &&
3477                     context->capability_ambient_set != 0) {
3478                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3479                         if (r < 0) {
3480                                 *exit_status = EXIT_CAPABILITIES;
3481                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3482                         }
3483                 }
3484         }
3485
3486         if (needs_setuid) {
3487                 if (uid_is_valid(uid)) {
3488                         r = enforce_user(context, uid);
3489                         if (r < 0) {
3490                                 *exit_status = EXIT_USER;
3491                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3492                         }
3493
3494                         if (!needs_ambient_hack &&
3495                             context->capability_ambient_set != 0) {
3496
3497                                 /* Fix the ambient capabilities after user change. */
3498                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3499                                 if (r < 0) {
3500                                         *exit_status = EXIT_CAPABILITIES;
3501                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3502                                 }
3503
3504                                 /* If we were asked to change user and ambient capabilities
3505                                  * were requested, we had to add keep-caps to the securebits
3506                                  * so that we would maintain the inherited capability set
3507                                  * through the setresuid(). Make sure that the bit is added
3508                                  * also to the context secure_bits so that we don't try to
3509                                  * drop the bit away next. */
3510
3511                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3512                         }
3513                 }
3514         }
3515
3516         /* Apply working directory here, because the working directory might be on NFS and only the user running
3517          * this service might have the correct privilege to change to the working directory */
3518         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3519         if (r < 0)
3520                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3521
3522         if (needs_sandboxing) {
3523                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3524                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3525                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3526                  * are restricted. */
3527
3528 #if HAVE_SELINUX
3529                 if (use_selinux) {
3530                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3531
3532                         if (exec_context) {
3533                                 r = setexeccon(exec_context);
3534                                 if (r < 0) {
3535                                         *exit_status = EXIT_SELINUX_CONTEXT;
3536                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3537                                 }
3538                         }
3539                 }
3540 #endif
3541
3542 #if HAVE_APPARMOR
3543                 if (use_apparmor && context->apparmor_profile) {
3544                         r = aa_change_onexec(context->apparmor_profile);
3545                         if (r < 0 && !context->apparmor_profile_ignore) {
3546                                 *exit_status = EXIT_APPARMOR_PROFILE;
3547                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3548                         }
3549                 }
3550 #endif
3551
3552                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3553                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3554                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3555                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3556                                 *exit_status = EXIT_SECUREBITS;
3557                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3558                         }
3559
3560                 if (context_has_no_new_privileges(context))
3561                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3562                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3563                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3564                         }
3565
3566 #if HAVE_SECCOMP
3567                 r = apply_address_families(unit, context);
3568                 if (r < 0) {
3569                         *exit_status = EXIT_ADDRESS_FAMILIES;
3570                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3571                 }
3572
3573                 r = apply_memory_deny_write_execute(unit, context);
3574                 if (r < 0) {
3575                         *exit_status = EXIT_SECCOMP;
3576                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3577                 }
3578
3579                 r = apply_restrict_realtime(unit, context);
3580                 if (r < 0) {
3581                         *exit_status = EXIT_SECCOMP;
3582                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3583                 }
3584
3585                 r = apply_restrict_suid_sgid(unit, context);
3586                 if (r < 0) {
3587                         *exit_status = EXIT_SECCOMP;
3588                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3589                 }
3590
3591                 r = apply_restrict_namespaces(unit, context);
3592                 if (r < 0) {
3593                         *exit_status = EXIT_SECCOMP;
3594                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3595                 }
3596
3597                 r = apply_protect_sysctl(unit, context);
3598                 if (r < 0) {
3599                         *exit_status = EXIT_SECCOMP;
3600                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3601                 }
3602
3603                 r = apply_protect_kernel_modules(unit, context);
3604                 if (r < 0) {
3605                         *exit_status = EXIT_SECCOMP;
3606                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3607                 }
3608
3609                 r = apply_private_devices(unit, context);
3610                 if (r < 0) {
3611                         *exit_status = EXIT_SECCOMP;
3612                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3613                 }
3614
3615                 r = apply_syscall_archs(unit, context);
3616                 if (r < 0) {
3617                         *exit_status = EXIT_SECCOMP;
3618                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3619                 }
3620
3621                 r = apply_lock_personality(unit, context);
3622                 if (r < 0) {
3623                         *exit_status = EXIT_SECCOMP;
3624                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3625                 }
3626
3627                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3628                  * by the filter as little as possible. */
3629                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3630                 if (r < 0) {
3631                         *exit_status = EXIT_SECCOMP;
3632                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3633                 }
3634 #endif
3635         }
3636
3637         if (!strv_isempty(context->unset_environment)) {
3638                 char **ee = NULL;
3639
3640                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3641                 if (!ee) {
3642                         *exit_status = EXIT_MEMORY;
3643                         return log_oom();
3644                 }
3645
3646                 strv_free_and_replace(accum_env, ee);
3647         }
3648
3649         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3650                 replaced_argv = replace_env_argv(command->argv, accum_env);
3651                 if (!replaced_argv) {
3652                         *exit_status = EXIT_MEMORY;
3653                         return log_oom();
3654                 }
3655                 final_argv = replaced_argv;
3656         } else
3657                 final_argv = command->argv;
3658
3659         if (DEBUG_LOGGING) {
3660                 _cleanup_free_ char *line;
3661
3662                 line = exec_command_line(final_argv);
3663                 if (line)
3664                         log_struct(LOG_DEBUG,
3665                                    "EXECUTABLE=%s", command->path,
3666                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3667                                    LOG_UNIT_ID(unit),
3668                                    LOG_UNIT_INVOCATION_ID(unit));
3669         }
3670
3671         if (exec_fd >= 0) {
3672                 uint8_t hot = 1;
3673
3674                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3675                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3676
3677                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3678                         *exit_status = EXIT_EXEC;
3679                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3680                 }
3681         }
3682
3683         execve(command->path, final_argv, accum_env);
3684         r = -errno;
3685
3686         if (exec_fd >= 0) {
3687                 uint8_t hot = 0;
3688
3689                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3690                  * that POLLHUP on it no longer means execve() succeeded. */
3691
3692                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3693                         *exit_status = EXIT_EXEC;
3694                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3695                 }
3696         }
3697
3698         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3699                 log_struct_errno(LOG_INFO, r,
3700                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3701                                  LOG_UNIT_ID(unit),
3702                                  LOG_UNIT_INVOCATION_ID(unit),
3703                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3704                                                   command->path),
3705                                  "EXECUTABLE=%s", command->path);
3706                 return 0;
3707         }
3708
3709         *exit_status = EXIT_EXEC;
3710         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3711 }
3712
3713 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3714 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
     /* Forward declarations of the two helpers above: both are static, hence defined in this file, and
      * exec_spawn() calls them. NOTE(review): presumably their definitions only follow further down. */
3715
3716 int exec_spawn(Unit *unit,
3717                ExecCommand *command,
3718                const ExecContext *context,
3719                const ExecParameters *params,
3720                ExecRuntime *runtime,
3721                DynamicCreds *dcreds,
3722                pid_t *ret) {
             /* Forks off a child process that runs exec_child() for the given command of a unit.
              *
              * Before forking, the parent checks the socket setup, has named_iofds[] filled in, loads the
              * environment files and, if params->cgroup_path is set, may create a sub-cgroup for the child.
              *
              * Returns 0 on success, with the child's PID stored in *ret. On failure up to and including fork() it
              * returns -EINVAL (bad socket setup) or the value produced by log_unit_error_errno()/log_oom() for the
              * failing step. Whatever goes wrong inside the child is not visible in the return value: the child
              * logs it (if exec_child() returned < 0) and terminates via _exit(). */
3723
3724         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3725         _cleanup_free_ char *subcgroup_path = NULL;
3726         _cleanup_strv_free_ char **files_env = NULL;
3727         size_t n_storage_fds = 0, n_socket_fds = 0;
3728         _cleanup_free_ char *line = NULL;
3729         pid_t pid;
3730
3731         assert(unit);
3732         assert(command);
3733         assert(context);
3734         assert(ret);
3735         assert(params);
             /* A non-zero number of passed fds requires an fd array to go with it. */
3736         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3737
             /* If stdin, stdout or stderr is configured as EXEC_INPUT_SOCKET/EXEC_OUTPUT_SOCKET, exactly one socket
              * fd must have been passed. It is handed to the child as socket_fd; fds stays NULL and both counts
              * stay 0 in this case. */
3738         if (context->std_input == EXEC_INPUT_SOCKET ||
3739             context->std_output == EXEC_OUTPUT_SOCKET ||
3740             context->std_error == EXEC_OUTPUT_SOCKET) {
3741
3742                 if (params->n_socket_fds > 1) {
3743                         log_unit_error(unit, "Got more than one socket.");
3744                         return -EINVAL;
3745                 }
3746
3747                 if (params->n_socket_fds == 0) {
3748                         log_unit_error(unit, "Got no socket.");
3749                         return -EINVAL;
3750                 }
3751
3752                 socket_fd = params->fds[0];
3753         } else {
                     /* No socket-backed stdio: pass the whole fd array and its counts on to the child unchanged. */
3754                 socket_fd = -1;
3755                 fds = params->fds;
3756                 n_socket_fds = params->n_socket_fds;
3757                 n_storage_fds = params->n_storage_fds;
3758         }
3759
             /* named_iofds[] starts out as { -1, -1, -1 }; the helper is expected to fill it in from context and
              * params. */
3760         r = exec_context_named_iofds(context, params, named_iofds);
3761         if (r < 0)
3762                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3763
3764         r = exec_context_load_environment(unit, context, &files_env);
3765         if (r < 0)
3766                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3767
             /* Build a printable form of the argument vector and announce the command at debug level, still in the
              * parent and before anything is forked. */
3768         line = exec_command_line(command->argv);
3769         if (!line)
3770                 return log_oom();
3771
3772         log_struct(LOG_DEBUG,
3773                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3774                    "EXECUTABLE=%s", command->path,
3775                    LOG_UNIT_ID(unit),
3776                    LOG_UNIT_INVOCATION_ID(unit));
3777
             /* Only when a cgroup path is configured: ask for the sub-cgroup path, and create that cgroup if the
              * helper reports (r > 0) that a child cgroup is in use. */
3778         if (params->cgroup_path) {
3779                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3780                 if (r < 0)
3781                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3782                 if (r > 0) { /* We are using a child cgroup */
3783                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3784                         if (r < 0)
3785                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3786                 }
3787         }
3788
             /* From here on there are two processes: pid == 0 in the child, the child's PID in the parent. */
3789         pid = fork();
3790         if (pid < 0)
3791                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3792
3793         if (pid == 0) {
                     /* Child process. This branch never returns to the caller: it always ends in _exit(). exit_status
                      * starts out as EXIT_SUCCESS and is only changed through the pointer passed to exec_child(). */
3794                 int exit_status = EXIT_SUCCESS;
3795
3796                 r = exec_child(unit,
3797                                command,
3798                                context,
3799                                params,
3800                                runtime,
3801                                dcreds,
3802                                socket_fd,
3803                                named_iofds,
3804                                fds,
3805                                n_socket_fds,
3806                                n_storage_fds,
3807                                files_env,
3808                                unit->manager->user_lookup_fds[1],
3809                                &exit_status);
3810
                     /* Reaching this point means exec_child() returned. NOTE(review): presumably that only happens
                      * when it failed (r < 0) or deliberately skipped the command; confirm against exec_child(). */
3811                 if (r < 0)
3812                         log_struct_errno(LOG_ERR, r,
3813                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3814                                          LOG_UNIT_ID(unit),
3815                                          LOG_UNIT_INVOCATION_ID(unit),
3816                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3817                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3818                                                           command->path),
3819                                          "EXECUTABLE=%s", command->path);
3820
3821                 _exit(exit_status);
3822         }
3823
             /* Parent process from here on. */
3824         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3825
3826         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3827          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3828          * process will be killed too). */
             /* The result of cg_attach() is explicitly discarded, i.e. attaching in the parent is best effort. */
3829         if (subcgroup_path)
3830                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
3831
             /* Pass the new PID on to exec_status_start() for command->exec_status, then report it to the caller. */
3832         exec_status_start(&command->exec_status, pid);
3833
3834         *ret = pid;
3835         return 0;
3836 }
3837
3838 void exec_context_init(ExecContext *c) {
3839         ExecDirectoryType i;
3840
3841         assert(c);
3842
3843         c->umask = 0022;
3844         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3845         c->cpu_sched_policy = SCHED_OTHER;
3846         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3847         c->syslog_level_prefix = true;
3848         c->ignore_sigpipe = true;
3849         c->timer_slack_nsec = NSEC_INFINITY;
3850         c->personality = PERSONALITY_INVALID;
3851         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3852                 c->directories[i].mode = 0755;
3853         c->capability_bounding_set = CAP_ALL;
3854         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3855         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3856         c->log_level_max = -1;
3857 }
3858
3859 void exec_context_done(ExecContext *c) {
3860         ExecDirectoryType i;
3861         size_t l;
3862
3863         assert(c);
3864
3865         c->environment = strv_free(c->environment);
3866         c->environment_files = strv_free(c->environment_files);
3867         c->pass_environment = strv_free(c->pass_environment);
3868         c->unset_environment = strv_free(c->unset_environment);
3869
3870         rlimit_free_all(c->rlimit);
3871
3872         for (l = 0; l < 3; l++) {
3873                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3874                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3875         }
3876
3877         c->working_directory = mfree(c->working_directory);
3878         c->root_directory = mfree(c->root_directory);
3879         c->root_image = mfree(c->root_image);
3880         c->tty_path = mfree(c->tty_path);
3881         c->syslog_identifier = mfree(c->syslog_identifier);
3882         c->user = mfree(c->user);
3883         c->group = mfree(c->group);
3884
3885         c->supplementary_groups = strv_free(c->supplementary_groups);
3886
3887         c->pam_name = mfree(c->pam_name);
3888
3889         c->read_only_paths = strv_free(c->read_only_paths);
3890         c->read_write_paths = strv_free(c->read_write_paths);
3891         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3892
3893         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3894         c->bind_mounts = NULL;
3895         c->n_bind_mounts = 0;
3896         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3897         c->temporary_filesystems = NULL;
3898         c->n_temporary_filesystems = 0;
3899
3900         cpu_set_reset(&c->cpu_set);
3901
3902         c->utmp_id = mfree(c->utmp_id);
3903         c->selinux_context = mfree(c->selinux_context);
3904         c->apparmor_profile = mfree(c->apparmor_profile);
3905         c->smack_process_label = mfree(c->smack_process_label);
3906
3907         c->syscall_filter = hashmap_free(c->syscall_filter);
3908         c->syscall_archs = set_free(c->syscall_archs);
3909         c->address_families = set_free(c->address_families);
3910
3911         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3912                 c->directories[i].paths = strv_free(c->directories[i].paths);
3913
3914         c->log_level_max = -1;
3915
3916         exec_context_free_log_extra_fields(c);
3917
3918         c->log_rate_limit_interval_usec = 0;
3919         c->log_rate_limit_burst = 0;
3920
3921         c->stdin_data = mfree(c->stdin_data);
3922         c->stdin_data_size = 0;
3923
3924         c->network_namespace_path = mfree(c->network_namespace_path);
3925 }
3926
3927 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3928         char **i;
3929
3930         assert(c);
3931
3932         if (!runtime_prefix)
3933                 return 0;
3934
3935         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3936                 _cleanup_free_ char *p;
3937
3938                 p = path_join(runtime_prefix, *i);
3939                 if (!p)
3940                         return -ENOMEM;
3941
3942                 /* We execute this synchronously, since we need to be sure this is gone when we start the
3943                  * service next. */
3944                 (void) rm_rf(p, REMOVE_ROOT);
3945         }
3946
3947         return 0;
3948 }
3949
3950 static void exec_command_done(ExecCommand *c) {
3951         assert(c);
3952
3953         c->path = mfree(c->path);
3954         c->argv = strv_free(c->argv);
3955 }
3956
3957 void exec_command_done_array(ExecCommand *c, size_t n) {
3958         size_t i;
3959
3960         for (i = 0; i < n; i++)
3961                 exec_command_done(c+i);
3962 }
3963
3964 ExecCommand* exec_command_free_list(ExecCommand *c) {
3965         ExecCommand *i;
3966
3967         while ((i = c)) {
3968                 LIST_REMOVE(command, c, i);
3969                 exec_command_done(i);
3970                 free(i);
3971         }
3972
3973         return NULL;
3974 }
3975
3976 void exec_command_free_array(ExecCommand **c, size_t n) {
3977         size_t i;
3978
3979         for (i = 0; i < n; i++)
3980                 c[i] = exec_command_free_list(c[i]);
3981 }
3982
3983 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3984         size_t i;
3985
3986         for (i = 0; i < n; i++)
3987                 exec_status_reset(&c[i].exec_status);
3988 }
3989
3990 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3991         size_t i;
3992
3993         for (i = 0; i < n; i++) {
3994                 ExecCommand *z;
3995
3996                 LIST_FOREACH(command, z, c[i])
3997                         exec_status_reset(&z->exec_status);
3998         }
3999 }
4000
4001 typedef struct InvalidEnvInfo {
4002         const Unit *unit;
4003         const char *path;
4004 } InvalidEnvInfo;
4005
4006 static void invalid_env(const char *p, void *userdata) {
4007         InvalidEnvInfo *info = userdata;
4008
4009         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4010 }
4011
4012 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4013         assert(c);
4014
4015         switch (fd_index) {
4016
4017         case STDIN_FILENO:
4018                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4019                         return NULL;
4020
4021                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4022
4023         case STDOUT_FILENO:
4024                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4025                         return NULL;
4026
4027                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4028
4029         case STDERR_FILENO:
4030                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4031                         return NULL;
4032
4033                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4034
4035         default:
4036                 return NULL;
4037         }
4038 }
4039
4040 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]) {
4041         size_t i, targets;
4042         const char* stdio_fdname[3];
4043         size_t n_fds;
4044
4045         assert(c);
4046         assert(p);
4047
4048         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4049                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4050                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4051
4052         for (i = 0; i < 3; i++)
4053                 stdio_fdname[i] = exec_context_fdname(c, i);
4054
4055         n_fds = p->n_storage_fds + p->n_socket_fds;
4056
4057         for (i = 0; i < n_fds  && targets > 0; i++)
4058                 if (named_iofds[STDIN_FILENO] < 0 &&
4059                     c->std_input == EXEC_INPUT_NAMED_FD &&
4060                     stdio_fdname[STDIN_FILENO] &&
4061                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4062
4063                         named_iofds[STDIN_FILENO] = p->fds[i];
4064                         targets--;
4065
4066                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4067                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4068                            stdio_fdname[STDOUT_FILENO] &&
4069                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4070
4071                         named_iofds[STDOUT_FILENO] = p->fds[i];
4072                         targets--;
4073
4074                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4075                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4076                            stdio_fdname[STDERR_FILENO] &&
4077                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4078
4079                         named_iofds[STDERR_FILENO] = p->fds[i];
4080                         targets--;
4081                 }
4082
4083         return targets == 0 ? 0 : -ENOENT;
4084 }
4085
4086 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4087         char **i, **r = NULL;
4088
4089         assert(c);
4090         assert(l);
4091
4092         STRV_FOREACH(i, c->environment_files) {
4093                 char *fn;
4094                 int k;
4095                 unsigned n;
4096                 bool ignore = false;
4097                 char **p;
4098                 _cleanup_globfree_ glob_t pglob = {};
4099
4100                 fn = *i;
4101
4102                 if (fn[0] == '-') {
4103                         ignore = true;
4104                         fn++;
4105                 }
4106
4107                 if (!path_is_absolute(fn)) {
4108                         if (ignore)
4109                                 continue;
4110
4111                         strv_free(r);
4112                         return -EINVAL;
4113                 }
4114
4115                 /* Filename supports globbing, take all matching files */
4116                 k = safe_glob(fn, 0, &pglob);
4117                 if (k < 0) {
4118                         if (ignore)
4119                                 continue;
4120
4121                         strv_free(r);
4122                         return k;
4123                 }
4124
4125                 /* When we don't match anything, -ENOENT should be returned */
4126                 assert(pglob.gl_pathc > 0);
4127
4128                 for (n = 0; n < pglob.gl_pathc; n++) {
4129                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4130                         if (k < 0) {
4131                                 if (ignore)
4132                                         continue;
4133
4134                                 strv_free(r);
4135                                 return k;
4136                         }
4137                         /* Log invalid environment variables with filename */
4138                         if (p) {
4139                                 InvalidEnvInfo info = {
4140                                         .unit = unit,
4141                                         .path = pglob.gl_pathv[n]
4142                                 };
4143
4144                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4145                         }
4146
4147                         if (!r)
4148                                 r = p;
4149                         else {
4150                                 char **m;
4151
4152                                 m = strv_env_merge(2, r, p);
4153                                 strv_free(r);
4154                                 strv_free(p);
4155                                 if (!m)
4156                                         return -ENOMEM;
4157
4158                                 r = m;
4159                         }
4160                 }
4161         }
4162
4163         *l = r;
4164
4165         return 0;
4166 }
4167
4168 static bool tty_may_match_dev_console(const char *tty) {
4169         _cleanup_free_ char *resolved = NULL;
4170
4171         if (!tty)
4172                 return true;
4173
4174         tty = skip_dev_prefix(tty);
4175
4176         /* trivial identity? */
4177         if (streq(tty, "console"))
4178                 return true;
4179
4180         if (resolve_dev_console(&resolved) < 0)
4181                 return true; /* if we could not resolve, assume it may */
4182
4183         /* "tty0" means the active VC, so it may be the same sometimes */
4184         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4185 }
4186
4187 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4188         assert(ec);
4189
4190         return ec->tty_reset ||
4191                 ec->tty_vhangup ||
4192                 ec->tty_vt_disallocate ||
4193                 is_terminal_input(ec->std_input) ||
4194                 is_terminal_output(ec->std_output) ||
4195                 is_terminal_output(ec->std_error);
4196 }
4197
4198 bool exec_context_may_touch_console(const ExecContext *ec) {
4199
4200         return exec_context_may_touch_tty(ec) &&
4201                tty_may_match_dev_console(exec_context_tty_path(ec));
4202 }
4203
4204 static void strv_fprintf(FILE *f, char **l) {
4205         char **g;
4206
4207         assert(f);
4208
4209         STRV_FOREACH(g, l)
4210                 fprintf(f, " %s", *g);
4211 }
4212
4213 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4214         ExecDirectoryType dt;
4215         char **e, **d;
4216         unsigned i;
4217         int r;
4218
4219         assert(c);
4220         assert(f);
4221
4222         prefix = strempty(prefix);
4223
4224         fprintf(f,
4225                 "%sUMask: %04o\n"
4226                 "%sWorkingDirectory: %s\n"
4227                 "%sRootDirectory: %s\n"
4228                 "%sNonBlocking: %s\n"
4229                 "%sPrivateTmp: %s\n"
4230                 "%sPrivateDevices: %s\n"
4231                 "%sProtectKernelTunables: %s\n"
4232                 "%sProtectKernelModules: %s\n"
4233                 "%sProtectControlGroups: %s\n"
4234                 "%sPrivateNetwork: %s\n"
4235                 "%sPrivateUsers: %s\n"
4236                 "%sProtectHome: %s\n"
4237                 "%sProtectSystem: %s\n"
4238                 "%sMountAPIVFS: %s\n"
4239                 "%sIgnoreSIGPIPE: %s\n"
4240                 "%sMemoryDenyWriteExecute: %s\n"
4241                 "%sRestrictRealtime: %s\n"
4242                 "%sRestrictSUIDSGID: %s\n"
4243                 "%sKeyringMode: %s\n"
4244                 "%sProtectHostname: %s\n",
4245                 prefix, c->umask,
4246                 prefix, c->working_directory ? c->working_directory : "/",
4247                 prefix, c->root_directory ? c->root_directory : "/",
4248                 prefix, yes_no(c->non_blocking),
4249                 prefix, yes_no(c->private_tmp),
4250                 prefix, yes_no(c->private_devices),
4251                 prefix, yes_no(c->protect_kernel_tunables),
4252                 prefix, yes_no(c->protect_kernel_modules),
4253                 prefix, yes_no(c->protect_control_groups),
4254                 prefix, yes_no(c->private_network),
4255                 prefix, yes_no(c->private_users),
4256                 prefix, protect_home_to_string(c->protect_home),
4257                 prefix, protect_system_to_string(c->protect_system),
4258                 prefix, yes_no(c->mount_apivfs),
4259                 prefix, yes_no(c->ignore_sigpipe),
4260                 prefix, yes_no(c->memory_deny_write_execute),
4261                 prefix, yes_no(c->restrict_realtime),
4262                 prefix, yes_no(c->restrict_suid_sgid),
4263                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4264                 prefix, yes_no(c->protect_hostname));
4265
4266         if (c->root_image)
4267                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4268
4269         STRV_FOREACH(e, c->environment)
4270                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4271
4272         STRV_FOREACH(e, c->environment_files)
4273                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4274
4275         STRV_FOREACH(e, c->pass_environment)
4276                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4277
4278         STRV_FOREACH(e, c->unset_environment)
4279                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4280
4281         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4282
4283         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4284                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4285
4286                 STRV_FOREACH(d, c->directories[dt].paths)
4287                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4288         }
4289
4290         if (c->nice_set)
4291                 fprintf(f,
4292                         "%sNice: %i\n",
4293                         prefix, c->nice);
4294
4295         if (c->oom_score_adjust_set)
4296                 fprintf(f,
4297                         "%sOOMScoreAdjust: %i\n",
4298                         prefix, c->oom_score_adjust);
4299
4300         for (i = 0; i < RLIM_NLIMITS; i++)
4301                 if (c->rlimit[i]) {
4302                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4303                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4304                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4305                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4306                 }
4307
4308         if (c->ioprio_set) {
4309                 _cleanup_free_ char *class_str = NULL;
4310
4311                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4312                 if (r >= 0)
4313                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4314
4315                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4316         }
4317
4318         if (c->cpu_sched_set) {
4319                 _cleanup_free_ char *policy_str = NULL;
4320
4321                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4322                 if (r >= 0)
4323                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4324
4325                 fprintf(f,
4326                         "%sCPUSchedulingPriority: %i\n"
4327                         "%sCPUSchedulingResetOnFork: %s\n",
4328                         prefix, c->cpu_sched_priority,
4329                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4330         }
4331
4332         if (c->cpu_set.set) {
4333                 _cleanup_free_ char *affinity = NULL;
4334
4335                 affinity = cpu_set_to_range_string(&c->cpu_set);
4336                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4337         }
4338
4339         if (c->timer_slack_nsec != NSEC_INFINITY)
4340                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4341
4342         fprintf(f,
4343                 "%sStandardInput: %s\n"
4344                 "%sStandardOutput: %s\n"
4345                 "%sStandardError: %s\n",
4346                 prefix, exec_input_to_string(c->std_input),
4347                 prefix, exec_output_to_string(c->std_output),
4348                 prefix, exec_output_to_string(c->std_error));
4349
4350         if (c->std_input == EXEC_INPUT_NAMED_FD)
4351                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4352         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4353                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4354         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4355                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4356
4357         if (c->std_input == EXEC_INPUT_FILE)
4358                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4359         if (c->std_output == EXEC_OUTPUT_FILE)
4360                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4361         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4362                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4363         if (c->std_error == EXEC_OUTPUT_FILE)
4364                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4365         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4366                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4367
4368         if (c->tty_path)
4369                 fprintf(f,
4370                         "%sTTYPath: %s\n"
4371                         "%sTTYReset: %s\n"
4372                         "%sTTYVHangup: %s\n"
4373                         "%sTTYVTDisallocate: %s\n",
4374                         prefix, c->tty_path,
4375                         prefix, yes_no(c->tty_reset),
4376                         prefix, yes_no(c->tty_vhangup),
4377                         prefix, yes_no(c->tty_vt_disallocate));
4378
4379         if (IN_SET(c->std_output,
4380                    EXEC_OUTPUT_SYSLOG,
4381                    EXEC_OUTPUT_KMSG,
4382                    EXEC_OUTPUT_JOURNAL,
4383                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4384                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4385                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4386             IN_SET(c->std_error,
4387                    EXEC_OUTPUT_SYSLOG,
4388                    EXEC_OUTPUT_KMSG,
4389                    EXEC_OUTPUT_JOURNAL,
4390                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4391                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4392                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4393
4394                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4395
4396                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4397                 if (r >= 0)
4398                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4399
4400                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4401                 if (r >= 0)
4402                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4403         }
4404
4405         if (c->log_level_max >= 0) {
4406                 _cleanup_free_ char *t = NULL;
4407
4408                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4409
4410                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4411         }
4412
4413         if (c->log_rate_limit_interval_usec > 0) {
4414                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4415
4416                 fprintf(f,
4417                         "%sLogRateLimitIntervalSec: %s\n",
4418                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4419         }
4420
4421         if (c->log_rate_limit_burst > 0)
4422                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4423
4424         if (c->n_log_extra_fields > 0) {
4425                 size_t j;
4426
4427                 for (j = 0; j < c->n_log_extra_fields; j++) {
4428                         fprintf(f, "%sLogExtraFields: ", prefix);
4429                         fwrite(c->log_extra_fields[j].iov_base,
4430                                1, c->log_extra_fields[j].iov_len,
4431                                f);
4432                         fputc('\n', f);
4433                 }
4434         }
4435
4436         if (c->secure_bits) {
4437                 _cleanup_free_ char *str = NULL;
4438
4439                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4440                 if (r >= 0)
4441                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4442         }
4443
4444         if (c->capability_bounding_set != CAP_ALL) {
4445                 _cleanup_free_ char *str = NULL;
4446
4447                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4448                 if (r >= 0)
4449                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4450         }
4451
4452         if (c->capability_ambient_set != 0) {
4453                 _cleanup_free_ char *str = NULL;
4454
4455                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4456                 if (r >= 0)
4457                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4458         }
4459
4460         if (c->user)
4461                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4462         if (c->group)
4463                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4464
4465         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4466
4467         if (!strv_isempty(c->supplementary_groups)) {
4468                 fprintf(f, "%sSupplementaryGroups:", prefix);
4469                 strv_fprintf(f, c->supplementary_groups);
4470                 fputs("\n", f);
4471         }
4472
4473         if (c->pam_name)
4474                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4475
4476         if (!strv_isempty(c->read_write_paths)) {
4477                 fprintf(f, "%sReadWritePaths:", prefix);
4478                 strv_fprintf(f, c->read_write_paths);
4479                 fputs("\n", f);
4480         }
4481
4482         if (!strv_isempty(c->read_only_paths)) {
4483                 fprintf(f, "%sReadOnlyPaths:", prefix);
4484                 strv_fprintf(f, c->read_only_paths);
4485                 fputs("\n", f);
4486         }
4487
4488         if (!strv_isempty(c->inaccessible_paths)) {
4489                 fprintf(f, "%sInaccessiblePaths:", prefix);
4490                 strv_fprintf(f, c->inaccessible_paths);
4491                 fputs("\n", f);
4492         }
4493
4494         if (c->n_bind_mounts > 0)
4495                 for (i = 0; i < c->n_bind_mounts; i++)
4496                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4497                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4498                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4499                                 c->bind_mounts[i].source,
4500                                 c->bind_mounts[i].destination,
4501                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4502
4503         if (c->n_temporary_filesystems > 0)
4504                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4505                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4506
4507                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4508                                 t->path,
4509                                 isempty(t->options) ? "" : ":",
4510                                 strempty(t->options));
4511                 }
4512
4513         if (c->utmp_id)
4514                 fprintf(f,
4515                         "%sUtmpIdentifier: %s\n",
4516                         prefix, c->utmp_id);
4517
4518         if (c->selinux_context)
4519                 fprintf(f,
4520                         "%sSELinuxContext: %s%s\n",
4521                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4522
4523         if (c->apparmor_profile)
4524                 fprintf(f,
4525                         "%sAppArmorProfile: %s%s\n",
4526                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4527
4528         if (c->smack_process_label)
4529                 fprintf(f,
4530                         "%sSmackProcessLabel: %s%s\n",
4531                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4532
4533         if (c->personality != PERSONALITY_INVALID)
4534                 fprintf(f,
4535                         "%sPersonality: %s\n",
4536                         prefix, strna(personality_to_string(c->personality)));
4537
4538         fprintf(f,
4539                 "%sLockPersonality: %s\n",
4540                 prefix, yes_no(c->lock_personality));
4541
4542         if (c->syscall_filter) {
4543 #if HAVE_SECCOMP
4544                 Iterator j;
4545                 void *id, *val;
4546                 bool first = true;
4547 #endif
4548
4549                 fprintf(f,
4550                         "%sSystemCallFilter: ",
4551                         prefix);
4552
4553                 if (!c->syscall_whitelist)
4554                         fputc('~', f);
4555
4556 #if HAVE_SECCOMP
4557                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4558                         _cleanup_free_ char *name = NULL;
4559                         const char *errno_name = NULL;
4560                         int num = PTR_TO_INT(val);
4561
4562                         if (first)
4563                                 first = false;
4564                         else
4565                                 fputc(' ', f);
4566
4567                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4568                         fputs(strna(name), f);
4569
4570                         if (num >= 0) {
4571                                 errno_name = errno_to_name(num);
4572                                 if (errno_name)
4573                                         fprintf(f, ":%s", errno_name);
4574                                 else
4575                                         fprintf(f, ":%d", num);
4576                         }
4577                 }
4578 #endif
4579
4580                 fputc('\n', f);
4581         }
4582
4583         if (c->syscall_archs) {
4584 #if HAVE_SECCOMP
4585                 Iterator j;
4586                 void *id;
4587 #endif
4588
4589                 fprintf(f,
4590                         "%sSystemCallArchitectures:",
4591                         prefix);
4592
4593 #if HAVE_SECCOMP
4594                 SET_FOREACH(id, c->syscall_archs, j)
4595                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4596 #endif
4597                 fputc('\n', f);
4598         }
4599
4600         if (exec_context_restrict_namespaces_set(c)) {
4601                 _cleanup_free_ char *s = NULL;
4602
4603                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4604                 if (r >= 0)
4605                         fprintf(f, "%sRestrictNamespaces: %s\n",
4606                                 prefix, s);
4607         }
4608
4609         if (c->network_namespace_path)
4610                 fprintf(f,
4611                         "%sNetworkNamespacePath: %s\n",
4612                         prefix, c->network_namespace_path);
4613
4614         if (c->syscall_errno > 0) {
4615                 const char *errno_name;
4616
4617                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4618
4619                 errno_name = errno_to_name(c->syscall_errno);
4620                 if (errno_name)
4621                         fprintf(f, "%s\n", errno_name);
4622                 else
4623                         fprintf(f, "%d\n", c->syscall_errno);
4624         }
4625 }
4626
4627 bool exec_context_maintains_privileges(const ExecContext *c) {
4628         assert(c);
4629
4630         /* Returns true if the process forked off would run under
4631          * an unchanged UID or as root. */
4632
4633         if (!c->user)
4634                 return true;
4635
4636         if (streq(c->user, "root") || streq(c->user, "0"))
4637                 return true;
4638
4639         return false;
4640 }
4641
4642 int exec_context_get_effective_ioprio(const ExecContext *c) {
4643         int p;
4644
4645         assert(c);
4646
4647         if (c->ioprio_set)
4648                 return c->ioprio;
4649
4650         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4651         if (p < 0)
4652                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4653
4654         return p;
4655 }
4656
4657 void exec_context_free_log_extra_fields(ExecContext *c) {
4658         size_t l;
4659
4660         assert(c);
4661
4662         for (l = 0; l < c->n_log_extra_fields; l++)
4663                 free(c->log_extra_fields[l].iov_base);
4664         c->log_extra_fields = mfree(c->log_extra_fields);
4665         c->n_log_extra_fields = 0;
4666 }
4667
4668 void exec_context_revert_tty(ExecContext *c) {
4669         int r;
4670
4671         assert(c);
4672
4673         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4674         exec_context_tty_reset(c, NULL);
4675
4676         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4677          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4678          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4679
4680         if (exec_context_may_touch_tty(c)) {
4681                 const char *path;
4682
4683                 path = exec_context_tty_path(c);
4684                 if (path) {
4685                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4686                         if (r < 0 && r != -ENOENT)
4687                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4688                 }
4689         }
4690 }
4691
4692 void exec_status_start(ExecStatus *s, pid_t pid) {
4693         assert(s);
4694
4695         *s = (ExecStatus) {
4696                 .pid = pid,
4697         };
4698
4699         dual_timestamp_get(&s->start_timestamp);
4700 }
4701
4702 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4703         assert(s);
4704
4705         if (s->pid != pid) {
4706                 *s = (ExecStatus) {
4707                         .pid = pid,
4708                 };
4709         }
4710
4711         dual_timestamp_get(&s->exit_timestamp);
4712
4713         s->code = code;
4714         s->status = status;
4715
4716         if (context && context->utmp_id)
4717                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4718 }
4719
4720 void exec_status_reset(ExecStatus *s) {
4721         assert(s);
4722
4723         *s = (ExecStatus) {};
4724 }
4725
4726 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4727         char buf[FORMAT_TIMESTAMP_MAX];
4728
4729         assert(s);
4730         assert(f);
4731
4732         if (s->pid <= 0)
4733                 return;
4734
4735         prefix = strempty(prefix);
4736
4737         fprintf(f,
4738                 "%sPID: "PID_FMT"\n",
4739                 prefix, s->pid);
4740
4741         if (dual_timestamp_is_set(&s->start_timestamp))
4742                 fprintf(f,
4743                         "%sStart Timestamp: %s\n",
4744                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4745
4746         if (dual_timestamp_is_set(&s->exit_timestamp))
4747                 fprintf(f,
4748                         "%sExit Timestamp: %s\n"
4749                         "%sExit Code: %s\n"
4750                         "%sExit Status: %i\n",
4751                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4752                         prefix, sigchld_code_to_string(s->code),
4753                         prefix, s->status);
4754 }
4755
4756 static char *exec_command_line(char **argv) {
4757         size_t k;
4758         char *n, *p, **a;
4759         bool first = true;
4760
4761         assert(argv);
4762
4763         k = 1;
4764         STRV_FOREACH(a, argv)
4765                 k += strlen(*a)+3;
4766
4767         n = new(char, k);
4768         if (!n)
4769                 return NULL;
4770
4771         p = n;
4772         STRV_FOREACH(a, argv) {
4773
4774                 if (!first)
4775                         *(p++) = ' ';
4776                 else
4777                         first = false;
4778
4779                 if (strpbrk(*a, WHITESPACE)) {
4780                         *(p++) = '\'';
4781                         p = stpcpy(p, *a);
4782                         *(p++) = '\'';
4783                 } else
4784                         p = stpcpy(p, *a);
4785
4786         }
4787
4788         *p = 0;
4789
4790         /* FIXME: this doesn't really handle arguments that have
4791          * spaces and ticks in them */
4792
4793         return n;
4794 }
4795
4796 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4797         _cleanup_free_ char *cmd = NULL;
4798         const char *prefix2;
4799
4800         assert(c);
4801         assert(f);
4802
4803         prefix = strempty(prefix);
4804         prefix2 = strjoina(prefix, "\t");
4805
4806         cmd = exec_command_line(c->argv);
4807         fprintf(f,
4808                 "%sCommand Line: %s\n",
4809                 prefix, cmd ? cmd : strerror(ENOMEM));
4810
4811         exec_status_dump(&c->exec_status, f, prefix2);
4812 }
4813
4814 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4815         assert(f);
4816
4817         prefix = strempty(prefix);
4818
4819         LIST_FOREACH(command, c, c)
4820                 exec_command_dump(c, f, prefix);
4821 }
4822
4823 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4824         ExecCommand *end;
4825
4826         assert(l);
4827         assert(e);
4828
4829         if (*l) {
4830                 /* It's kind of important, that we keep the order here */
4831                 LIST_FIND_TAIL(command, *l, end);
4832                 LIST_INSERT_AFTER(command, *l, end, e);
4833         } else
4834               *l = e;
4835 }
4836
4837 int exec_command_set(ExecCommand *c, const char *path, ...) {
4838         va_list ap;
4839         char **l, *p;
4840
4841         assert(c);
4842         assert(path);
4843
4844         va_start(ap, path);
4845         l = strv_new_ap(path, ap);
4846         va_end(ap);
4847
4848         if (!l)
4849                 return -ENOMEM;
4850
4851         p = strdup(path);
4852         if (!p) {
4853                 strv_free(l);
4854                 return -ENOMEM;
4855         }
4856
4857         free_and_replace(c->path, p);
4858
4859         return strv_free_and_replace(c->argv, l);
4860 }
4861
4862 int exec_command_append(ExecCommand *c, const char *path, ...) {
4863         _cleanup_strv_free_ char **l = NULL;
4864         va_list ap;
4865         int r;
4866
4867         assert(c);
4868         assert(path);
4869
4870         va_start(ap, path);
4871         l = strv_new_ap(path, ap);
4872         va_end(ap);
4873
4874         if (!l)
4875                 return -ENOMEM;
4876
4877         r = strv_extend_strv(&c->argv, l, false);
4878         if (r < 0)
4879                 return r;
4880
4881         return 0;
4882 }
4883
4884 static void *remove_tmpdir_thread(void *p) {
4885         _cleanup_free_ char *path = p;
4886
4887         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4888         return NULL;
4889 }
4890
4891 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4892         int r;
4893
4894         if (!rt)
4895                 return NULL;
4896
4897         if (rt->manager)
4898                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4899
4900         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4901         if (destroy && rt->tmp_dir) {
4902                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4903
4904                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4905                 if (r < 0) {
4906                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4907                         free(rt->tmp_dir);
4908                 }
4909
4910                 rt->tmp_dir = NULL;
4911         }
4912
4913         if (destroy && rt->var_tmp_dir) {
4914                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4915
4916                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4917                 if (r < 0) {
4918                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4919                         free(rt->var_tmp_dir);
4920                 }
4921
4922                 rt->var_tmp_dir = NULL;
4923         }
4924
4925         rt->id = mfree(rt->id);
4926         rt->tmp_dir = mfree(rt->tmp_dir);
4927         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4928         safe_close_pair(rt->netns_storage_socket);
4929         return mfree(rt);
4930 }
4931
4932 static void exec_runtime_freep(ExecRuntime **rt) {
4933         (void) exec_runtime_free(*rt, false);
4934 }
4935
4936 static int exec_runtime_allocate(ExecRuntime **ret) {
4937         ExecRuntime *n;
4938
4939         assert(ret);
4940
4941         n = new(ExecRuntime, 1);
4942         if (!n)
4943                 return -ENOMEM;
4944
4945         *n = (ExecRuntime) {
4946                 .netns_storage_socket = { -1, -1 },
4947         };
4948
4949         *ret = n;
4950         return 0;
4951 }
4952
4953 static int exec_runtime_add(
4954                 Manager *m,
4955                 const char *id,
4956                 const char *tmp_dir,
4957                 const char *var_tmp_dir,
4958                 const int netns_storage_socket[2],
4959                 ExecRuntime **ret) {
4960
4961         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4962         int r;
4963
4964         assert(m);
4965         assert(id);
4966
4967         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4968         if (r < 0)
4969                 return r;
4970
4971         r = exec_runtime_allocate(&rt);
4972         if (r < 0)
4973                 return r;
4974
4975         rt->id = strdup(id);
4976         if (!rt->id)
4977                 return -ENOMEM;
4978
4979         if (tmp_dir) {
4980                 rt->tmp_dir = strdup(tmp_dir);
4981                 if (!rt->tmp_dir)
4982                         return -ENOMEM;
4983
4984                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4985                 assert(var_tmp_dir);
4986                 rt->var_tmp_dir = strdup(var_tmp_dir);
4987                 if (!rt->var_tmp_dir)
4988                         return -ENOMEM;
4989         }
4990
4991         if (netns_storage_socket) {
4992                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4993                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4994         }
4995
4996         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4997         if (r < 0)
4998                 return r;
4999
5000         rt->manager = m;
5001
5002         if (ret)
5003                 *ret = rt;
5004
5005         /* do not remove created ExecRuntime object when the operation succeeds. */
5006         rt = NULL;
5007         return 0;
5008 }
5009
5010 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5011         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5012         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5013         int r;
5014
5015         assert(m);
5016         assert(c);
5017         assert(id);
5018
5019         /* It is not necessary to create ExecRuntime object. */
5020         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5021                 return 0;
5022
5023         if (c->private_tmp) {
5024                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5025                 if (r < 0)
5026                         return r;
5027         }
5028
5029         if (c->private_network || c->network_namespace_path) {
5030                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5031                         return -errno;
5032         }
5033
5034         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5035         if (r < 0)
5036                 return r;
5037
5038         /* Avoid cleanup */
5039         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5040         return 1;
5041 }
5042
5043 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5044         ExecRuntime *rt;
5045         int r;
5046
5047         assert(m);
5048         assert(id);
5049         assert(ret);
5050
5051         rt = hashmap_get(m->exec_runtime_by_id, id);
5052         if (rt)
5053                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5054                 goto ref;
5055
5056         if (!create)
5057                 return 0;
5058
5059         /* If not found, then create a new object. */
5060         r = exec_runtime_make(m, c, id, &rt);
5061         if (r <= 0)
5062                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5063                 return r;
5064
5065 ref:
5066         /* increment reference counter. */
5067         rt->n_ref++;
5068         *ret = rt;
5069         return 1;
5070 }
5071
5072 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5073         if (!rt)
5074                 return NULL;
5075
5076         assert(rt->n_ref > 0);
5077
5078         rt->n_ref--;
5079         if (rt->n_ref > 0)
5080                 return NULL;
5081
5082         return exec_runtime_free(rt, destroy);
5083 }
5084
5085 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5086         ExecRuntime *rt;
5087         Iterator i;
5088
5089         assert(m);
5090         assert(f);
5091         assert(fds);
5092
5093         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5094                 fprintf(f, "exec-runtime=%s", rt->id);
5095
5096                 if (rt->tmp_dir)
5097                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5098
5099                 if (rt->var_tmp_dir)
5100                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5101
5102                 if (rt->netns_storage_socket[0] >= 0) {
5103                         int copy;
5104
5105                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5106                         if (copy < 0)
5107                                 return copy;
5108
5109                         fprintf(f, " netns-socket-0=%i", copy);
5110                 }
5111
5112                 if (rt->netns_storage_socket[1] >= 0) {
5113                         int copy;
5114
5115                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5116                         if (copy < 0)
5117                                 return copy;
5118
5119                         fprintf(f, " netns-socket-1=%i", copy);
5120                 }
5121
5122                 fputc('\n', f);
5123         }
5124
5125         return 0;
5126 }
5127
5128 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5129         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5130         ExecRuntime *rt;
5131         int r;
5132
5133         /* This is for the migration from old (v237 or earlier) deserialization text.
5134          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5135          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5136          * so or not from the serialized text, then we always creates a new object owned by this. */
5137
5138         assert(u);
5139         assert(key);
5140         assert(value);
5141
5142         /* Manager manages ExecRuntime objects by the unit id.
5143          * So, we omit the serialized text when the unit does not have id (yet?)... */
5144         if (isempty(u->id)) {
5145                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5146                 return 0;
5147         }
5148
5149         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5150         if (r < 0) {
5151                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5152                 return 0;
5153         }
5154
5155         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5156         if (!rt) {
5157                 r = exec_runtime_allocate(&rt_create);
5158                 if (r < 0)
5159                         return log_oom();
5160
5161                 rt_create->id = strdup(u->id);
5162                 if (!rt_create->id)
5163                         return log_oom();
5164
5165                 rt = rt_create;
5166         }
5167
5168         if (streq(key, "tmp-dir")) {
5169                 char *copy;
5170
5171                 copy = strdup(value);
5172                 if (!copy)
5173                         return log_oom();
5174
5175                 free_and_replace(rt->tmp_dir, copy);
5176
5177         } else if (streq(key, "var-tmp-dir")) {
5178                 char *copy;
5179
5180                 copy = strdup(value);
5181                 if (!copy)
5182                         return log_oom();
5183
5184                 free_and_replace(rt->var_tmp_dir, copy);
5185
5186         } else if (streq(key, "netns-socket-0")) {
5187                 int fd;
5188
5189                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5190                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5191                         return 0;
5192                 }
5193
5194                 safe_close(rt->netns_storage_socket[0]);
5195                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5196
5197         } else if (streq(key, "netns-socket-1")) {
5198                 int fd;
5199
5200                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5201                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5202                         return 0;
5203                 }
5204
5205                 safe_close(rt->netns_storage_socket[1]);
5206                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5207         } else
5208                 return 0;
5209
5210         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5211         if (rt_create) {
5212                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5213                 if (r < 0) {
5214                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5215                         return 0;
5216                 }
5217
5218                 rt_create->manager = u->manager;
5219
5220                 /* Avoid cleanup */
5221                 rt_create = NULL;
5222         }
5223
5224         return 1;
5225 }
5226
5227 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5228         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5229         int r, fd0 = -1, fd1 = -1;
5230         const char *p, *v = value;
5231         size_t n;
5232
5233         assert(m);
5234         assert(value);
5235         assert(fds);
5236
5237         n = strcspn(v, " ");
5238         id = strndupa(v, n);
5239         if (v[n] != ' ')
5240                 goto finalize;
5241         p = v + n + 1;
5242
5243         v = startswith(p, "tmp-dir=");
5244         if (v) {
5245                 n = strcspn(v, " ");
5246                 tmp_dir = strndupa(v, n);
5247                 if (v[n] != ' ')
5248                         goto finalize;
5249                 p = v + n + 1;
5250         }
5251
5252         v = startswith(p, "var-tmp-dir=");
5253         if (v) {
5254                 n = strcspn(v, " ");
5255                 var_tmp_dir = strndupa(v, n);
5256                 if (v[n] != ' ')
5257                         goto finalize;
5258                 p = v + n + 1;
5259         }
5260
5261         v = startswith(p, "netns-socket-0=");
5262         if (v) {
5263                 char *buf;
5264
5265                 n = strcspn(v, " ");
5266                 buf = strndupa(v, n);
5267                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5268                         log_debug("Unable to process exec-runtime netns fd specification.");
5269                         return;
5270                 }
5271                 fd0 = fdset_remove(fds, fd0);
5272                 if (v[n] != ' ')
5273                         goto finalize;
5274                 p = v + n + 1;
5275         }
5276
5277         v = startswith(p, "netns-socket-1=");
5278         if (v) {
5279                 char *buf;
5280
5281                 n = strcspn(v, " ");
5282                 buf = strndupa(v, n);
5283                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5284                         log_debug("Unable to process exec-runtime netns fd specification.");
5285                         return;
5286                 }
5287                 fd1 = fdset_remove(fds, fd1);
5288         }
5289
5290 finalize:
5291
5292         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5293         if (r < 0)
5294                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5295 }
5296
5297 void exec_runtime_vacuum(Manager *m) {
5298         ExecRuntime *rt;
5299         Iterator i;
5300
5301         assert(m);
5302
5303         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5304
5305         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5306                 if (rt->n_ref > 0)
5307                         continue;
5308
5309                 (void) exec_runtime_free(rt, false);
5310         }
5311 }
5312
5313 void exec_params_clear(ExecParameters *p) {
5314         if (!p)
5315                 return;
5316
5317         strv_free(p->environment);
5318 }
5319
5320 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5321         [EXEC_INPUT_NULL] = "null",
5322         [EXEC_INPUT_TTY] = "tty",
5323         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5324         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5325         [EXEC_INPUT_SOCKET] = "socket",
5326         [EXEC_INPUT_NAMED_FD] = "fd",
5327         [EXEC_INPUT_DATA] = "data",
5328         [EXEC_INPUT_FILE] = "file",
5329 };
5330
5331 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5332
5333 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5334         [EXEC_OUTPUT_INHERIT] = "inherit",
5335         [EXEC_OUTPUT_NULL] = "null",
5336         [EXEC_OUTPUT_TTY] = "tty",
5337         [EXEC_OUTPUT_SYSLOG] = "syslog",
5338         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5339         [EXEC_OUTPUT_KMSG] = "kmsg",
5340         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5341         [EXEC_OUTPUT_JOURNAL] = "journal",
5342         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5343         [EXEC_OUTPUT_SOCKET] = "socket",
5344         [EXEC_OUTPUT_NAMED_FD] = "fd",
5345         [EXEC_OUTPUT_FILE] = "file",
5346         [EXEC_OUTPUT_FILE_APPEND] = "append",
5347 };
5348
5349 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5350
5351 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5352         [EXEC_UTMP_INIT] = "init",
5353         [EXEC_UTMP_LOGIN] = "login",
5354         [EXEC_UTMP_USER] = "user",
5355 };
5356
5357 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5358
5359 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5360         [EXEC_PRESERVE_NO] = "no",
5361         [EXEC_PRESERVE_YES] = "yes",
5362         [EXEC_PRESERVE_RESTART] = "restart",
5363 };
5364
5365 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5366
5367 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5368         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5369         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5370         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5371         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5372         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5373 };
5374
5375 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5376
5377 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5378         [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5379         [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5380         [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5381         [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5382         [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5383 };
5384
5385 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5386
5387 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5388         [EXEC_KEYRING_INHERIT] = "inherit",
5389         [EXEC_KEYRING_PRIVATE] = "private",
5390         [EXEC_KEYRING_SHARED] = "shared",
5391 };
5392
5393 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);