src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <glob.h>
  24 #include <grp.h>
  25 #include <poll.h>
  26 #include <signal.h>
  27 #include <string.h>
  28 #include <sys/capability.h>
  29 #include <sys/eventfd.h>
  30 #include <sys/mman.h>
  31 #include <sys/personality.h>
  32 #include <sys/prctl.h>
  33 #include <sys/shm.h>
  34 #include <sys/socket.h>
  35 #include <sys/stat.h>
  36 #include <sys/types.h>
  37 #include <sys/un.h>
  38 #include <unistd.h>
  39 #include <utmpx.h>
  40
  41 #if HAVE_PAM
  42 #include <security/pam_appl.h>
  43 #endif
  44
  45 #if HAVE_SELINUX
  46 #include <selinux/selinux.h>
  47 #endif
  48
  49 #if HAVE_SECCOMP
  50 #include <seccomp.h>
  51 #endif
  52
  53 #if HAVE_APPARMOR
  54 #include <sys/apparmor.h>
  55 #endif
  56
  57 #include "sd-messages.h"
  58
  59 #include "af-list.h"
  60 #include "alloc-util.h"
  61 #if HAVE_APPARMOR
  62 #include "apparmor-util.h"
  63 #endif
  64 #include "async.h"
  65 #include "barrier.h"
  66 #include "cap-list.h"
  67 #include "capability-util.h"
  68 #include "chown-recursive.h"
  69 #include "cpu-set-util.h"
  70 #include "def.h"
  71 #include "env-util.h"
  72 #include "errno-list.h"
  73 #include "execute.h"
  74 #include "exit-status.h"
  75 #include "fd-util.h"
  76 #include "fileio.h"
  77 #include "format-util.h"
  78 #include "fs-util.h"
  79 #include "glob-util.h"
  80 #include "io-util.h"
  81 #include "ioprio.h"
  82 #include "label.h"
  83 #include "log.h"
  84 #include "macro.h"
  85 #include "manager.h"
  86 #include "missing.h"
  87 #include "mkdir.h"
  88 #include "namespace.h"
  89 #include "parse-util.h"
  90 #include "path-util.h"
  91 #include "process-util.h"
  92 #include "rlimit-util.h"
  93 #include "rm-rf.h"
  94 #if HAVE_SECCOMP
  95 #include "seccomp-util.h"
  96 #endif
  97 #include "securebits.h"
  98 #include "securebits-util.h"
  99 #include "selinux-util.h"
 100 #include "signal-util.h"
 101 #include "smack-util.h"
 102 #include "special.h"
 103 #include "stat-util.h"
 104 #include "string-table.h"
 105 #include "string-util.h"
 106 #include "strv.h"
 107 #include "syslog-util.h"
 108 #include "terminal-util.h"
 109 #include "unit.h"
 110 #include "user-util.h"
 111 #include "util.h"
 112 #include "utmp-wtmp.h"
 113
/* NOTE(review): the users of these two timeouts are outside the part of the file reviewed here;
 * presumably they bound the two phases of waiting on the idle pipe — confirm against the callers. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode (owner read/write, group write) that chown_terminal() applies to a TTY and then verifies.
 * This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested through fd_inc_sndbuf() for the journal stream socket set up in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 121
 122 static int shift_fds(int fds[], unsigned n_fds) {
 123         int start, restart_from;
 124
 125         if (n_fds <= 0)
 126                 return 0;
 127
 128         /* Modifies the fds array! (sorts it) */
 129
 130         assert(fds);
 131
 132         start = 0;
 133         for (;;) {
 134                 int i;
 135
 136                 restart_from = -1;
 137
 138                 for (i = start; i < (int) n_fds; i++) {
 139                         int nfd;
 140
 141                         /* Already at right index? */
 142                         if (fds[i] == i+3)
 143                                 continue;
 144
 145                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 146                         if (nfd < 0)
 147                                 return -errno;
 148
 149                         safe_close(fds[i]);
 150                         fds[i] = nfd;
 151
 152                         /* Hmm, the fd we wanted isn't free? Then
 153                          * let's remember that and try again from here */
 154                         if (nfd != i+3 && restart_from < 0)
 155                                 restart_from = i;
 156                 }
 157
 158                 if (restart_from < 0)
 159                         break;
 160
 161                 start = restart_from;
 162         }
 163
 164         return 0;
 165 }
 166
 167 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 168         unsigned i, n_fds;
 169         int r;
 170
 171         n_fds = n_storage_fds + n_socket_fds;
 172         if (n_fds <= 0)
 173                 return 0;
 174
 175         assert(fds);
 176
 177         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 178          * O_NONBLOCK only applies to socket activation though. */
 179
 180         for (i = 0; i < n_fds; i++) {
 181
 182                 if (i < n_socket_fds) {
 183                         r = fd_nonblock(fds[i], nonblock);
 184                         if (r < 0)
 185                                 return r;
 186                 }
 187
 188                 /* We unconditionally drop FD_CLOEXEC from the fds,
 189                  * since after all we want to pass these fds to our
 190                  * children */
 191
 192                 r = fd_cloexec(fds[i], false);
 193                 if (r < 0)
 194                         return r;
 195         }
 196
 197         return 0;
 198 }
 199
 200 static const char *exec_context_tty_path(const ExecContext *context) {
 201         assert(context);
 202
 203         if (context->stdio_as_fds)
 204                 return NULL;
 205
 206         if (context->tty_path)
 207                 return context->tty_path;
 208
 209         return "/dev/console";
 210 }
 211
 212 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 213         const char *path;
 214
 215         assert(context);
 216
 217         path = exec_context_tty_path(context);
 218
 219         if (context->tty_vhangup) {
 220                 if (p && p->stdin_fd >= 0)
 221                         (void) terminal_vhangup_fd(p->stdin_fd);
 222                 else if (path)
 223                         (void) terminal_vhangup(path);
 224         }
 225
 226         if (context->tty_reset) {
 227                 if (p && p->stdin_fd >= 0)
 228                         (void) reset_terminal_fd(p->stdin_fd, true);
 229                 else if (path)
 230                         (void) reset_terminal(path);
 231         }
 232
 233         if (context->tty_vt_disallocate && path)
 234                 (void) vt_disallocate(path);
 235 }
 236
 237 static bool is_terminal_input(ExecInput i) {
 238         return IN_SET(i,
 239                       EXEC_INPUT_TTY,
 240                       EXEC_INPUT_TTY_FORCE,
 241                       EXEC_INPUT_TTY_FAIL);
 242 }
 243
 244 static bool is_terminal_output(ExecOutput o) {
 245         return IN_SET(o,
 246                       EXEC_OUTPUT_TTY,
 247                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 248                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 249                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 250 }
 251
 252 static bool is_syslog_output(ExecOutput o) {
 253         return IN_SET(o,
 254                       EXEC_OUTPUT_SYSLOG,
 255                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 256 }
 257
 258 static bool is_kmsg_output(ExecOutput o) {
 259         return IN_SET(o,
 260                       EXEC_OUTPUT_KMSG,
 261                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 262 }
 263
 264 static bool exec_context_needs_term(const ExecContext *c) {
 265         assert(c);
 266
 267         /* Return true if the execution context suggests we should set $TERM to something useful. */
 268
 269         if (is_terminal_input(c->std_input))
 270                 return true;
 271
 272         if (is_terminal_output(c->std_output))
 273                 return true;
 274
 275         if (is_terminal_output(c->std_error))
 276                 return true;
 277
 278         return !!c->tty_path;
 279 }
 280
 281 static int open_null_as(int flags, int nfd) {
 282         int fd;
 283
 284         assert(nfd >= 0);
 285
 286         fd = open("/dev/null", flags|O_NOCTTY);
 287         if (fd < 0)
 288                 return -errno;
 289
 290         return move_fd(fd, nfd, false);
 291 }
 292
 293 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 294         static const union sockaddr_union sa = {
 295                 .un.sun_family = AF_UNIX,
 296                 .un.sun_path = "/run/systemd/journal/stdout",
 297         };
 298         uid_t olduid = UID_INVALID;
 299         gid_t oldgid = GID_INVALID;
 300         int r;
 301
 302         if (gid_is_valid(gid)) {
 303                 oldgid = getgid();
 304
 305                 if (setegid(gid) < 0)
 306                         return -errno;
 307         }
 308
 309         if (uid_is_valid(uid)) {
 310                 olduid = getuid();
 311
 312                 if (seteuid(uid) < 0) {
 313                         r = -errno;
 314                         goto restore_gid;
 315                 }
 316         }
 317
 318         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 319
 320         /* If we fail to restore the uid or gid, things will likely
 321            fail later on. This should only happen if an LSM interferes. */
 322
 323         if (uid_is_valid(uid))
 324                 (void) seteuid(olduid);
 325
 326  restore_gid:
 327         if (gid_is_valid(gid))
 328                 (void) setegid(oldgid);
 329
 330         return r;
 331 }
 332
 333 static int connect_logger_as(
 334                 const Unit *unit,
 335                 const ExecContext *context,
 336                 const ExecParameters *params,
 337                 ExecOutput output,
 338                 const char *ident,
 339                 int nfd,
 340                 uid_t uid,
 341                 gid_t gid) {
 342
 343         int fd, r;
 344
 345         assert(context);
 346         assert(params);
 347         assert(output < _EXEC_OUTPUT_MAX);
 348         assert(ident);
 349         assert(nfd >= 0);
 350
 351         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 352         if (fd < 0)
 353                 return -errno;
 354
 355         r = connect_journal_socket(fd, uid, gid);
 356         if (r < 0)
 357                 return r;
 358
 359         if (shutdown(fd, SHUT_RD) < 0) {
 360                 safe_close(fd);
 361                 return -errno;
 362         }
 363
 364         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 365
 366         dprintf(fd,
 367                 "%s\n"
 368                 "%s\n"
 369                 "%i\n"
 370                 "%i\n"
 371                 "%i\n"
 372                 "%i\n"
 373                 "%i\n",
 374                 context->syslog_identifier ?: ident,
 375                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 376                 context->syslog_priority,
 377                 !!context->syslog_level_prefix,
 378                 is_syslog_output(output),
 379                 is_kmsg_output(output),
 380                 is_terminal_output(output));
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384 static int open_terminal_as(const char *path, int flags, int nfd) {
 385         int fd;
 386
 387         assert(path);
 388         assert(nfd >= 0);
 389
 390         fd = open_terminal(path, flags | O_NOCTTY);
 391         if (fd < 0)
 392                 return fd;
 393
 394         return move_fd(fd, nfd, false);
 395 }
 396
 397 static int acquire_path(const char *path, int flags, mode_t mode) {
 398         union sockaddr_union sa = {
 399                 .sa.sa_family = AF_UNIX,
 400         };
 401         int fd, r;
 402
 403         assert(path);
 404
 405         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 406                 flags |= O_CREAT;
 407
 408         fd = open(path, flags|O_NOCTTY, mode);
 409         if (fd >= 0)
 410                 return fd;
 411
 412         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 413                 return -errno;
 414         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 415                 return -ENXIO;
 416
 417         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 418
 419         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 420         if (fd < 0)
 421                 return -errno;
 422
 423         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 424         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 425                 safe_close(fd);
 426                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 427                                                            * indication that his wasn't an AF_UNIX socket after all */
 428         }
 429
 430         if ((flags & O_ACCMODE) == O_RDONLY)
 431                 r = shutdown(fd, SHUT_WR);
 432         else if ((flags & O_ACCMODE) == O_WRONLY)
 433                 r = shutdown(fd, SHUT_RD);
 434         else
 435                 return fd;
 436         if (r < 0) {
 437                 safe_close(fd);
 438                 return -errno;
 439         }
 440
 441         return fd;
 442 }
 443
 444 static int fixup_input(
 445                 const ExecContext *context,
 446                 int socket_fd,
 447                 bool apply_tty_stdin) {
 448
 449         ExecInput std_input;
 450
 451         assert(context);
 452
 453         std_input = context->std_input;
 454
 455         if (is_terminal_input(std_input) && !apply_tty_stdin)
 456                 return EXEC_INPUT_NULL;
 457
 458         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 459                 return EXEC_INPUT_NULL;
 460
 461         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 462                 return EXEC_INPUT_NULL;
 463
 464         return std_input;
 465 }
 466
 467 static int fixup_output(ExecOutput std_output, int socket_fd) {
 468
 469         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 470                 return EXEC_OUTPUT_INHERIT;
 471
 472         return std_output;
 473 }
 474
 475 static int setup_input(
 476                 const ExecContext *context,
 477                 const ExecParameters *params,
 478                 int socket_fd,
 479                 int named_iofds[3]) {
 480
 481         ExecInput i;
 482
 483         assert(context);
 484         assert(params);
 485
 486         if (params->stdin_fd >= 0) {
 487                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 488                         return -errno;
 489
 490                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 491                 if (isatty(STDIN_FILENO)) {
 492                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 493                         (void) reset_terminal_fd(STDIN_FILENO, true);
 494                 }
 495
 496                 return STDIN_FILENO;
 497         }
 498
 499         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 500
 501         switch (i) {
 502
 503         case EXEC_INPUT_NULL:
 504                 return open_null_as(O_RDONLY, STDIN_FILENO);
 505
 506         case EXEC_INPUT_TTY:
 507         case EXEC_INPUT_TTY_FORCE:
 508         case EXEC_INPUT_TTY_FAIL: {
 509                 int fd;
 510
 511                 fd = acquire_terminal(exec_context_tty_path(context),
 512                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 513                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 514                                                                   ACQUIRE_TERMINAL_WAIT,
 515                                       USEC_INFINITY);
 516                 if (fd < 0)
 517                         return fd;
 518
 519                 return move_fd(fd, STDIN_FILENO, false);
 520         }
 521
 522         case EXEC_INPUT_SOCKET:
 523                 assert(socket_fd >= 0);
 524
 525                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 526
 527         case EXEC_INPUT_NAMED_FD:
 528                 assert(named_iofds[STDIN_FILENO] >= 0);
 529
 530                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 531                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 532
 533         case EXEC_INPUT_DATA: {
 534                 int fd;
 535
 536                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 537                 if (fd < 0)
 538                         return fd;
 539
 540                 return move_fd(fd, STDIN_FILENO, false);
 541         }
 542
 543         case EXEC_INPUT_FILE: {
 544                 bool rw;
 545                 int fd;
 546
 547                 assert(context->stdio_file[STDIN_FILENO]);
 548
 549                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 550                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 551
 552                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 553                 if (fd < 0)
 554                         return fd;
 555
 556                 return move_fd(fd, STDIN_FILENO, false);
 557         }
 558
 559         default:
 560                 assert_not_reached("Unknown input type");
 561         }
 562 }
 563
/* Sets up the output descriptor 'fileno' (STDOUT_FILENO or STDERR_FILENO) according to the execution
 * context and parameters.
 *
 * If an explicit fd for the stream is passed in params (stdout_fd/stderr_fd) it is simply duplicated.
 * Otherwise the effective input and output modes are determined with fixup_input()/fixup_output() and
 * the descriptor is connected accordingly. Note that this duplicates STDIN_FILENO (and, for stderr,
 * STDOUT_FILENO) in several cases, hence those must have been set up before this is called.
 *
 * 'ident', 'uid' and 'gid' are passed on to connect_logger_as() for the log output modes. If a
 * connection to the journal is established, the device and inode numbers of the resulting fd are stored
 * in *journal_stream_dev and *journal_stream_ino.
 *
 * Returns a negative errno-style error on failure. On success the dup2()-based and "leave as is" paths
 * return 'fileno'; the other paths return whatever open_null_as()/move_fd() return. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* An fd passed in explicitly for this stream takes precedence over anything configured */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* 'i' is the effective input mode, 'o' the effective mode of stdout (replaced by the one of
         * stderr further down, if stderr is what we are setting up). */
        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on the stderr mode is what gets set up */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, share it instead of opening the TTY a second time */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn about it and fall back to /dev/null. The
                         * result of that fallback is what gets returned. */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to use the very same file, then setup_input() opened it for
                 * reading and writing already, and we can simply share that fd. */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, false);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 718
 719 static int chown_terminal(int fd, uid_t uid) {
 720         struct stat st;
 721
 722         assert(fd >= 0);
 723
 724         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 725         if (isatty(fd) < 1)
 726                 return 0;
 727
 728         /* This might fail. What matters are the results. */
 729         (void) fchown(fd, uid, -1);
 730         (void) fchmod(fd, TTY_MODE);
 731
 732         if (fstat(fd, &st) < 0)
 733                 return -errno;
 734
 735         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 736                 return -EPERM;
 737
 738         return 0;
 739 }
 740
 741 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 742         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 743         int r;
 744
 745         assert(_saved_stdin);
 746         assert(_saved_stdout);
 747
 748         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 749         if (saved_stdin < 0)
 750                 return -errno;
 751
 752         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 753         if (saved_stdout < 0)
 754                 return -errno;
 755
 756         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 757         if (fd < 0)
 758                 return fd;
 759
 760         r = chown_terminal(fd, getuid());
 761         if (r < 0)
 762                 return r;
 763
 764         r = reset_terminal_fd(fd, true);
 765         if (r < 0)
 766                 return r;
 767
 768         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 769         fd = -1;
 770         if (r < 0)
 771                 return r;
 772
 773         *_saved_stdin = saved_stdin;
 774         *_saved_stdout = saved_stdout;
 775
 776         saved_stdin = saved_stdout = -1;
 777
 778         return 0;
 779 }
 780
 781 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 782         assert(err < 0);
 783
 784         if (err == -ETIMEDOUT)
 785                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 786         else {
 787                 errno = -err;
 788                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 789         }
 790 }
 791
 792 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 793         _cleanup_close_ int fd = -1;
 794
 795         assert(vc);
 796
 797         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 798         if (fd < 0)
 799                 return;
 800
 801         write_confirm_error_fd(err, fd, u);
 802 }
 803
 804 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 805         int r = 0;
 806
 807         assert(saved_stdin);
 808         assert(saved_stdout);
 809
 810         release_terminal();
 811
 812         if (*saved_stdin >= 0)
 813                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 814                         r = -errno;
 815
 816         if (*saved_stdout >= 0)
 817                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 818                         r = -errno;
 819
 820         *saved_stdin = safe_close(*saved_stdin);
 821         *saved_stdout = safe_close(*saved_stdout);
 822
 823         return r;
 824 }
 825
/* The possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y': execute the command; also used on internal errors */
};
 831
 832 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 833         int saved_stdout = -1, saved_stdin = -1, r;
 834         _cleanup_free_ char *e = NULL;
 835         char c;
 836
 837         /* For any internal errors, assume a positive response. */
 838         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 839         if (r < 0) {
 840                 write_confirm_error(r, vc, u);
 841                 return CONFIRM_EXECUTE;
 842         }
 843
 844         /* confirm_spawn might have been disabled while we were sleeping. */
 845         if (manager_is_confirm_spawn_disabled(u->manager)) {
 846                 r = 1;
 847                 goto restore_stdio;
 848         }
 849
 850         e = ellipsize(cmdline, 60, 100);
 851         if (!e) {
 852                 log_oom();
 853                 r = CONFIRM_EXECUTE;
 854                 goto restore_stdio;
 855         }
 856
 857         for (;;) {
 858                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 859                 if (r < 0) {
 860                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 861                         r = CONFIRM_EXECUTE;
 862                         goto restore_stdio;
 863                 }
 864
 865                 switch (c) {
 866                 case 'c':
 867                         printf("Resuming normal execution.\n");
 868                         manager_disable_confirm_spawn();
 869                         r = 1;
 870                         break;
 871                 case 'D':
 872                         unit_dump(u, stdout, "  ");
 873                         continue; /* ask again */
 874                 case 'f':
 875                         printf("Failing execution.\n");
 876                         r = CONFIRM_PRETEND_FAILURE;
 877                         break;
 878                 case 'h':
 879                         printf("  c - continue, proceed without asking anymore\n"
 880                                "  D - dump, show the state of the unit\n"
 881                                "  f - fail, don't execute the command and pretend it failed\n"
 882                                "  h - help\n"
 883                                "  i - info, show a short summary of the unit\n"
 884                                "  j - jobs, show jobs that are in progress\n"
 885                                "  s - skip, don't execute the command and pretend it succeeded\n"
 886                                "  y - yes, execute the command\n");
 887                         continue; /* ask again */
 888                 case 'i':
 889                         printf("  Description: %s\n"
 890                                "  Unit:        %s\n"
 891                                "  Command:     %s\n",
 892                                u->id, u->description, cmdline);
 893                         continue; /* ask again */
 894                 case 'j':
 895                         manager_dump_jobs(u->manager, stdout, "  ");
 896                         continue; /* ask again */
 897                 case 'n':
 898                         /* 'n' was removed in favor of 'f'. */
 899                         printf("Didn't understand 'n', did you mean 'f'?\n");
 900                         continue; /* ask again */
 901                 case 's':
 902                         printf("Skipping execution.\n");
 903                         r = CONFIRM_PRETEND_SUCCESS;
 904                         break;
 905                 case 'y':
 906                         r = CONFIRM_EXECUTE;
 907                         break;
 908                 default:
 909                         assert_not_reached("Unhandled choice");
 910                 }
 911                 break;
 912         }
 913
 914 restore_stdio:
 915         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 916         return r;
 917 }
 918
 919 static int get_fixed_user(const ExecContext *c, const char **user,
 920                           uid_t *uid, gid_t *gid,
 921                           const char **home, const char **shell) {
 922         int r;
 923         const char *name;
 924
 925         assert(c);
 926
 927         if (!c->user)
 928                 return 0;
 929
 930         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 931          * (i.e. are "/" or "/bin/nologin"). */
 932
 933         name = c->user;
 934         r = get_user_creds_clean(&name, uid, gid, home, shell);
 935         if (r < 0)
 936                 return r;
 937
 938         *user = name;
 939         return 0;
 940 }
 941
 942 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 943         int r;
 944         const char *name;
 945
 946         assert(c);
 947
 948         if (!c->group)
 949                 return 0;
 950
 951         name = c->group;
 952         r = get_group_creds(&name, gid);
 953         if (r < 0)
 954                 return r;
 955
 956         *group = name;
 957         return 0;
 958 }
 959
 960 static int get_supplementary_groups(const ExecContext *c, const char *user,
 961                                     const char *group, gid_t gid,
 962                                     gid_t **supplementary_gids, int *ngids) {
 963         char **i;
 964         int r, k = 0;
 965         int ngroups_max;
 966         bool keep_groups = false;
 967         gid_t *groups = NULL;
 968         _cleanup_free_ gid_t *l_gids = NULL;
 969
 970         assert(c);
 971
 972         /*
 973          * If user is given, then lookup GID and supplementary groups list.
 974          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 975          * here and as early as possible so we keep the list of supplementary
 976          * groups of the caller.
 977          */
 978         if (user && gid_is_valid(gid) && gid != 0) {
 979                 /* First step, initialize groups from /etc/groups */
 980                 if (initgroups(user, gid) < 0)
 981                         return -errno;
 982
 983                 keep_groups = true;
 984         }
 985
 986         if (strv_isempty(c->supplementary_groups))
 987                 return 0;
 988
 989         /*
 990          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 991          * be positive, otherwise fail.
 992          */
 993         errno = 0;
 994         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 995         if (ngroups_max <= 0) {
 996                 if (errno > 0)
 997                         return -errno;
 998                 else
 999                         return -EOPNOTSUPP; /* For all other values */
1000         }
1001
1002         l_gids = new(gid_t, ngroups_max);
1003         if (!l_gids)
1004                 return -ENOMEM;
1005
1006         if (keep_groups) {
1007                 /*
1008                  * Lookup the list of groups that the user belongs to, we
1009                  * avoid NSS lookups here too for gid=0.
1010                  */
1011                 k = ngroups_max;
1012                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1013                         return -EINVAL;
1014         } else
1015                 k = 0;
1016
1017         STRV_FOREACH(i, c->supplementary_groups) {
1018                 const char *g;
1019
1020                 if (k >= ngroups_max)
1021                         return -E2BIG;
1022
1023                 g = *i;
1024                 r = get_group_creds(&g, l_gids+k);
1025                 if (r < 0)
1026                         return r;
1027
1028                 k++;
1029         }
1030
1031         /*
1032          * Sets ngids to zero to drop all supplementary groups, happens
1033          * when we are under root and SupplementaryGroups= is empty.
1034          */
1035         if (k == 0) {
1036                 *ngids = 0;
1037                 return 0;
1038         }
1039
1040         /* Otherwise get the final list of supplementary groups */
1041         groups = memdup(l_gids, sizeof(gid_t) * k);
1042         if (!groups)
1043                 return -ENOMEM;
1044
1045         *supplementary_gids = groups;
1046         *ngids = k;
1047
1048         groups = NULL;
1049
1050         return 0;
1051 }
1052
/* Applies the group credentials to the calling process: first the
 * supplementary group list (only if ngids is positive), then the real,
 * effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, a negative errno-style code on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* The supplementary list is only installed if there is something to install */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Now switch all three GIDs at once */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1071
1072 static int enforce_user(const ExecContext *context, uid_t uid) {
1073         assert(context);
1074
1075         if (!uid_is_valid(uid))
1076                 return 0;
1077
1078         /* Sets (but doesn't look up) the uid and make sure we keep the
1079          * capabilities while doing so. */
1080
1081         if (context->capability_ambient_set != 0) {
1082
1083                 /* First step: If we need to keep capabilities but
1084                  * drop privileges we need to make sure we keep our
1085                  * caps, while we drop privileges. */
1086                 if (uid != 0) {
1087                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1088
1089                         if (prctl(PR_GET_SECUREBITS) != sb)
1090                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1091                                         return -errno;
1092                 }
1093         }
1094
1095         /* Second step: actually set the uids */
1096         if (setresuid(uid, uid, uid) < 0)
1097                 return -errno;
1098
1099         /* At this point we should have all necessary capabilities but
1100            are otherwise a normal user. However, the caps might got
1101            corrupted due to the setresuid() so we need clean them up
1102            later. This is done outside of this call. */
1103
1104         return 0;
1105 }
1106
1107 #if HAVE_PAM
1108
1109 static int null_conv(
1110                 int num_msg,
1111                 const struct pam_message **msg,
1112                 struct pam_response **resp,
1113                 void *appdata_ptr) {
1114
1115         /* We don't support conversations */
1116
1117         return PAM_CONV_ERR;
1118 }
1119
1120 #endif
1121
/* Opens a PAM session for the specified service name and user, and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process goes away.
 *
 * name   - PAM service name, passed to pam_start()
 * user   - user name, passed to pam_start()
 * uid    - UID the helper process switches to
 * gid    - GID the helper process switches to
 * tty    - if non-NULL, set as PAM_TTY item
 * env    - in: environment exported to PAM via pam_putenv(); out: on success the old list is freed
 *          and replaced by the list returned by pam_getenvlist()
 * fds    - file descriptors that are closed in the helper process
 * n_fds  - number of entries in fds
 *
 * Returns 0 on success (and always if built without PAM support). On failure a negative
 * errno-style code is returned; failures reported by PAM itself are mapped to -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number; it neither returns a negative value nor sets errno. */
                                r = sigwait(&ss, &sig);
                                if (r != 0) {
                                        if (r == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment by the one PAM handed us */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1330
/* Renames the calling process after the last component of the given path,
 * wrapped in parentheses. Only the last 8 characters of the component are
 * used; if the component is empty the name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char label[11], *w = label;
        const char *base;
        size_t n;

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") so that it looks
         * pretty in /bin/ps: two parentheses plus at most 8 characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* Keep the tail rather than the head: the beginning is usually less interesting,
                 * since it might just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        *w++ = '(';
        memcpy(w, base, n);
        w += n;
        *w++ = ')';
        *w = 0;

        rename_process(label);
}
1361
1362 static bool context_has_address_families(const ExecContext *c) {
1363         assert(c);
1364
1365         return c->address_families_whitelist ||
1366                 !set_isempty(c->address_families);
1367 }
1368
1369 static bool context_has_syscall_filters(const ExecContext *c) {
1370         assert(c);
1371
1372         return c->syscall_whitelist ||
1373                 !hashmap_isempty(c->syscall_filter);
1374 }
1375
/* Returns true if "no new privileges" is needed for this context: either
 * because it is requested explicitly, or because we lack CAP_SYS_ADMIN and one
 * of the settings checked below is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* When privileged there's no need for NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* Unprivileged: any form of seccomp requires NNP. Check the options one by one. */
        if (context_has_address_families(c))
                return true;
        if (c->memory_deny_write_execute)
                return true;
        if (c->restrict_realtime)
                return true;
        if (exec_context_restrict_namespaces_set(c))
                return true;
        if (c->protect_kernel_tunables)
                return true;
        if (c->protect_kernel_modules)
                return true;
        if (c->private_devices)
                return true;
        if (context_has_syscall_filters(c))
                return true;
        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1397
1398 #if HAVE_SECCOMP
1399
1400 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1401
1402         if (is_seccomp_available())
1403                 return false;
1404
1405         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1406         return true;
1407 }
1408
1409 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1410         uint32_t negative_action, default_action, action;
1411         int r;
1412
1413         assert(u);
1414         assert(c);
1415
1416         if (!context_has_syscall_filters(c))
1417                 return 0;
1418
1419         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1420                 return 0;
1421
1422         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1423
1424         if (c->syscall_whitelist) {
1425                 default_action = negative_action;
1426                 action = SCMP_ACT_ALLOW;
1427         } else {
1428                 default_action = SCMP_ACT_ALLOW;
1429                 action = negative_action;
1430         }
1431
1432         if (needs_ambient_hack) {
1433                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1434                 if (r < 0)
1435                         return r;
1436         }
1437
1438         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1439 }
1440
1441 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1442         assert(u);
1443         assert(c);
1444
1445         if (set_isempty(c->syscall_archs))
1446                 return 0;
1447
1448         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1449                 return 0;
1450
1451         return seccomp_restrict_archs(c->syscall_archs);
1452 }
1453
1454 static int apply_address_families(const Unit* u, const ExecContext *c) {
1455         assert(u);
1456         assert(c);
1457
1458         if (!context_has_address_families(c))
1459                 return 0;
1460
1461         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1462                 return 0;
1463
1464         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1465 }
1466
1467 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1468         assert(u);
1469         assert(c);
1470
1471         if (!c->memory_deny_write_execute)
1472                 return 0;
1473
1474         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1475                 return 0;
1476
1477         return seccomp_memory_deny_write_execute();
1478 }
1479
1480 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1481         assert(u);
1482         assert(c);
1483
1484         if (!c->restrict_realtime)
1485                 return 0;
1486
1487         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1488                 return 0;
1489
1490         return seccomp_restrict_realtime();
1491 }
1492
1493 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1494         assert(u);
1495         assert(c);
1496
1497         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1498          * let's protect even those systems where this is left on in the kernel. */
1499
1500         if (!c->protect_kernel_tunables)
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1504                 return 0;
1505
1506         return seccomp_protect_sysctl();
1507 }
1508
1509 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1510         assert(u);
1511         assert(c);
1512
1513         /* Turn off module syscalls on ProtectKernelModules=yes */
1514
1515         if (!c->protect_kernel_modules)
1516                 return 0;
1517
1518         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1519                 return 0;
1520
1521         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1525         assert(u);
1526         assert(c);
1527
1528         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1529
1530         if (!c->private_devices)
1531                 return 0;
1532
1533         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1534                 return 0;
1535
1536         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1537 }
1538
1539 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1540         assert(u);
1541         assert(c);
1542
1543         if (!exec_context_restrict_namespaces_set(c))
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1547                 return 0;
1548
1549         return seccomp_restrict_namespaces(c->restrict_namespaces);
1550 }
1551
1552 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1553         unsigned long personality;
1554         int r;
1555
1556         assert(u);
1557         assert(c);
1558
1559         if (!c->lock_personality)
1560                 return 0;
1561
1562         if (skip_seccomp_unavailable(u, "LockPersonality="))
1563                 return 0;
1564
1565         personality = c->personality;
1566
1567         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1568         if (personality == PERSONALITY_INVALID) {
1569
1570                 r = opinionated_personality(&personality);
1571                 if (r < 0)
1572                         return r;
1573         }
1574
1575         return seccomp_lock_personality(personality);
1576 }
1577
1578 #endif
1579
1580 static void do_idle_pipe_dance(int idle_pipe[4]) {
1581         assert(idle_pipe);
1582
1583         idle_pipe[1] = safe_close(idle_pipe[1]);
1584         idle_pipe[2] = safe_close(idle_pipe[2]);
1585
1586         if (idle_pipe[0] >= 0) {
1587                 int r;
1588
1589                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1590
1591                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1592                         ssize_t n;
1593
1594                         /* Signal systemd that we are bored and want to continue. */
1595                         n = write(idle_pipe[3], "x", 1);
1596                         if (n > 0)
1597                                 /* Wait for systemd to react to the signal above. */
1598                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1599                 }
1600
1601                 idle_pipe[0] = safe_close(idle_pipe[0]);
1602
1603         }
1604
1605         idle_pipe[3] = safe_close(idle_pipe[3]);
1606 }
1607
1608 static int build_environment(
1609                 const Unit *u,
1610                 const ExecContext *c,
1611                 const ExecParameters *p,
1612                 unsigned n_fds,
1613                 const char *home,
1614                 const char *username,
1615                 const char *shell,
1616                 dev_t journal_stream_dev,
1617                 ino_t journal_stream_ino,
1618                 char ***ret) {
1619
1620         _cleanup_strv_free_ char **our_env = NULL;
1621         unsigned n_env = 0;
1622         char *x;
1623
1624         assert(u);
1625         assert(c);
1626         assert(ret);
1627
1628         our_env = new0(char*, 14);
1629         if (!our_env)
1630                 return -ENOMEM;
1631
1632         if (n_fds > 0) {
1633                 _cleanup_free_ char *joined = NULL;
1634
1635                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1636                         return -ENOMEM;
1637                 our_env[n_env++] = x;
1638
1639                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1640                         return -ENOMEM;
1641                 our_env[n_env++] = x;
1642
1643                 joined = strv_join(p->fd_names, ":");
1644                 if (!joined)
1645                         return -ENOMEM;
1646
1647                 x = strjoin("LISTEN_FDNAMES=", joined);
1648                 if (!x)
1649                         return -ENOMEM;
1650                 our_env[n_env++] = x;
1651         }
1652
1653         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1654                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1655                         return -ENOMEM;
1656                 our_env[n_env++] = x;
1657
1658                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1659                         return -ENOMEM;
1660                 our_env[n_env++] = x;
1661         }
1662
1663         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1664          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1665          * check the database directly. */
1666         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1667                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1668                 if (!x)
1669                         return -ENOMEM;
1670                 our_env[n_env++] = x;
1671         }
1672
1673         if (home) {
1674                 x = strappend("HOME=", home);
1675                 if (!x)
1676                         return -ENOMEM;
1677                 our_env[n_env++] = x;
1678         }
1679
1680         if (username) {
1681                 x = strappend("LOGNAME=", username);
1682                 if (!x)
1683                         return -ENOMEM;
1684                 our_env[n_env++] = x;
1685
1686                 x = strappend("USER=", username);
1687                 if (!x)
1688                         return -ENOMEM;
1689                 our_env[n_env++] = x;
1690         }
1691
1692         if (shell) {
1693                 x = strappend("SHELL=", shell);
1694                 if (!x)
1695                         return -ENOMEM;
1696                 our_env[n_env++] = x;
1697         }
1698
1699         if (!sd_id128_is_null(u->invocation_id)) {
1700                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1701                         return -ENOMEM;
1702
1703                 our_env[n_env++] = x;
1704         }
1705
1706         if (exec_context_needs_term(c)) {
1707                 const char *tty_path, *term = NULL;
1708
1709                 tty_path = exec_context_tty_path(c);
1710
1711                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1712                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1713                  * passes to PID 1 ends up all the way in the console login shown. */
1714
1715                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1716                         term = getenv("TERM");
1717                 if (!term)
1718                         term = default_term_for_tty(tty_path);
1719
1720                 x = strappend("TERM=", term);
1721                 if (!x)
1722                         return -ENOMEM;
1723                 our_env[n_env++] = x;
1724         }
1725
1726         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1727                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1728                         return -ENOMEM;
1729
1730                 our_env[n_env++] = x;
1731         }
1732
1733         our_env[n_env++] = NULL;
1734         assert(n_env <= 12);
1735
1736         *ret = TAKE_PTR(our_env);
1737
1738         return 0;
1739 }
1740
1741 static int build_pass_environment(const ExecContext *c, char ***ret) {
1742         _cleanup_strv_free_ char **pass_env = NULL;
1743         size_t n_env = 0, n_bufsize = 0;
1744         char **i;
1745
1746         STRV_FOREACH(i, c->pass_environment) {
1747                 _cleanup_free_ char *x = NULL;
1748                 char *v;
1749
1750                 v = getenv(*i);
1751                 if (!v)
1752                         continue;
1753                 x = strjoin(*i, "=", v);
1754                 if (!x)
1755                         return -ENOMEM;
1756
1757                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1758                         return -ENOMEM;
1759
1760                 pass_env[n_env++] = TAKE_PTR(x);
1761                 pass_env[n_env] = NULL;
1762         }
1763
1764         *ret = TAKE_PTR(pass_env);
1765
1766         return 0;
1767 }
1768
1769 static bool exec_needs_mount_namespace(
1770                 const ExecContext *context,
1771                 const ExecParameters *params,
1772                 const ExecRuntime *runtime) {
1773
1774         assert(context);
1775         assert(params);
1776
1777         if (context->root_image)
1778                 return true;
1779
1780         if (!strv_isempty(context->read_write_paths) ||
1781             !strv_isempty(context->read_only_paths) ||
1782             !strv_isempty(context->inaccessible_paths))
1783                 return true;
1784
1785         if (context->n_bind_mounts > 0)
1786                 return true;
1787
1788         if (context->n_temporary_filesystems > 0)
1789                 return true;
1790
1791         if (context->mount_flags != 0)
1792                 return true;
1793
1794         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1795                 return true;
1796
1797         if (context->private_devices ||
1798             context->protect_system != PROTECT_SYSTEM_NO ||
1799             context->protect_home != PROTECT_HOME_NO ||
1800             context->protect_kernel_tunables ||
1801             context->protect_kernel_modules ||
1802             context->protect_control_groups)
1803                 return true;
1804
1805         if (context->mount_apivfs && (context->root_image || context->root_directory))
1806                 return true;
1807
1808         if (context->dynamic_user &&
1809             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1810              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1811              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1812                 return true;
1813
1814         return false;
1815 }
1816
1817 static int setup_private_users(uid_t uid, gid_t gid) {
1818         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1819         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1820         _cleanup_close_ int unshare_ready_fd = -1;
1821         _cleanup_(sigkill_waitp) pid_t pid = 0;
1822         uint64_t c = 1;
1823         ssize_t n;
1824         int r;
1825
1826         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1827          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1828          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1829          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1830          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1831          * continues execution normally. */
1832
1833         if (uid != 0 && uid_is_valid(uid)) {
1834                 r = asprintf(&uid_map,
1835                              "0 0 1\n"                      /* Map root → root */
1836                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1837                              uid, uid);
1838                 if (r < 0)
1839                         return -ENOMEM;
1840         } else {
1841                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1842                 if (!uid_map)
1843                         return -ENOMEM;
1844         }
1845
1846         if (gid != 0 && gid_is_valid(gid)) {
1847                 r = asprintf(&gid_map,
1848                              "0 0 1\n"                      /* Map root → root */
1849                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1850                              gid, gid);
1851                 if (r < 0)
1852                         return -ENOMEM;
1853         } else {
1854                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1855                 if (!gid_map)
1856                         return -ENOMEM;
1857         }
1858
1859         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1860          * namespace. */
1861         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1862         if (unshare_ready_fd < 0)
1863                 return -errno;
1864
1865         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1866          * failed. */
1867         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1868                 return -errno;
1869
1870         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1871         if (r < 0)
1872                 return r;
1873         if (r == 0) {
1874                 _cleanup_close_ int fd = -1;
1875                 const char *a;
1876                 pid_t ppid;
1877
1878                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1879                  * here, after the parent opened its own user namespace. */
1880
1881                 ppid = getppid();
1882                 errno_pipe[0] = safe_close(errno_pipe[0]);
1883
1884                 /* Wait until the parent unshared the user namespace */
1885                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1886                         r = -errno;
1887                         goto child_fail;
1888                 }
1889
1890                 /* Disable the setgroups() system call in the child user namespace, for good. */
1891                 a = procfs_file_alloca(ppid, "setgroups");
1892                 fd = open(a, O_WRONLY|O_CLOEXEC);
1893                 if (fd < 0) {
1894                         if (errno != ENOENT) {
1895                                 r = -errno;
1896                                 goto child_fail;
1897                         }
1898
1899                         /* If the file is missing the kernel is too old, let's continue anyway. */
1900                 } else {
1901                         if (write(fd, "deny\n", 5) < 0) {
1902                                 r = -errno;
1903                                 goto child_fail;
1904                         }
1905
1906                         fd = safe_close(fd);
1907                 }
1908
1909                 /* First write the GID map */
1910                 a = procfs_file_alloca(ppid, "gid_map");
1911                 fd = open(a, O_WRONLY|O_CLOEXEC);
1912                 if (fd < 0) {
1913                         r = -errno;
1914                         goto child_fail;
1915                 }
1916                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1917                         r = -errno;
1918                         goto child_fail;
1919                 }
1920                 fd = safe_close(fd);
1921
1922                 /* The write the UID map */
1923                 a = procfs_file_alloca(ppid, "uid_map");
1924                 fd = open(a, O_WRONLY|O_CLOEXEC);
1925                 if (fd < 0) {
1926                         r = -errno;
1927                         goto child_fail;
1928                 }
1929                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1930                         r = -errno;
1931                         goto child_fail;
1932                 }
1933
1934                 _exit(EXIT_SUCCESS);
1935
1936         child_fail:
1937                 (void) write(errno_pipe[1], &r, sizeof(r));
1938                 _exit(EXIT_FAILURE);
1939         }
1940
1941         errno_pipe[1] = safe_close(errno_pipe[1]);
1942
1943         if (unshare(CLONE_NEWUSER) < 0)
1944                 return -errno;
1945
1946         /* Let the child know that the namespace is ready now */
1947         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1948                 return -errno;
1949
1950         /* Try to read an error code from the child */
1951         n = read(errno_pipe[0], &r, sizeof(r));
1952         if (n < 0)
1953                 return -errno;
1954         if (n == sizeof(r)) { /* an error code was sent to us */
1955                 if (r < 0)
1956                         return r;
1957                 return -EIO;
1958         }
1959         if (n != 0) /* on success we should have read 0 bytes */
1960                 return -EIO;
1961
1962         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1963         pid = 0;
1964         if (r < 0)
1965                 return r;
1966         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1967                 return -EIO;
1968
1969         return 0;
1970 }
1971
1972 static int setup_exec_directory(
1973                 const ExecContext *context,
1974                 const ExecParameters *params,
1975                 uid_t uid,
1976                 gid_t gid,
1977                 ExecDirectoryType type,
1978                 int *exit_status) {
1979
1980         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1981                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1982                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1983                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1984                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1985                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1986         };
1987         char **rt;
1988         int r;
1989
1990         assert(context);
1991         assert(params);
1992         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1993         assert(exit_status);
1994
1995         if (!params->prefix[type])
1996                 return 0;
1997
1998         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1999                 if (!uid_is_valid(uid))
2000                         uid = 0;
2001                 if (!gid_is_valid(gid))
2002                         gid = 0;
2003         }
2004
2005         STRV_FOREACH(rt, context->directories[type].paths) {
2006                 _cleanup_free_ char *p = NULL, *pp = NULL;
2007
2008                 p = strjoin(params->prefix[type], "/", *rt);
2009                 if (!p) {
2010                         r = -ENOMEM;
2011                         goto fail;
2012                 }
2013
2014                 r = mkdir_parents_label(p, 0755);
2015                 if (r < 0)
2016                         goto fail;
2017
2018                 if (context->dynamic_user &&
2019                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2020                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2021
2022                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2023                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2024                          * whose UID is later on reused. To lock this down we use the same trick used by container
2025                          * managers to prohibit host users to get access to files of the same UID in containers: we
2026                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2027                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2028                          * to make this directory permeable for the service itself.
2029                          *
2030                          * Specifically: for a service which wants a special directory "foo/" we first create a
2031                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2032                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2033                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2034                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2035                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2036                          * disabling the access boundary for the service and making sure it only gets access to the
2037                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2038                          *
2039                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2040                          * owned by the service itself.
2041                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2042                          * files or sockets with other services. */
2043
2044                         private_root = strjoin(params->prefix[type], "/private");
2045                         if (!private_root) {
2046                                 r = -ENOMEM;
2047                                 goto fail;
2048                         }
2049
2050                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2051                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2052                         if (r < 0)
2053                                 goto fail;
2054
2055                         pp = strjoin(private_root, "/", *rt);
2056                         if (!pp) {
2057                                 r = -ENOMEM;
2058                                 goto fail;
2059                         }
2060
2061                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2062                         r = mkdir_parents_label(pp, 0755);
2063                         if (r < 0)
2064                                 goto fail;
2065
2066                         if (is_dir(p, false) > 0 &&
2067                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2068
2069                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2070                                  * it over. Most likely the service has been upgraded from one that didn't use
2071                                  * DynamicUser=1, to one that does. */
2072
2073                                 if (rename(p, pp) < 0) {
2074                                         r = -errno;
2075                                         goto fail;
2076                                 }
2077                         } else {
2078                                 /* Otherwise, create the actual directory for the service */
2079
2080                                 r = mkdir_label(pp, context->directories[type].mode);
2081                                 if (r < 0 && r != -EEXIST)
2082                                         goto fail;
2083                         }
2084
2085                         parent = dirname_malloc(p);
2086                         if (!parent) {
2087                                 r = -ENOMEM;
2088                                 goto fail;
2089                         }
2090
2091                         r = path_make_relative(parent, pp, &relative);
2092                         if (r < 0)
2093                                 goto fail;
2094
2095                         /* And link it up from the original place */
2096                         r = symlink_idempotent(relative, p);
2097                         if (r < 0)
2098                                 goto fail;
2099
2100                         /* Lock down the access mode */
2101                         if (chmod(pp, context->directories[type].mode) < 0) {
2102                                 r = -errno;
2103                                 goto fail;
2104                         }
2105                 } else {
2106                         r = mkdir_label(p, context->directories[type].mode);
2107                         if (r == -EEXIST)
2108                                 continue;
2109                         if (r < 0)
2110                                 goto fail;
2111                 }
2112
2113                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2114                  * a service, and shall not be writable. */
2115                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2116                         continue;
2117
2118                 /* Then, change the ownership of the whole tree, if necessary */
2119                 r = path_chown_recursive(pp ?: p, uid, gid);
2120                 if (r < 0)
2121                         goto fail;
2122         }
2123
2124         return 0;
2125
2126 fail:
2127         *exit_status = exit_status_table[type];
2128         return r;
2129 }
2130
#if ENABLE_SMACK
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        /* Applies the SMACK label for the process we are about to execute: the explicitly configured one if there
         * is any; otherwise, if a default label was configured at build time, the exec label read from the
         * binary, falling back to that default. Returns 0 on success, a negative error otherwise. */

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label_from_binary = NULL;

                /* A binary without the attribute, or a file system without support for it, is fine */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &label_from_binary);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, label_from_binary ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2163
2164 static int compile_bind_mounts(
2165                 const ExecContext *context,
2166                 const ExecParameters *params,
2167                 BindMount **ret_bind_mounts,
2168                 unsigned *ret_n_bind_mounts,
2169                 char ***ret_empty_directories) {
2170
2171         _cleanup_strv_free_ char **empty_directories = NULL;
2172         BindMount *bind_mounts;
2173         unsigned n, h = 0, i;
2174         ExecDirectoryType t;
2175         int r;
2176
2177         assert(context);
2178         assert(params);
2179         assert(ret_bind_mounts);
2180         assert(ret_n_bind_mounts);
2181         assert(ret_empty_directories);
2182
2183         n = context->n_bind_mounts;
2184         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2185                 if (!params->prefix[t])
2186                         continue;
2187
2188                 n += strv_length(context->directories[t].paths);
2189         }
2190
2191         if (n <= 0) {
2192                 *ret_bind_mounts = NULL;
2193                 *ret_n_bind_mounts = 0;
2194                 *ret_empty_directories = NULL;
2195                 return 0;
2196         }
2197
2198         bind_mounts = new(BindMount, n);
2199         if (!bind_mounts)
2200                 return -ENOMEM;
2201
2202         for (i = 0; i < context->n_bind_mounts; i++) {
2203                 BindMount *item = context->bind_mounts + i;
2204                 char *s, *d;
2205
2206                 s = strdup(item->source);
2207                 if (!s) {
2208                         r = -ENOMEM;
2209                         goto finish;
2210                 }
2211
2212                 d = strdup(item->destination);
2213                 if (!d) {
2214                         free(s);
2215                         r = -ENOMEM;
2216                         goto finish;
2217                 }
2218
2219                 bind_mounts[h++] = (BindMount) {
2220                         .source = s,
2221                         .destination = d,
2222                         .read_only = item->read_only,
2223                         .recursive = item->recursive,
2224                         .ignore_enoent = item->ignore_enoent,
2225                 };
2226         }
2227
2228         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2229                 char **suffix;
2230
2231                 if (!params->prefix[t])
2232                         continue;
2233
2234                 if (strv_isempty(context->directories[t].paths))
2235                         continue;
2236
2237                 if (context->dynamic_user &&
2238                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2239                         char *private_root;
2240
2241                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2242                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2243                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2244
2245                         private_root = strjoin(params->prefix[t], "/private");
2246                         if (!private_root) {
2247                                 r = -ENOMEM;
2248                                 goto finish;
2249                         }
2250
2251                         r = strv_consume(&empty_directories, private_root);
2252                         if (r < 0)
2253                                 goto finish;
2254                 }
2255
2256                 STRV_FOREACH(suffix, context->directories[t].paths) {
2257                         char *s, *d;
2258
2259                         if (context->dynamic_user &&
2260                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2261                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2262                         else
2263                                 s = strjoin(params->prefix[t], "/", *suffix);
2264                         if (!s) {
2265                                 r = -ENOMEM;
2266                                 goto finish;
2267                         }
2268
2269                         d = strdup(s);
2270                         if (!d) {
2271                                 free(s);
2272                                 r = -ENOMEM;
2273                                 goto finish;
2274                         }
2275
2276                         bind_mounts[h++] = (BindMount) {
2277                                 .source = s,
2278                                 .destination = d,
2279                                 .read_only = false,
2280                                 .recursive = true,
2281                                 .ignore_enoent = false,
2282                         };
2283                 }
2284         }
2285
2286         assert(h == n);
2287
2288         *ret_bind_mounts = bind_mounts;
2289         *ret_n_bind_mounts = n;
2290         *ret_empty_directories = TAKE_PTR(empty_directories);
2291
2292         return (int) n;
2293
2294 finish:
2295         bind_mount_free_many(bind_mounts, h);
2296         return r;
2297 }
2298
2299 static int apply_mount_namespace(
2300                 const Unit *u,
2301                 const ExecCommand *command,
2302                 const ExecContext *context,
2303                 const ExecParameters *params,
2304                 const ExecRuntime *runtime) {
2305
2306         _cleanup_strv_free_ char **empty_directories = NULL;
2307         char *tmp = NULL, *var = NULL;
2308         const char *root_dir = NULL, *root_image = NULL;
2309         NamespaceInfo ns_info = {
2310                 .ignore_protect_paths = false,
2311                 .private_dev = context->private_devices,
2312                 .protect_control_groups = context->protect_control_groups,
2313                 .protect_kernel_tunables = context->protect_kernel_tunables,
2314                 .protect_kernel_modules = context->protect_kernel_modules,
2315                 .mount_apivfs = context->mount_apivfs,
2316         };
2317         bool needs_sandboxing;
2318         BindMount *bind_mounts = NULL;
2319         unsigned n_bind_mounts = 0;
2320         int r;
2321
2322         assert(context);
2323
2324         /* The runtime struct only contains the parent of the private /tmp,
2325          * which is non-accessible to world users. Inside of it there's a /tmp
2326          * that is sticky, and that's the one we want to use here. */
2327
2328         if (context->private_tmp && runtime) {
2329                 if (runtime->tmp_dir)
2330                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2331                 if (runtime->var_tmp_dir)
2332                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2333         }
2334
2335         if (params->flags & EXEC_APPLY_CHROOT) {
2336                 root_image = context->root_image;
2337
2338                 if (!root_image)
2339                         root_dir = context->root_directory;
2340         }
2341
2342         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2343         if (r < 0)
2344                 return r;
2345
2346         /*
2347          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2348          * sandbox info, otherwise enforce it, don't ignore protected paths and
2349          * fail if we are enable to apply the sandbox inside the mount namespace.
2350          */
2351         if (!context->dynamic_user && root_dir)
2352                 ns_info.ignore_protect_paths = true;
2353
2354         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2355
2356         r = setup_namespace(root_dir, root_image,
2357                             &ns_info, context->read_write_paths,
2358                             needs_sandboxing ? context->read_only_paths : NULL,
2359                             needs_sandboxing ? context->inaccessible_paths : NULL,
2360                             empty_directories,
2361                             bind_mounts,
2362                             n_bind_mounts,
2363                             context->temporary_filesystems,
2364                             context->n_temporary_filesystems,
2365                             tmp,
2366                             var,
2367                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2368                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2369                             context->mount_flags,
2370                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2371
2372         bind_mount_free_many(bind_mounts, n_bind_mounts);
2373
2374         /* If we couldn't set up the namespace this is probably due to a
2375          * missing capability. In this case, silently proceeed. */
2376         if (IN_SET(r, -EPERM, -EACCES)) {
2377                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2378                 return 0;
2379         }
2380
2381         return r;
2382 }
2383
2384 static int apply_working_directory(
2385                 const ExecContext *context,
2386                 const ExecParameters *params,
2387                 const char *home,
2388                 const bool needs_mount_ns,
2389                 int *exit_status) {
2390
2391         const char *d, *wd;
2392
2393         assert(context);
2394         assert(exit_status);
2395
2396         if (context->working_directory_home) {
2397
2398                 if (!home) {
2399                         *exit_status = EXIT_CHDIR;
2400                         return -ENXIO;
2401                 }
2402
2403                 wd = home;
2404
2405         } else if (context->working_directory)
2406                 wd = context->working_directory;
2407         else
2408                 wd = "/";
2409
2410         if (params->flags & EXEC_APPLY_CHROOT) {
2411                 if (!needs_mount_ns && context->root_directory)
2412                         if (chroot(context->root_directory) < 0) {
2413                                 *exit_status = EXIT_CHROOT;
2414                                 return -errno;
2415                         }
2416
2417                 d = wd;
2418         } else
2419                 d = prefix_roota(context->root_directory, wd);
2420
2421         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2422                 *exit_status = EXIT_CHDIR;
2423                 return -errno;
2424         }
2425
2426         return 0;
2427 }
2428
2429 static int setup_keyring(
2430                 const Unit *u,
2431                 const ExecContext *context,
2432                 const ExecParameters *p,
2433                 uid_t uid, gid_t gid) {
2434
2435         key_serial_t keyring;
2436         int r = 0;
2437         uid_t saved_uid;
2438         gid_t saved_gid;
2439
2440         assert(u);
2441         assert(context);
2442         assert(p);
2443
2444         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2445          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2446          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2447          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2448          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2449          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2450
2451         if (!(p->flags & EXEC_NEW_KEYRING))
2452                 return 0;
2453
2454         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2455                 return 0;
2456
2457         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2458          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2459          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2460          * & group is just as nasty as acquiring a reference to the user keyring. */
2461
2462         saved_uid = getuid();
2463         saved_gid = getgid();
2464
2465         if (gid_is_valid(gid) && gid != saved_gid) {
2466                 if (setregid(gid, -1) < 0)
2467                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2468         }
2469
2470         if (uid_is_valid(uid) && uid != saved_uid) {
2471                 if (setreuid(uid, -1) < 0) {
2472                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2473                         goto out;
2474                 }
2475         }
2476
2477         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2478         if (keyring == -1) {
2479                 if (errno == ENOSYS)
2480                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2481                 else if (IN_SET(errno, EACCES, EPERM))
2482                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2483                 else if (errno == EDQUOT)
2484                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2485                 else
2486                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2487
2488                 goto out;
2489         }
2490
2491         /* When requested link the user keyring into the session keyring. */
2492         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2493
2494                 if (keyctl(KEYCTL_LINK,
2495                            KEY_SPEC_USER_KEYRING,
2496                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2497                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2498                         goto out;
2499                 }
2500         }
2501
2502         /* Restore uid/gid back */
2503         if (uid_is_valid(uid) && uid != saved_uid) {
2504                 if (setreuid(saved_uid, -1) < 0) {
2505                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2506                         goto out;
2507                 }
2508         }
2509
2510         if (gid_is_valid(gid) && gid != saved_gid) {
2511                 if (setregid(saved_gid, -1) < 0)
2512                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2513         }
2514
2515         /* Populate the keyring with the invocation ID by default, as original saved_uid. */
2516         if (!sd_id128_is_null(u->invocation_id)) {
2517                 key_serial_t key;
2518
2519                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2520                 if (key == -1)
2521                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2522                 else {
2523                         if (keyctl(KEYCTL_SETPERM, key,
2524                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2525                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2526                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2527                 }
2528         }
2529
2530 out:
2531         /* Revert back uid & gid for the last time, and exit */
2532         /* no extra logging, as only the first already reported error matters */
2533         if (getuid() != saved_uid)
2534                 (void) setreuid(saved_uid, -1);
2535
2536         if (getgid() != saved_gid)
2537                 (void) setregid(saved_gid, -1);
2538
2539         return r;
2540 }
2541
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to array[], bumping the fill
 * counter *n once for every descriptor stored. A NULL pair is accepted and contributes nothing. The caller
 * must make sure array[] has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, const int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair)
                for (k = 0; k < 2; k++) {
                        /* Negative entries denote "no fd" and are skipped */
                        if (pair[k] < 0)
                                continue;

                        array[*n] = pair[k];
                        (*n)++;
                }
}
2554
2555 static int close_remaining_fds(
2556                 const ExecParameters *params,
2557                 const ExecRuntime *runtime,
2558                 const DynamicCreds *dcreds,
2559                 int user_lookup_fd,
2560                 int socket_fd,
2561                 int *fds, unsigned n_fds) {
2562
2563         unsigned n_dont_close = 0;
2564         int dont_close[n_fds + 12];
2565
2566         assert(params);
2567
2568         if (params->stdin_fd >= 0)
2569                 dont_close[n_dont_close++] = params->stdin_fd;
2570         if (params->stdout_fd >= 0)
2571                 dont_close[n_dont_close++] = params->stdout_fd;
2572         if (params->stderr_fd >= 0)
2573                 dont_close[n_dont_close++] = params->stderr_fd;
2574
2575         if (socket_fd >= 0)
2576                 dont_close[n_dont_close++] = socket_fd;
2577         if (n_fds > 0) {
2578                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2579                 n_dont_close += n_fds;
2580         }
2581
2582         if (runtime)
2583                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2584
2585         if (dcreds) {
2586                 if (dcreds->user)
2587                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2588                 if (dcreds->group)
2589                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2590         }
2591
2592         if (user_lookup_fd >= 0)
2593                 dont_close[n_dont_close++] = user_lookup_fd;
2594
2595         return close_all_fds(dont_close, n_dont_close);
2596 }
2597
2598 static int send_user_lookup(
2599                 Unit *unit,
2600                 int user_lookup_fd,
2601                 uid_t uid,
2602                 gid_t gid) {
2603
2604         assert(unit);
2605
2606         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2607          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2608          * specified. */
2609
2610         if (user_lookup_fd < 0)
2611                 return 0;
2612
2613         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2614                 return 0;
2615
2616         if (writev(user_lookup_fd,
2617                (struct iovec[]) {
2618                            IOVEC_INIT(&uid, sizeof(uid)),
2619                            IOVEC_INIT(&gid, sizeof(gid)),
2620                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2621                 return -errno;
2622
2623         return 0;
2624 }
2625
2626 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2627         int r;
2628
2629         assert(c);
2630         assert(home);
2631         assert(buf);
2632
2633         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2634
2635         if (*home)
2636                 return 0;
2637
2638         if (!c->working_directory_home)
2639                 return 0;
2640
2641         if (uid == 0) {
2642                 /* Hardcode /root as home directory for UID 0 */
2643                 *home = "/root";
2644                 return 1;
2645         }
2646
2647         r = get_home_dir(buf);
2648         if (r < 0)
2649                 return r;
2650
2651         *home = *buf;
2652         return 1;
2653 }
2654
2655 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2656         _cleanup_strv_free_ char ** list = NULL;
2657         ExecDirectoryType t;
2658         int r;
2659
2660         assert(c);
2661         assert(p);
2662         assert(ret);
2663
2664         assert(c->dynamic_user);
2665
2666         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2667          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2668          * directories. */
2669
2670         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2671                 char **i;
2672
2673                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2674                         continue;
2675
2676                 if (!p->prefix[t])
2677                         continue;
2678
2679                 STRV_FOREACH(i, c->directories[t].paths) {
2680                         char *e;
2681
2682                         if (t == EXEC_DIRECTORY_RUNTIME)
2683                                 e = strjoin(p->prefix[t], "/", *i);
2684                         else
2685                                 e = strjoin(p->prefix[t], "/private/", *i);
2686                         if (!e)
2687                                 return -ENOMEM;
2688
2689                         r = strv_consume(&list, e);
2690                         if (r < 0)
2691                                 return r;
2692                 }
2693         }
2694
2695         *ret = TAKE_PTR(list);
2696
2697         return 0;
2698 }
2699
2700 static char *exec_command_line(char **argv);
2701
2702 static int exec_child(
2703                 Unit *unit,
2704                 const ExecCommand *command,
2705                 const ExecContext *context,
2706                 const ExecParameters *params,
2707                 ExecRuntime *runtime,
2708                 DynamicCreds *dcreds,
2709                 char **argv,
2710                 int socket_fd,
2711                 int named_iofds[3],
2712                 int *fds,
2713                 unsigned n_storage_fds,
2714                 unsigned n_socket_fds,
2715                 char **files_env,
2716                 int user_lookup_fd,
2717                 int *exit_status) {
2718
2719         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2720         _cleanup_free_ char *home_buffer = NULL;
2721         _cleanup_free_ gid_t *supplementary_gids = NULL;
2722         const char *username = NULL, *groupname = NULL;
2723         const char *home = NULL, *shell = NULL;
2724         dev_t journal_stream_dev = 0;
2725         ino_t journal_stream_ino = 0;
2726         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2727                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2728                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2729                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2730 #if HAVE_SELINUX
2731         _cleanup_free_ char *mac_selinux_context_net = NULL;
2732         bool use_selinux = false;
2733 #endif
2734 #if ENABLE_SMACK
2735         bool use_smack = false;
2736 #endif
2737 #if HAVE_APPARMOR
2738         bool use_apparmor = false;
2739 #endif
2740         uid_t uid = UID_INVALID;
2741         gid_t gid = GID_INVALID;
2742         int i, r, ngids = 0;
2743         unsigned n_fds;
2744         ExecDirectoryType dt;
2745         int secure_bits;
2746
2747         assert(unit);
2748         assert(command);
2749         assert(context);
2750         assert(params);
2751         assert(exit_status);
2752
2753         rename_process_from_path(command->path);
2754
2755         /* We reset exactly these signals, since they are the
2756          * only ones we set to SIG_IGN in the main daemon. All
2757          * others we leave untouched because we set them to
2758          * SIG_DFL or a valid handler initially, both of which
2759          * will be demoted to SIG_DFL. */
2760         (void) default_signals(SIGNALS_CRASH_HANDLER,
2761                                SIGNALS_IGNORE, -1);
2762
2763         if (context->ignore_sigpipe)
2764                 (void) ignore_signals(SIGPIPE, -1);
2765
2766         r = reset_signal_mask();
2767         if (r < 0) {
2768                 *exit_status = EXIT_SIGNAL_MASK;
2769                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2770         }
2771
2772         if (params->idle_pipe)
2773                 do_idle_pipe_dance(params->idle_pipe);
2774
2775         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2776          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2777          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2778          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2779
2780         log_forget_fds();
2781         log_set_open_when_needed(true);
2782
2783         /* In case anything used libc syslog(), close this here, too */
2784         closelog();
2785
2786         n_fds = n_storage_fds + n_socket_fds;
2787         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2788         if (r < 0) {
2789                 *exit_status = EXIT_FDS;
2790                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2791         }
2792
2793         if (!context->same_pgrp)
2794                 if (setsid() < 0) {
2795                         *exit_status = EXIT_SETSID;
2796                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2797                 }
2798
2799         exec_context_tty_reset(context, params);
2800
2801         if (unit_shall_confirm_spawn(unit)) {
2802                 const char *vc = params->confirm_spawn;
2803                 _cleanup_free_ char *cmdline = NULL;
2804
2805                 cmdline = exec_command_line(argv);
2806                 if (!cmdline) {
2807                         *exit_status = EXIT_MEMORY;
2808                         return log_oom();
2809                 }
2810
2811                 r = ask_for_confirmation(vc, unit, cmdline);
2812                 if (r != CONFIRM_EXECUTE) {
2813                         if (r == CONFIRM_PRETEND_SUCCESS) {
2814                                 *exit_status = EXIT_SUCCESS;
2815                                 return 0;
2816                         }
2817                         *exit_status = EXIT_CONFIRM;
2818                         log_unit_error(unit, "Execution cancelled by the user");
2819                         return -ECANCELED;
2820                 }
2821         }
2822
2823         if (context->dynamic_user && dcreds) {
2824                 _cleanup_strv_free_ char **suggested_paths = NULL;
2825
2826                 /* Make sure we bypass our own NSS module for any NSS checks */
2827                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2828                         *exit_status = EXIT_USER;
2829                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2830                 }
2831
2832                 r = compile_suggested_paths(context, params, &suggested_paths);
2833                 if (r < 0) {
2834                         *exit_status = EXIT_MEMORY;
2835                         return log_oom();
2836                 }
2837
2838                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2839                 if (r < 0) {
2840                         *exit_status = EXIT_USER;
2841                         if (r == -EILSEQ) {
2842                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2843                                 return -EOPNOTSUPP;
2844                         }
2845                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2846                 }
2847
2848                 if (!uid_is_valid(uid)) {
2849                         *exit_status = EXIT_USER;
2850                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2851                         return -ESRCH;
2852                 }
2853
2854                 if (!gid_is_valid(gid)) {
2855                         *exit_status = EXIT_USER;
2856                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2857                         return -ESRCH;
2858                 }
2859
2860                 if (dcreds->user)
2861                         username = dcreds->user->name;
2862
2863         } else {
2864                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2865                 if (r < 0) {
2866                         *exit_status = EXIT_USER;
2867                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2868                 }
2869
2870                 r = get_fixed_group(context, &groupname, &gid);
2871                 if (r < 0) {
2872                         *exit_status = EXIT_GROUP;
2873                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2874                 }
2875         }
2876
2877         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2878         r = get_supplementary_groups(context, username, groupname, gid,
2879                                      &supplementary_gids, &ngids);
2880         if (r < 0) {
2881                 *exit_status = EXIT_GROUP;
2882                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2883         }
2884
2885         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2886         if (r < 0) {
2887                 *exit_status = EXIT_USER;
2888                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2889         }
2890
2891         user_lookup_fd = safe_close(user_lookup_fd);
2892
2893         r = acquire_home(context, uid, &home, &home_buffer);
2894         if (r < 0) {
2895                 *exit_status = EXIT_CHDIR;
2896                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2897         }
2898
2899         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2900          * must be sure to drop O_NONBLOCK */
2901         if (socket_fd >= 0)
2902                 (void) fd_nonblock(socket_fd, false);
2903
2904         r = setup_input(context, params, socket_fd, named_iofds);
2905         if (r < 0) {
2906                 *exit_status = EXIT_STDIN;
2907                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2908         }
2909
2910         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2911         if (r < 0) {
2912                 *exit_status = EXIT_STDOUT;
2913                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2914         }
2915
2916         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2917         if (r < 0) {
2918                 *exit_status = EXIT_STDERR;
2919                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2920         }
2921
2922         if (params->cgroup_path) {
2923                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2924                 if (r < 0) {
2925                         *exit_status = EXIT_CGROUP;
2926                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2927                 }
2928         }
2929
2930         if (context->oom_score_adjust_set) {
2931                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2932
2933                 /* When we can't make this change due to EPERM, then
2934                  * let's silently skip over it. User namespaces
2935                  * prohibit write access to this file, and we
2936                  * shouldn't trip up over that. */
2937
2938                 sprintf(t, "%i", context->oom_score_adjust);
2939                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2940                 if (IN_SET(r, -EPERM, -EACCES))
2941                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2942                 else if (r < 0) {
2943                         *exit_status = EXIT_OOM_ADJUST;
2944                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2945                 }
2946         }
2947
2948         if (context->nice_set)
2949                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2950                         *exit_status = EXIT_NICE;
2951                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2952                 }
2953
2954         if (context->cpu_sched_set) {
2955                 struct sched_param param = {
2956                         .sched_priority = context->cpu_sched_priority,
2957                 };
2958
2959                 r = sched_setscheduler(0,
2960                                        context->cpu_sched_policy |
2961                                        (context->cpu_sched_reset_on_fork ?
2962                                         SCHED_RESET_ON_FORK : 0),
2963                                        &param);
2964                 if (r < 0) {
2965                         *exit_status = EXIT_SETSCHEDULER;
2966                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2967                 }
2968         }
2969
2970         if (context->cpuset)
2971                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2972                         *exit_status = EXIT_CPUAFFINITY;
2973                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2974                 }
2975
2976         if (context->ioprio_set)
2977                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2978                         *exit_status = EXIT_IOPRIO;
2979                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2980                 }
2981
2982         if (context->timer_slack_nsec != NSEC_INFINITY)
2983                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2984                         *exit_status = EXIT_TIMERSLACK;
2985                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2986                 }
2987
2988         if (context->personality != PERSONALITY_INVALID) {
2989                 r = safe_personality(context->personality);
2990                 if (r < 0) {
2991                         *exit_status = EXIT_PERSONALITY;
2992                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2993                 }
2994         }
2995
2996         if (context->utmp_id)
2997                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2998                                       context->tty_path,
2999                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3000                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3001                                       USER_PROCESS,
3002                                       username);
3003
3004         if (context->user) {
3005                 r = chown_terminal(STDIN_FILENO, uid);
3006                 if (r < 0) {
3007                         *exit_status = EXIT_STDIN;
3008                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3009                 }
3010         }
3011
3012         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3013          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3014          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3015          * touch a single hierarchy too. */
3016         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3017                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3018                 if (r < 0) {
3019                         *exit_status = EXIT_CGROUP;
3020                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3021                 }
3022         }
3023
3024         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3025                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3026                 if (r < 0)
3027                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3028         }
3029
3030         r = build_environment(
3031                         unit,
3032                         context,
3033                         params,
3034                         n_fds,
3035                         home,
3036                         username,
3037                         shell,
3038                         journal_stream_dev,
3039                         journal_stream_ino,
3040                         &our_env);
3041         if (r < 0) {
3042                 *exit_status = EXIT_MEMORY;
3043                 return log_oom();
3044         }
3045
3046         r = build_pass_environment(context, &pass_env);
3047         if (r < 0) {
3048                 *exit_status = EXIT_MEMORY;
3049                 return log_oom();
3050         }
3051
3052         accum_env = strv_env_merge(5,
3053                                    params->environment,
3054                                    our_env,
3055                                    pass_env,
3056                                    context->environment,
3057                                    files_env,
3058                                    NULL);
3059         if (!accum_env) {
3060                 *exit_status = EXIT_MEMORY;
3061                 return log_oom();
3062         }
3063         accum_env = strv_env_clean(accum_env);
3064
3065         (void) umask(context->umask);
3066
3067         r = setup_keyring(unit, context, params, uid, gid);
3068         if (r < 0) {
3069                 *exit_status = EXIT_KEYRING;
3070                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3071         }
3072
3073         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3074         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3075
3076         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3077         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3078
3079         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3080         if (needs_ambient_hack)
3081                 needs_setuid = false;
3082         else
3083                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3084
3085         if (needs_sandboxing) {
3086                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3087                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3088                  * impacting our own code paths. */
3089
3090 #if HAVE_SELINUX
3091                 use_selinux = mac_selinux_use();
3092 #endif
3093 #if ENABLE_SMACK
3094                 use_smack = mac_smack_use();
3095 #endif
3096 #if HAVE_APPARMOR
3097                 use_apparmor = mac_apparmor_use();
3098 #endif
3099         }
3100
3101         if (needs_setuid) {
3102                 if (context->pam_name && username) {
3103                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3104                         if (r < 0) {
3105                                 *exit_status = EXIT_PAM;
3106                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3107                         }
3108                 }
3109         }
3110
3111         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3112                 if (ns_type_supported(NAMESPACE_NET)) {
3113                         r = setup_netns(runtime->netns_storage_socket);
3114                         if (r < 0) {
3115                                 *exit_status = EXIT_NETWORK;
3116                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3117                         }
3118                 } else
3119                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3120         }
3121
3122         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3123         if (needs_mount_namespace) {
3124                 r = apply_mount_namespace(unit, command, context, params, runtime);
3125                 if (r < 0) {
3126                         *exit_status = EXIT_NAMESPACE;
3127                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3128                 }
3129         }
3130
3131         /* Apply just after mount namespace setup */
3132         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3133         if (r < 0)
3134                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3135
3136         /* Drop groups as early as possible */
3137         if (needs_setuid) {
3138                 r = enforce_groups(gid, supplementary_gids, ngids);
3139                 if (r < 0) {
3140                         *exit_status = EXIT_GROUP;
3141                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3142                 }
3143         }
3144
3145         if (needs_sandboxing) {
3146 #if HAVE_SELINUX
3147                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3148                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3149                         if (r < 0) {
3150                                 *exit_status = EXIT_SELINUX_CONTEXT;
3151                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3152                         }
3153                 }
3154 #endif
3155
3156                 if (context->private_users) {
3157                         r = setup_private_users(uid, gid);
3158                         if (r < 0) {
3159                                 *exit_status = EXIT_USER;
3160                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3161                         }
3162                 }
3163         }
3164
3165         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3166          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3167          * was needed to upload the policy and can now be closed as well. */
3168         r = close_all_fds(fds, n_fds);
3169         if (r >= 0)
3170                 r = shift_fds(fds, n_fds);
3171         if (r >= 0)
3172                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3173         if (r < 0) {
3174                 *exit_status = EXIT_FDS;
3175                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3176         }
3177
3178         secure_bits = context->secure_bits;
3179
3180         if (needs_sandboxing) {
3181                 uint64_t bset;
3182
3183                 for (i = 0; i < _RLIMIT_MAX; i++) {
3184
3185                         if (!context->rlimit[i])
3186                                 continue;
3187
3188                         r = setrlimit_closest(i, context->rlimit[i]);
3189                         if (r < 0) {
3190                                 *exit_status = EXIT_LIMITS;
3191                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3192                         }
3193                 }
3194
3195                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3196                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3197                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3198                                 *exit_status = EXIT_LIMITS;
3199                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3200                         }
3201                 }
3202
3203 #if ENABLE_SMACK
3204                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3205                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3206                 if (use_smack) {
3207                         r = setup_smack(context, command);
3208                         if (r < 0) {
3209                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3210                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3211                         }
3212                 }
3213 #endif
3214
3215                 bset = context->capability_bounding_set;
3216                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3217                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3218                  * instead of us doing that */
3219                 if (needs_ambient_hack)
3220                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3221                                 (UINT64_C(1) << CAP_SETUID) |
3222                                 (UINT64_C(1) << CAP_SETGID);
3223
3224                 if (!cap_test_all(bset)) {
3225                         r = capability_bounding_set_drop(bset, false);
3226                         if (r < 0) {
3227                                 *exit_status = EXIT_CAPABILITIES;
3228                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3229                         }
3230                 }
3231
3232                 /* This is done before enforce_user, but ambient set
3233                  * does not survive over setresuid() if keep_caps is not set. */
3234                 if (!needs_ambient_hack &&
3235                     context->capability_ambient_set != 0) {
3236                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3237                         if (r < 0) {
3238                                 *exit_status = EXIT_CAPABILITIES;
3239                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3240                         }
3241                 }
3242         }
3243
3244         if (needs_setuid) {
3245                 if (context->user) {
3246                         r = enforce_user(context, uid);
3247                         if (r < 0) {
3248                                 *exit_status = EXIT_USER;
3249                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3250                         }
3251
3252                         if (!needs_ambient_hack &&
3253                             context->capability_ambient_set != 0) {
3254
3255                                 /* Fix the ambient capabilities after user change. */
3256                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3257                                 if (r < 0) {
3258                                         *exit_status = EXIT_CAPABILITIES;
3259                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3260                                 }
3261
3262                                 /* If we were asked to change user and ambient capabilities
3263                                  * were requested, we had to add keep-caps to the securebits
3264                                  * so that we would maintain the inherited capability set
3265                                  * through the setresuid(). Make sure that the bit is added
3266                                  * also to the context secure_bits so that we don't try to
3267                                  * drop the bit away next. */
3268
3269                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3270                         }
3271                 }
3272         }
3273
3274         if (needs_sandboxing) {
3275                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3276                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3277                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3278                  * are restricted. */
3279
3280 #if HAVE_SELINUX
3281                 if (use_selinux) {
3282                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3283
3284                         if (exec_context) {
3285                                 r = setexeccon(exec_context);
3286                                 if (r < 0) {
3287                                         *exit_status = EXIT_SELINUX_CONTEXT;
3288                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3289                                 }
3290                         }
3291                 }
3292 #endif
3293
3294 #if HAVE_APPARMOR
3295                 if (use_apparmor && context->apparmor_profile) {
3296                         r = aa_change_onexec(context->apparmor_profile);
3297                         if (r < 0 && !context->apparmor_profile_ignore) {
3298                                 *exit_status = EXIT_APPARMOR_PROFILE;
3299                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3300                         }
3301                 }
3302 #endif
3303
3304                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3305                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3306                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3307                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3308                                 *exit_status = EXIT_SECUREBITS;
3309                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3310                         }
3311
3312                 if (context_has_no_new_privileges(context))
3313                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3314                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3315                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3316                         }
3317
3318 #if HAVE_SECCOMP
3319                 r = apply_address_families(unit, context);
3320                 if (r < 0) {
3321                         *exit_status = EXIT_ADDRESS_FAMILIES;
3322                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3323                 }
3324
3325                 r = apply_memory_deny_write_execute(unit, context);
3326                 if (r < 0) {
3327                         *exit_status = EXIT_SECCOMP;
3328                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3329                 }
3330
3331                 r = apply_restrict_realtime(unit, context);
3332                 if (r < 0) {
3333                         *exit_status = EXIT_SECCOMP;
3334                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3335                 }
3336
3337                 r = apply_restrict_namespaces(unit, context);
3338                 if (r < 0) {
3339                         *exit_status = EXIT_SECCOMP;
3340                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3341                 }
3342
3343                 r = apply_protect_sysctl(unit, context);
3344                 if (r < 0) {
3345                         *exit_status = EXIT_SECCOMP;
3346                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3347                 }
3348
3349                 r = apply_protect_kernel_modules(unit, context);
3350                 if (r < 0) {
3351                         *exit_status = EXIT_SECCOMP;
3352                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3353                 }
3354
3355                 r = apply_private_devices(unit, context);
3356                 if (r < 0) {
3357                         *exit_status = EXIT_SECCOMP;
3358                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3359                 }
3360
3361                 r = apply_syscall_archs(unit, context);
3362                 if (r < 0) {
3363                         *exit_status = EXIT_SECCOMP;
3364                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3365                 }
3366
3367                 r = apply_lock_personality(unit, context);
3368                 if (r < 0) {
3369                         *exit_status = EXIT_SECCOMP;
3370                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3371                 }
3372
3373                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3374                  * by the filter as little as possible. */
3375                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3376                 if (r < 0) {
3377                         *exit_status = EXIT_SECCOMP;
3378                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3379                 }
3380 #endif
3381         }
3382
3383         if (!strv_isempty(context->unset_environment)) {
3384                 char **ee = NULL;
3385
3386                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3387                 if (!ee) {
3388                         *exit_status = EXIT_MEMORY;
3389                         return log_oom();
3390                 }
3391
3392                 strv_free(accum_env);
3393                 accum_env = ee;
3394         }
3395
3396         final_argv = replace_env_argv(argv, accum_env);
3397         if (!final_argv) {
3398                 *exit_status = EXIT_MEMORY;
3399                 return log_oom();
3400         }
3401
3402         if (DEBUG_LOGGING) {
3403                 _cleanup_free_ char *line;
3404
3405                 line = exec_command_line(final_argv);
3406                 if (line) {
3407                         log_struct(LOG_DEBUG,
3408                                    "EXECUTABLE=%s", command->path,
3409                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3410                                    LOG_UNIT_ID(unit),
3411                                    LOG_UNIT_INVOCATION_ID(unit),
3412                                    NULL);
3413                 }
3414         }
3415
3416         execve(command->path, final_argv, accum_env);
3417
3418         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3419
3420                 log_struct_errno(LOG_INFO, errno,
3421                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3422                                  LOG_UNIT_ID(unit),
3423                                  LOG_UNIT_INVOCATION_ID(unit),
3424                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3425                                                   command->path),
3426                                  "EXECUTABLE=%s", command->path,
3427                                  NULL);
3428
3429                 return 0;
3430         }
3431
3432         *exit_status = EXIT_EXEC;
3433         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3434 }
3435
3436 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3437 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3438
3439 int exec_spawn(Unit *unit,
3440                ExecCommand *command,
3441                const ExecContext *context,
3442                const ExecParameters *params,
3443                ExecRuntime *runtime,
3444                DynamicCreds *dcreds,
3445                pid_t *ret) {
3446
3447         _cleanup_strv_free_ char **files_env = NULL;
3448         int *fds = NULL;
3449         unsigned n_storage_fds = 0, n_socket_fds = 0;
3450         _cleanup_free_ char *line = NULL;
3451         int socket_fd, r;
3452         int named_iofds[3] = { -1, -1, -1 };
3453         char **argv;
3454         pid_t pid;
3455
3456         assert(unit);
3457         assert(command);
3458         assert(context);
3459         assert(ret);
3460         assert(params);
3461         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3462
3463         if (context->std_input == EXEC_INPUT_SOCKET ||
3464             context->std_output == EXEC_OUTPUT_SOCKET ||
3465             context->std_error == EXEC_OUTPUT_SOCKET) {
3466
3467                 if (params->n_socket_fds > 1) {
3468                         log_unit_error(unit, "Got more than one socket.");
3469                         return -EINVAL;
3470                 }
3471
3472                 if (params->n_socket_fds == 0) {
3473                         log_unit_error(unit, "Got no socket.");
3474                         return -EINVAL;
3475                 }
3476
3477                 socket_fd = params->fds[0];
3478         } else {
3479                 socket_fd = -1;
3480                 fds = params->fds;
3481                 n_storage_fds = params->n_storage_fds;
3482                 n_socket_fds = params->n_socket_fds;
3483         }
3484
3485         r = exec_context_named_iofds(context, params, named_iofds);
3486         if (r < 0)
3487                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3488
3489         r = exec_context_load_environment(unit, context, &files_env);
3490         if (r < 0)
3491                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3492
3493         argv = params->argv ?: command->argv;
3494         line = exec_command_line(argv);
3495         if (!line)
3496                 return log_oom();
3497
3498         log_struct(LOG_DEBUG,
3499                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3500                    "EXECUTABLE=%s", command->path,
3501                    LOG_UNIT_ID(unit),
3502                    LOG_UNIT_INVOCATION_ID(unit),
3503                    NULL);
3504
3505         pid = fork();
3506         if (pid < 0)
3507                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3508
3509         if (pid == 0) {
3510                 int exit_status = EXIT_SUCCESS;
3511
3512                 r = exec_child(unit,
3513                                command,
3514                                context,
3515                                params,
3516                                runtime,
3517                                dcreds,
3518                                argv,
3519                                socket_fd,
3520                                named_iofds,
3521                                fds,
3522                                n_storage_fds,
3523                                n_socket_fds,
3524                                files_env,
3525                                unit->manager->user_lookup_fds[1],
3526                                &exit_status);
3527
3528                 if (r < 0) {
3529                         log_struct_errno(LOG_ERR, r,
3530                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3531                                          LOG_UNIT_ID(unit),
3532                                          LOG_UNIT_INVOCATION_ID(unit),
3533                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3534                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3535                                                           command->path),
3536                                          "EXECUTABLE=%s", command->path,
3537                                          NULL);
3538                 }
3539
3540                 _exit(exit_status);
3541         }
3542
3543         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3544
3545         /* We add the new process to the cgroup both in the child (so
3546          * that we can be sure that no user code is ever executed
3547          * outside of the cgroup) and in the parent (so that we can be
3548          * sure that when we kill the cgroup the process will be
3549          * killed too). */
3550         if (params->cgroup_path)
3551                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3552
3553         exec_status_start(&command->exec_status, pid);
3554
3555         *ret = pid;
3556         return 0;
3557 }
3558
3559 void exec_context_init(ExecContext *c) {
3560         ExecDirectoryType i;
3561
3562         assert(c);
3563
3564         c->umask = 0022;
3565         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3566         c->cpu_sched_policy = SCHED_OTHER;
3567         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3568         c->syslog_level_prefix = true;
3569         c->ignore_sigpipe = true;
3570         c->timer_slack_nsec = NSEC_INFINITY;
3571         c->personality = PERSONALITY_INVALID;
3572         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3573                 c->directories[i].mode = 0755;
3574         c->capability_bounding_set = CAP_ALL;
3575         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3576         c->log_level_max = -1;
3577 }
3578
3579 void exec_context_done(ExecContext *c) {
3580         ExecDirectoryType i;
3581         size_t l;
3582
3583         assert(c);
3584
3585         c->environment = strv_free(c->environment);
3586         c->environment_files = strv_free(c->environment_files);
3587         c->pass_environment = strv_free(c->pass_environment);
3588         c->unset_environment = strv_free(c->unset_environment);
3589
3590         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3591                 c->rlimit[l] = mfree(c->rlimit[l]);
3592
3593         for (l = 0; l < 3; l++) {
3594                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3595                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3596         }
3597
3598         c->working_directory = mfree(c->working_directory);
3599         c->root_directory = mfree(c->root_directory);
3600         c->root_image = mfree(c->root_image);
3601         c->tty_path = mfree(c->tty_path);
3602         c->syslog_identifier = mfree(c->syslog_identifier);
3603         c->user = mfree(c->user);
3604         c->group = mfree(c->group);
3605
3606         c->supplementary_groups = strv_free(c->supplementary_groups);
3607
3608         c->pam_name = mfree(c->pam_name);
3609
3610         c->read_only_paths = strv_free(c->read_only_paths);
3611         c->read_write_paths = strv_free(c->read_write_paths);
3612         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3613
3614         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3615         c->bind_mounts = NULL;
3616         c->n_bind_mounts = 0;
3617         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3618         c->temporary_filesystems = NULL;
3619         c->n_temporary_filesystems = 0;
3620
3621         c->cpuset = cpu_set_mfree(c->cpuset);
3622
3623         c->utmp_id = mfree(c->utmp_id);
3624         c->selinux_context = mfree(c->selinux_context);
3625         c->apparmor_profile = mfree(c->apparmor_profile);
3626         c->smack_process_label = mfree(c->smack_process_label);
3627
3628         c->syscall_filter = hashmap_free(c->syscall_filter);
3629         c->syscall_archs = set_free(c->syscall_archs);
3630         c->address_families = set_free(c->address_families);
3631
3632         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3633                 c->directories[i].paths = strv_free(c->directories[i].paths);
3634
3635         c->log_level_max = -1;
3636
3637         exec_context_free_log_extra_fields(c);
3638
3639         c->stdin_data = mfree(c->stdin_data);
3640         c->stdin_data_size = 0;
3641 }
3642
3643 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3644         char **i;
3645
3646         assert(c);
3647
3648         if (!runtime_prefix)
3649                 return 0;
3650
3651         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3652                 _cleanup_free_ char *p;
3653
3654                 p = strjoin(runtime_prefix, "/", *i);
3655                 if (!p)
3656                         return -ENOMEM;
3657
3658                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3659                  * next. */
3660                 (void) rm_rf(p, REMOVE_ROOT);
3661         }
3662
3663         return 0;
3664 }
3665
3666 static void exec_command_done(ExecCommand *c) {
3667         assert(c);
3668
3669         c->path = mfree(c->path);
3670
3671         c->argv = strv_free(c->argv);
3672 }
3673
3674 void exec_command_done_array(ExecCommand *c, unsigned n) {
3675         unsigned i;
3676
3677         for (i = 0; i < n; i++)
3678                 exec_command_done(c+i);
3679 }
3680
3681 ExecCommand* exec_command_free_list(ExecCommand *c) {
3682         ExecCommand *i;
3683
3684         while ((i = c)) {
3685                 LIST_REMOVE(command, c, i);
3686                 exec_command_done(i);
3687                 free(i);
3688         }
3689
3690         return NULL;
3691 }
3692
3693 void exec_command_free_array(ExecCommand **c, unsigned n) {
3694         unsigned i;
3695
3696         for (i = 0; i < n; i++)
3697                 c[i] = exec_command_free_list(c[i]);
3698 }
3699
3700 typedef struct InvalidEnvInfo {
3701         const Unit *unit;
3702         const char *path;
3703 } InvalidEnvInfo;
3704
3705 static void invalid_env(const char *p, void *userdata) {
3706         InvalidEnvInfo *info = userdata;
3707
3708         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3709 }
3710
3711 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3712         assert(c);
3713
3714         switch (fd_index) {
3715
3716         case STDIN_FILENO:
3717                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3718                         return NULL;
3719
3720                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3721
3722         case STDOUT_FILENO:
3723                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3724                         return NULL;
3725
3726                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3727
3728         case STDERR_FILENO:
3729                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3730                         return NULL;
3731
3732                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3733
3734         default:
3735                 return NULL;
3736         }
3737 }
3738
3739 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3740         unsigned i, targets;
3741         const char* stdio_fdname[3];
3742         unsigned n_fds;
3743
3744         assert(c);
3745         assert(p);
3746
3747         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3748                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3749                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3750
3751         for (i = 0; i < 3; i++)
3752                 stdio_fdname[i] = exec_context_fdname(c, i);
3753
3754         n_fds = p->n_storage_fds + p->n_socket_fds;
3755
3756         for (i = 0; i < n_fds  && targets > 0; i++)
3757                 if (named_iofds[STDIN_FILENO] < 0 &&
3758                     c->std_input == EXEC_INPUT_NAMED_FD &&
3759                     stdio_fdname[STDIN_FILENO] &&
3760                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3761
3762                         named_iofds[STDIN_FILENO] = p->fds[i];
3763                         targets--;
3764
3765                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3766                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3767                            stdio_fdname[STDOUT_FILENO] &&
3768                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3769
3770                         named_iofds[STDOUT_FILENO] = p->fds[i];
3771                         targets--;
3772
3773                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3774                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3775                            stdio_fdname[STDERR_FILENO] &&
3776                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3777
3778                         named_iofds[STDERR_FILENO] = p->fds[i];
3779                         targets--;
3780                 }
3781
3782         return targets == 0 ? 0 : -ENOENT;
3783 }
3784
3785 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3786         char **i, **r = NULL;
3787
3788         assert(c);
3789         assert(l);
3790
3791         STRV_FOREACH(i, c->environment_files) {
3792                 char *fn;
3793                 int k;
3794                 unsigned n;
3795                 bool ignore = false;
3796                 char **p;
3797                 _cleanup_globfree_ glob_t pglob = {};
3798
3799                 fn = *i;
3800
3801                 if (fn[0] == '-') {
3802                         ignore = true;
3803                         fn++;
3804                 }
3805
3806                 if (!path_is_absolute(fn)) {
3807                         if (ignore)
3808                                 continue;
3809
3810                         strv_free(r);
3811                         return -EINVAL;
3812                 }
3813
3814                 /* Filename supports globbing, take all matching files */
3815                 k = safe_glob(fn, 0, &pglob);
3816                 if (k < 0) {
3817                         if (ignore)
3818                                 continue;
3819
3820                         strv_free(r);
3821                         return k;
3822                 }
3823
3824                 /* When we don't match anything, -ENOENT should be returned */
3825                 assert(pglob.gl_pathc > 0);
3826
3827                 for (n = 0; n < pglob.gl_pathc; n++) {
3828                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3829                         if (k < 0) {
3830                                 if (ignore)
3831                                         continue;
3832
3833                                 strv_free(r);
3834                                 return k;
3835                         }
3836                         /* Log invalid environment variables with filename */
3837                         if (p) {
3838                                 InvalidEnvInfo info = {
3839                                         .unit = unit,
3840                                         .path = pglob.gl_pathv[n]
3841                                 };
3842
3843                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3844                         }
3845
3846                         if (!r)
3847                                 r = p;
3848                         else {
3849                                 char **m;
3850
3851                                 m = strv_env_merge(2, r, p);
3852                                 strv_free(r);
3853                                 strv_free(p);
3854                                 if (!m)
3855                                         return -ENOMEM;
3856
3857                                 r = m;
3858                         }
3859                 }
3860         }
3861
3862         *l = r;
3863
3864         return 0;
3865 }
3866
3867 static bool tty_may_match_dev_console(const char *tty) {
3868         _cleanup_free_ char *resolved = NULL;
3869
3870         if (!tty)
3871                 return true;
3872
3873         tty = skip_dev_prefix(tty);
3874
3875         /* trivial identity? */
3876         if (streq(tty, "console"))
3877                 return true;
3878
3879         if (resolve_dev_console(&resolved) < 0)
3880                 return true; /* if we could not resolve, assume it may */
3881
3882         /* "tty0" means the active VC, so it may be the same sometimes */
3883         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3884 }
3885
3886 bool exec_context_may_touch_console(const ExecContext *ec) {
3887
3888         return (ec->tty_reset ||
3889                 ec->tty_vhangup ||
3890                 ec->tty_vt_disallocate ||
3891                 is_terminal_input(ec->std_input) ||
3892                 is_terminal_output(ec->std_output) ||
3893                 is_terminal_output(ec->std_error)) &&
3894                tty_may_match_dev_console(exec_context_tty_path(ec));
3895 }
3896
3897 static void strv_fprintf(FILE *f, char **l) {
3898         char **g;
3899
3900         assert(f);
3901
3902         STRV_FOREACH(g, l)
3903                 fprintf(f, " %s", *g);
3904 }
3905
3906 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3907         ExecDirectoryType dt;
3908         char **e, **d;
3909         unsigned i;
3910         int r;
3911
3912         assert(c);
3913         assert(f);
3914
3915         prefix = strempty(prefix);
3916
3917         fprintf(f,
3918                 "%sUMask: %04o\n"
3919                 "%sWorkingDirectory: %s\n"
3920                 "%sRootDirectory: %s\n"
3921                 "%sNonBlocking: %s\n"
3922                 "%sPrivateTmp: %s\n"
3923                 "%sPrivateDevices: %s\n"
3924                 "%sProtectKernelTunables: %s\n"
3925                 "%sProtectKernelModules: %s\n"
3926                 "%sProtectControlGroups: %s\n"
3927                 "%sPrivateNetwork: %s\n"
3928                 "%sPrivateUsers: %s\n"
3929                 "%sProtectHome: %s\n"
3930                 "%sProtectSystem: %s\n"
3931                 "%sMountAPIVFS: %s\n"
3932                 "%sIgnoreSIGPIPE: %s\n"
3933                 "%sMemoryDenyWriteExecute: %s\n"
3934                 "%sRestrictRealtime: %s\n"
3935                 "%sKeyringMode: %s\n",
3936                 prefix, c->umask,
3937                 prefix, c->working_directory ? c->working_directory : "/",
3938                 prefix, c->root_directory ? c->root_directory : "/",
3939                 prefix, yes_no(c->non_blocking),
3940                 prefix, yes_no(c->private_tmp),
3941                 prefix, yes_no(c->private_devices),
3942                 prefix, yes_no(c->protect_kernel_tunables),
3943                 prefix, yes_no(c->protect_kernel_modules),
3944                 prefix, yes_no(c->protect_control_groups),
3945                 prefix, yes_no(c->private_network),
3946                 prefix, yes_no(c->private_users),
3947                 prefix, protect_home_to_string(c->protect_home),
3948                 prefix, protect_system_to_string(c->protect_system),
3949                 prefix, yes_no(c->mount_apivfs),
3950                 prefix, yes_no(c->ignore_sigpipe),
3951                 prefix, yes_no(c->memory_deny_write_execute),
3952                 prefix, yes_no(c->restrict_realtime),
3953                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3954
3955         if (c->root_image)
3956                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3957
3958         STRV_FOREACH(e, c->environment)
3959                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3960
3961         STRV_FOREACH(e, c->environment_files)
3962                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3963
3964         STRV_FOREACH(e, c->pass_environment)
3965                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3966
3967         STRV_FOREACH(e, c->unset_environment)
3968                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3969
3970         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3971
3972         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3973                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3974
3975                 STRV_FOREACH(d, c->directories[dt].paths)
3976                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3977         }
3978
3979         if (c->nice_set)
3980                 fprintf(f,
3981                         "%sNice: %i\n",
3982                         prefix, c->nice);
3983
3984         if (c->oom_score_adjust_set)
3985                 fprintf(f,
3986                         "%sOOMScoreAdjust: %i\n",
3987                         prefix, c->oom_score_adjust);
3988
3989         for (i = 0; i < RLIM_NLIMITS; i++)
3990                 if (c->rlimit[i]) {
3991                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3992                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3993                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3994                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3995                 }
3996
3997         if (c->ioprio_set) {
3998                 _cleanup_free_ char *class_str = NULL;
3999
4000                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4001                 if (r >= 0)
4002                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4003
4004                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4005         }
4006
4007         if (c->cpu_sched_set) {
4008                 _cleanup_free_ char *policy_str = NULL;
4009
4010                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4011                 if (r >= 0)
4012                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4013
4014                 fprintf(f,
4015                         "%sCPUSchedulingPriority: %i\n"
4016                         "%sCPUSchedulingResetOnFork: %s\n",
4017                         prefix, c->cpu_sched_priority,
4018                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4019         }
4020
4021         if (c->cpuset) {
4022                 fprintf(f, "%sCPUAffinity:", prefix);
4023                 for (i = 0; i < c->cpuset_ncpus; i++)
4024                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4025                                 fprintf(f, " %u", i);
4026                 fputs("\n", f);
4027         }
4028
4029         if (c->timer_slack_nsec != NSEC_INFINITY)
4030                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4031
4032         fprintf(f,
4033                 "%sStandardInput: %s\n"
4034                 "%sStandardOutput: %s\n"
4035                 "%sStandardError: %s\n",
4036                 prefix, exec_input_to_string(c->std_input),
4037                 prefix, exec_output_to_string(c->std_output),
4038                 prefix, exec_output_to_string(c->std_error));
4039
4040         if (c->std_input == EXEC_INPUT_NAMED_FD)
4041                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4042         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4043                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4044         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4045                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4046
4047         if (c->std_input == EXEC_INPUT_FILE)
4048                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4049         if (c->std_output == EXEC_OUTPUT_FILE)
4050                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4051         if (c->std_error == EXEC_OUTPUT_FILE)
4052                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4053
4054         if (c->tty_path)
4055                 fprintf(f,
4056                         "%sTTYPath: %s\n"
4057                         "%sTTYReset: %s\n"
4058                         "%sTTYVHangup: %s\n"
4059                         "%sTTYVTDisallocate: %s\n",
4060                         prefix, c->tty_path,
4061                         prefix, yes_no(c->tty_reset),
4062                         prefix, yes_no(c->tty_vhangup),
4063                         prefix, yes_no(c->tty_vt_disallocate));
4064
4065         if (IN_SET(c->std_output,
4066                    EXEC_OUTPUT_SYSLOG,
4067                    EXEC_OUTPUT_KMSG,
4068                    EXEC_OUTPUT_JOURNAL,
4069                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4070                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4071                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4072             IN_SET(c->std_error,
4073                    EXEC_OUTPUT_SYSLOG,
4074                    EXEC_OUTPUT_KMSG,
4075                    EXEC_OUTPUT_JOURNAL,
4076                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4077                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4078                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4079
4080                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4081
4082                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4083                 if (r >= 0)
4084                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4085
4086                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4087                 if (r >= 0)
4088                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4089         }
4090
4091         if (c->log_level_max >= 0) {
4092                 _cleanup_free_ char *t = NULL;
4093
4094                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4095
4096                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4097         }
4098
4099         if (c->n_log_extra_fields > 0) {
4100                 size_t j;
4101
4102                 for (j = 0; j < c->n_log_extra_fields; j++) {
4103                         fprintf(f, "%sLogExtraFields: ", prefix);
4104                         fwrite(c->log_extra_fields[j].iov_base,
4105                                1, c->log_extra_fields[j].iov_len,
4106                                f);
4107                         fputc('\n', f);
4108                 }
4109         }
4110
4111         if (c->secure_bits) {
4112                 _cleanup_free_ char *str = NULL;
4113
4114                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4115                 if (r >= 0)
4116                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4117         }
4118
4119         if (c->capability_bounding_set != CAP_ALL) {
4120                 _cleanup_free_ char *str = NULL;
4121
4122                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4123                 if (r >= 0)
4124                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4125         }
4126
4127         if (c->capability_ambient_set != 0) {
4128                 _cleanup_free_ char *str = NULL;
4129
4130                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4131                 if (r >= 0)
4132                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4133         }
4134
4135         if (c->user)
4136                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4137         if (c->group)
4138                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4139
4140         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4141
4142         if (!strv_isempty(c->supplementary_groups)) {
4143                 fprintf(f, "%sSupplementaryGroups:", prefix);
4144                 strv_fprintf(f, c->supplementary_groups);
4145                 fputs("\n", f);
4146         }
4147
4148         if (c->pam_name)
4149                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4150
4151         if (!strv_isempty(c->read_write_paths)) {
4152                 fprintf(f, "%sReadWritePaths:", prefix);
4153                 strv_fprintf(f, c->read_write_paths);
4154                 fputs("\n", f);
4155         }
4156
4157         if (!strv_isempty(c->read_only_paths)) {
4158                 fprintf(f, "%sReadOnlyPaths:", prefix);
4159                 strv_fprintf(f, c->read_only_paths);
4160                 fputs("\n", f);
4161         }
4162
4163         if (!strv_isempty(c->inaccessible_paths)) {
4164                 fprintf(f, "%sInaccessiblePaths:", prefix);
4165                 strv_fprintf(f, c->inaccessible_paths);
4166                 fputs("\n", f);
4167         }
4168
4169         if (c->n_bind_mounts > 0)
4170                 for (i = 0; i < c->n_bind_mounts; i++)
4171                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4172                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4173                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4174                                 c->bind_mounts[i].source,
4175                                 c->bind_mounts[i].destination,
4176                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4177
4178         if (c->n_temporary_filesystems > 0)
4179                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4180                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4181
4182                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4183                                 t->path,
4184                                 isempty(t->options) ? "" : ":",
4185                                 strempty(t->options));
4186                 }
4187
4188         if (c->utmp_id)
4189                 fprintf(f,
4190                         "%sUtmpIdentifier: %s\n",
4191                         prefix, c->utmp_id);
4192
4193         if (c->selinux_context)
4194                 fprintf(f,
4195                         "%sSELinuxContext: %s%s\n",
4196                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4197
4198         if (c->apparmor_profile)
4199                 fprintf(f,
4200                         "%sAppArmorProfile: %s%s\n",
4201                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4202
4203         if (c->smack_process_label)
4204                 fprintf(f,
4205                         "%sSmackProcessLabel: %s%s\n",
4206                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4207
4208         if (c->personality != PERSONALITY_INVALID)
4209                 fprintf(f,
4210                         "%sPersonality: %s\n",
4211                         prefix, strna(personality_to_string(c->personality)));
4212
4213         fprintf(f,
4214                 "%sLockPersonality: %s\n",
4215                 prefix, yes_no(c->lock_personality));
4216
4217         if (c->syscall_filter) {
4218 #if HAVE_SECCOMP
4219                 Iterator j;
4220                 void *id, *val;
4221                 bool first = true;
4222 #endif
4223
4224                 fprintf(f,
4225                         "%sSystemCallFilter: ",
4226                         prefix);
4227
4228                 if (!c->syscall_whitelist)
4229                         fputc('~', f);
4230
4231 #if HAVE_SECCOMP
4232                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4233                         _cleanup_free_ char *name = NULL;
4234                         const char *errno_name = NULL;
4235                         int num = PTR_TO_INT(val);
4236
4237                         if (first)
4238                                 first = false;
4239                         else
4240                                 fputc(' ', f);
4241
4242                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4243                         fputs(strna(name), f);
4244
4245                         if (num >= 0) {
4246                                 errno_name = errno_to_name(num);
4247                                 if (errno_name)
4248                                         fprintf(f, ":%s", errno_name);
4249                                 else
4250                                         fprintf(f, ":%d", num);
4251                         }
4252                 }
4253 #endif
4254
4255                 fputc('\n', f);
4256         }
4257
4258         if (c->syscall_archs) {
4259 #if HAVE_SECCOMP
4260                 Iterator j;
4261                 void *id;
4262 #endif
4263
4264                 fprintf(f,
4265                         "%sSystemCallArchitectures:",
4266                         prefix);
4267
4268 #if HAVE_SECCOMP
4269                 SET_FOREACH(id, c->syscall_archs, j)
4270                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4271 #endif
4272                 fputc('\n', f);
4273         }
4274
4275         if (exec_context_restrict_namespaces_set(c)) {
4276                 _cleanup_free_ char *s = NULL;
4277
4278                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4279                 if (r >= 0)
4280                         fprintf(f, "%sRestrictNamespaces: %s\n",
4281                                 prefix, s);
4282         }
4283
4284         if (c->syscall_errno > 0) {
4285                 const char *errno_name;
4286
4287                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4288
4289                 errno_name = errno_to_name(c->syscall_errno);
4290                 if (errno_name)
4291                         fprintf(f, "%s\n", errno_name);
4292                 else
4293                         fprintf(f, "%d\n", c->syscall_errno);
4294         }
4295
4296         if (c->apparmor_profile)
4297                 fprintf(f,
4298                         "%sAppArmorProfile: %s%s\n",
4299                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4300 }
4301
4302 bool exec_context_maintains_privileges(const ExecContext *c) {
4303         assert(c);
4304
4305         /* Returns true if the process forked off would run under
4306          * an unchanged UID or as root. */
4307
4308         if (!c->user)
4309                 return true;
4310
4311         if (streq(c->user, "root") || streq(c->user, "0"))
4312                 return true;
4313
4314         return false;
4315 }
4316
4317 int exec_context_get_effective_ioprio(const ExecContext *c) {
4318         int p;
4319
4320         assert(c);
4321
4322         if (c->ioprio_set)
4323                 return c->ioprio;
4324
4325         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4326         if (p < 0)
4327                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4328
4329         return p;
4330 }
4331
4332 void exec_context_free_log_extra_fields(ExecContext *c) {
4333         size_t l;
4334
4335         assert(c);
4336
4337         for (l = 0; l < c->n_log_extra_fields; l++)
4338                 free(c->log_extra_fields[l].iov_base);
4339         c->log_extra_fields = mfree(c->log_extra_fields);
4340         c->n_log_extra_fields = 0;
4341 }
4342
4343 void exec_status_start(ExecStatus *s, pid_t pid) {
4344         assert(s);
4345
4346         zero(*s);
4347         s->pid = pid;
4348         dual_timestamp_get(&s->start_timestamp);
4349 }
4350
4351 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4352         assert(s);
4353
4354         if (s->pid && s->pid != pid)
4355                 zero(*s);
4356
4357         s->pid = pid;
4358         dual_timestamp_get(&s->exit_timestamp);
4359
4360         s->code = code;
4361         s->status = status;
4362
4363         if (context) {
4364                 if (context->utmp_id)
4365                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4366
4367                 exec_context_tty_reset(context, NULL);
4368         }
4369 }
4370
4371 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4372         char buf[FORMAT_TIMESTAMP_MAX];
4373
4374         assert(s);
4375         assert(f);
4376
4377         if (s->pid <= 0)
4378                 return;
4379
4380         prefix = strempty(prefix);
4381
4382         fprintf(f,
4383                 "%sPID: "PID_FMT"\n",
4384                 prefix, s->pid);
4385
4386         if (dual_timestamp_is_set(&s->start_timestamp))
4387                 fprintf(f,
4388                         "%sStart Timestamp: %s\n",
4389                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4390
4391         if (dual_timestamp_is_set(&s->exit_timestamp))
4392                 fprintf(f,
4393                         "%sExit Timestamp: %s\n"
4394                         "%sExit Code: %s\n"
4395                         "%sExit Status: %i\n",
4396                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4397                         prefix, sigchld_code_to_string(s->code),
4398                         prefix, s->status);
4399 }
4400
4401 static char *exec_command_line(char **argv) {
4402         size_t k;
4403         char *n, *p, **a;
4404         bool first = true;
4405
4406         assert(argv);
4407
4408         k = 1;
4409         STRV_FOREACH(a, argv)
4410                 k += strlen(*a)+3;
4411
4412         n = new(char, k);
4413         if (!n)
4414                 return NULL;
4415
4416         p = n;
4417         STRV_FOREACH(a, argv) {
4418
4419                 if (!first)
4420                         *(p++) = ' ';
4421                 else
4422                         first = false;
4423
4424                 if (strpbrk(*a, WHITESPACE)) {
4425                         *(p++) = '\'';
4426                         p = stpcpy(p, *a);
4427                         *(p++) = '\'';
4428                 } else
4429                         p = stpcpy(p, *a);
4430
4431         }
4432
4433         *p = 0;
4434
4435         /* FIXME: this doesn't really handle arguments that have
4436          * spaces and ticks in them */
4437
4438         return n;
4439 }
4440
4441 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4442         _cleanup_free_ char *cmd = NULL;
4443         const char *prefix2;
4444
4445         assert(c);
4446         assert(f);
4447
4448         prefix = strempty(prefix);
4449         prefix2 = strjoina(prefix, "\t");
4450
4451         cmd = exec_command_line(c->argv);
4452         fprintf(f,
4453                 "%sCommand Line: %s\n",
4454                 prefix, cmd ? cmd : strerror(ENOMEM));
4455
4456         exec_status_dump(&c->exec_status, f, prefix2);
4457 }
4458
4459 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4460         assert(f);
4461
4462         prefix = strempty(prefix);
4463
4464         LIST_FOREACH(command, c, c)
4465                 exec_command_dump(c, f, prefix);
4466 }
4467
4468 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4469         ExecCommand *end;
4470
4471         assert(l);
4472         assert(e);
4473
4474         if (*l) {
4475                 /* It's kind of important, that we keep the order here */
4476                 LIST_FIND_TAIL(command, *l, end);
4477                 LIST_INSERT_AFTER(command, *l, end, e);
4478         } else
4479               *l = e;
4480 }
4481
4482 int exec_command_set(ExecCommand *c, const char *path, ...) {
4483         va_list ap;
4484         char **l, *p;
4485
4486         assert(c);
4487         assert(path);
4488
4489         va_start(ap, path);
4490         l = strv_new_ap(path, ap);
4491         va_end(ap);
4492
4493         if (!l)
4494                 return -ENOMEM;
4495
4496         p = strdup(path);
4497         if (!p) {
4498                 strv_free(l);
4499                 return -ENOMEM;
4500         }
4501
4502         free(c->path);
4503         c->path = p;
4504
4505         strv_free(c->argv);
4506         c->argv = l;
4507
4508         return 0;
4509 }
4510
4511 int exec_command_append(ExecCommand *c, const char *path, ...) {
4512         _cleanup_strv_free_ char **l = NULL;
4513         va_list ap;
4514         int r;
4515
4516         assert(c);
4517         assert(path);
4518
4519         va_start(ap, path);
4520         l = strv_new_ap(path, ap);
4521         va_end(ap);
4522
4523         if (!l)
4524                 return -ENOMEM;
4525
4526         r = strv_extend_strv(&c->argv, l, false);
4527         if (r < 0)
4528                 return r;
4529
4530         return 0;
4531 }
4532
4533 static void *remove_tmpdir_thread(void *p) {
4534         _cleanup_free_ char *path = p;
4535
4536         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4537         return NULL;
4538 }
4539
4540 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4541         int r;
4542
4543         if (!rt)
4544                 return NULL;
4545
4546         if (rt->manager)
4547                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4548
4549         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4550         if (destroy && rt->tmp_dir) {
4551                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4552
4553                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4554                 if (r < 0) {
4555                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4556                         free(rt->tmp_dir);
4557                 }
4558
4559                 rt->tmp_dir = NULL;
4560         }
4561
4562         if (destroy && rt->var_tmp_dir) {
4563                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4564
4565                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4566                 if (r < 0) {
4567                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4568                         free(rt->var_tmp_dir);
4569                 }
4570
4571                 rt->var_tmp_dir = NULL;
4572         }
4573
4574         rt->id = mfree(rt->id);
4575         rt->tmp_dir = mfree(rt->tmp_dir);
4576         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4577         safe_close_pair(rt->netns_storage_socket);
4578         return mfree(rt);
4579 }
4580
4581 static void exec_runtime_freep(ExecRuntime **rt) {
4582         if (*rt)
4583                 (void) exec_runtime_free(*rt, false);
4584 }
4585
4586 static int exec_runtime_allocate(ExecRuntime **rt) {
4587         assert(rt);
4588
4589         *rt = new0(ExecRuntime, 1);
4590         if (!*rt)
4591                 return -ENOMEM;
4592
4593         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4594         return 0;
4595 }
4596
4597 static int exec_runtime_add(
4598                 Manager *m,
4599                 const char *id,
4600                 const char *tmp_dir,
4601                 const char *var_tmp_dir,
4602                 const int netns_storage_socket[2],
4603                 ExecRuntime **ret) {
4604
4605         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4606         int r;
4607
4608         assert(m);
4609         assert(id);
4610
4611         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4612         if (r < 0)
4613                 return r;
4614
4615         r = exec_runtime_allocate(&rt);
4616         if (r < 0)
4617                 return r;
4618
4619         rt->id = strdup(id);
4620         if (!rt->id)
4621                 return -ENOMEM;
4622
4623         if (tmp_dir) {
4624                 rt->tmp_dir = strdup(tmp_dir);
4625                 if (!rt->tmp_dir)
4626                         return -ENOMEM;
4627
4628                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4629                 assert(var_tmp_dir);
4630                 rt->var_tmp_dir = strdup(var_tmp_dir);
4631                 if (!rt->var_tmp_dir)
4632                         return -ENOMEM;
4633         }
4634
4635         if (netns_storage_socket) {
4636                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4637                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4638         }
4639
4640         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4641         if (r < 0)
4642                 return r;
4643
4644         rt->manager = m;
4645
4646         if (ret)
4647                 *ret = rt;
4648
4649         /* do not remove created ExecRuntime object when the operation succeeds. */
4650         rt = NULL;
4651         return 0;
4652 }
4653
4654 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4655         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4656         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4657         int r;
4658
4659         assert(m);
4660         assert(c);
4661         assert(id);
4662
4663         /* It is not necessary to create ExecRuntime object. */
4664         if (!c->private_network && !c->private_tmp)
4665                 return 0;
4666
4667         if (c->private_tmp) {
4668                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4669                 if (r < 0)
4670                         return r;
4671         }
4672
4673         if (c->private_network) {
4674                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4675                         return -errno;
4676         }
4677
4678         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4679         if (r < 0)
4680                 return r;
4681
4682         /* Avoid cleanup */
4683         netns_storage_socket[0] = -1;
4684         netns_storage_socket[1] = -1;
4685         return 1;
4686 }
4687
4688 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4689         ExecRuntime *rt;
4690         int r;
4691
4692         assert(m);
4693         assert(id);
4694         assert(ret);
4695
4696         rt = hashmap_get(m->exec_runtime_by_id, id);
4697         if (rt)
4698                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4699                 goto ref;
4700
4701         if (!create)
4702                 return 0;
4703
4704         /* If not found, then create a new object. */
4705         r = exec_runtime_make(m, c, id, &rt);
4706         if (r <= 0)
4707                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4708                 return r;
4709
4710 ref:
4711         /* increment reference counter. */
4712         rt->n_ref++;
4713         *ret = rt;
4714         return 1;
4715 }
4716
4717 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4718         if (!rt)
4719                 return NULL;
4720
4721         assert(rt->n_ref > 0);
4722
4723         rt->n_ref--;
4724         if (rt->n_ref > 0)
4725                 return NULL;
4726
4727         return exec_runtime_free(rt, destroy);
4728 }
4729
4730 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4731         ExecRuntime *rt;
4732         Iterator i;
4733
4734         assert(m);
4735         assert(f);
4736         assert(fds);
4737
4738         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4739                 fprintf(f, "exec-runtime=%s", rt->id);
4740
4741                 if (rt->tmp_dir)
4742                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4743
4744                 if (rt->var_tmp_dir)
4745                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4746
4747                 if (rt->netns_storage_socket[0] >= 0) {
4748                         int copy;
4749
4750                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4751                         if (copy < 0)
4752                                 return copy;
4753
4754                         fprintf(f, " netns-socket-0=%i", copy);
4755                 }
4756
4757                 if (rt->netns_storage_socket[1] >= 0) {
4758                         int copy;
4759
4760                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4761                         if (copy < 0)
4762                                 return copy;
4763
4764                         fprintf(f, " netns-socket-1=%i", copy);
4765                 }
4766
4767                 fputc('\n', f);
4768         }
4769
4770         return 0;
4771 }
4772
4773 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4774         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4775         ExecRuntime *rt;
4776         int r;
4777
4778         /* This is for the migration from old (v237 or earlier) deserialization text.
4779          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4780          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4781          * so or not from the serialized text, then we always creates a new object owned by this. */
4782
4783         assert(u);
4784         assert(key);
4785         assert(value);
4786
4787         /* Manager manages ExecRuntime objects by the unit id.
4788          * So, we omit the serialized text when the unit does not have id (yet?)... */
4789         if (isempty(u->id)) {
4790                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4791                 return 0;
4792         }
4793
4794         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4795         if (r < 0) {
4796                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4797                 return 0;
4798         }
4799
4800         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4801         if (!rt) {
4802                 r = exec_runtime_allocate(&rt_create);
4803                 if (r < 0)
4804                         return log_oom();
4805
4806                 rt_create->id = strdup(u->id);
4807                 if (!rt_create->id)
4808                         return log_oom();
4809
4810                 rt = rt_create;
4811         }
4812
4813         if (streq(key, "tmp-dir")) {
4814                 char *copy;
4815
4816                 copy = strdup(value);
4817                 if (!copy)
4818                         return log_oom();
4819
4820                 free_and_replace(rt->tmp_dir, copy);
4821
4822         } else if (streq(key, "var-tmp-dir")) {
4823                 char *copy;
4824
4825                 copy = strdup(value);
4826                 if (!copy)
4827                         return log_oom();
4828
4829                 free_and_replace(rt->var_tmp_dir, copy);
4830
4831         } else if (streq(key, "netns-socket-0")) {
4832                 int fd;
4833
4834                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4835                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4836                         return 0;
4837                 }
4838
4839                 safe_close(rt->netns_storage_socket[0]);
4840                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4841
4842         } else if (streq(key, "netns-socket-1")) {
4843                 int fd;
4844
4845                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4846                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4847                         return 0;
4848                 }
4849
4850                 safe_close(rt->netns_storage_socket[1]);
4851                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4852         } else
4853                 return 0;
4854
4855
4856         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4857         if (rt_create) {
4858                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4859                 if (r < 0) {
4860                         log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4861                         return 0;
4862                 }
4863
4864                 rt_create->manager = u->manager;
4865
4866                 /* Avoid cleanup */
4867                 rt_create = NULL;
4868         }
4869
4870         return 1;
4871 }
4872
4873 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4874         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4875         int r, fd0 = -1, fd1 = -1;
4876         const char *p, *v = value;
4877         size_t n;
4878
4879         assert(m);
4880         assert(value);
4881         assert(fds);
4882
4883         n = strcspn(v, " ");
4884         id = strndupa(v, n);
4885         if (v[n] != ' ')
4886                 goto finalize;
4887         p = v + n + 1;
4888
4889         v = startswith(p, "tmp-dir=");
4890         if (v) {
4891                 n = strcspn(v, " ");
4892                 tmp_dir = strndupa(v, n);
4893                 if (v[n] != ' ')
4894                         goto finalize;
4895                 p = v + n + 1;
4896         }
4897
4898         v = startswith(p, "var-tmp-dir=");
4899         if (v) {
4900                 n = strcspn(v, " ");
4901                 var_tmp_dir = strndupa(v, n);
4902                 if (v[n] != ' ')
4903                         goto finalize;
4904                 p = v + n + 1;
4905         }
4906
4907         v = startswith(p, "netns-socket-0=");
4908         if (v) {
4909                 char *buf;
4910
4911                 n = strcspn(v, " ");
4912                 buf = strndupa(v, n);
4913                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4914                         log_debug("Unable to process exec-runtime netns fd specification.");
4915                         return;
4916                 }
4917                 fd0 = fdset_remove(fds, fd0);
4918                 if (v[n] != ' ')
4919                         goto finalize;
4920                 p = v + n + 1;
4921         }
4922
4923         v = startswith(p, "netns-socket-1=");
4924         if (v) {
4925                 char *buf;
4926
4927                 n = strcspn(v, " ");
4928                 buf = strndupa(v, n);
4929                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4930                         log_debug("Unable to process exec-runtime netns fd specification.");
4931                         return;
4932                 }
4933                 fd1 = fdset_remove(fds, fd1);
4934         }
4935
4936 finalize:
4937
4938         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4939         if (r < 0) {
4940                 log_debug_errno(r, "Failed to add exec-runtime: %m");
4941                 return;
4942         }
4943 }
4944
4945 void exec_runtime_vacuum(Manager *m) {
4946         ExecRuntime *rt;
4947         Iterator i;
4948
4949         assert(m);
4950
4951         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4952
4953         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4954                 if (rt->n_ref > 0)
4955                         continue;
4956
4957                 (void) exec_runtime_free(rt, false);
4958         }
4959 }
4960
4961 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4962         [EXEC_INPUT_NULL] = "null",
4963         [EXEC_INPUT_TTY] = "tty",
4964         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4965         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4966         [EXEC_INPUT_SOCKET] = "socket",
4967         [EXEC_INPUT_NAMED_FD] = "fd",
4968         [EXEC_INPUT_DATA] = "data",
4969         [EXEC_INPUT_FILE] = "file",
4970 };
4971
4972 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4973
4974 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4975         [EXEC_OUTPUT_INHERIT] = "inherit",
4976         [EXEC_OUTPUT_NULL] = "null",
4977         [EXEC_OUTPUT_TTY] = "tty",
4978         [EXEC_OUTPUT_SYSLOG] = "syslog",
4979         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4980         [EXEC_OUTPUT_KMSG] = "kmsg",
4981         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4982         [EXEC_OUTPUT_JOURNAL] = "journal",
4983         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4984         [EXEC_OUTPUT_SOCKET] = "socket",
4985         [EXEC_OUTPUT_NAMED_FD] = "fd",
4986         [EXEC_OUTPUT_FILE] = "file",
4987 };
4988
4989 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4990
4991 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4992         [EXEC_UTMP_INIT] = "init",
4993         [EXEC_UTMP_LOGIN] = "login",
4994         [EXEC_UTMP_USER] = "user",
4995 };
4996
4997 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4998
4999 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5000         [EXEC_PRESERVE_NO] = "no",
5001         [EXEC_PRESERVE_YES] = "yes",
5002         [EXEC_PRESERVE_RESTART] = "restart",
5003 };
5004
5005 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5006
5007 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5008         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5009         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5010         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5011         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5012         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5013 };
5014
5015 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5016
5017 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5018         [EXEC_KEYRING_INHERIT] = "inherit",
5019         [EXEC_KEYRING_PRIVATE] = "private",
5020         [EXEC_KEYRING_SHARED] = "shared",
5021 };
5022
5023 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);