src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <glob.h>
  24 #include <grp.h>
  25 #include <poll.h>
  26 #include <signal.h>
  27 #include <string.h>
  28 #include <sys/capability.h>
  29 #include <sys/eventfd.h>
  30 #include <sys/mman.h>
  31 #include <sys/personality.h>
  32 #include <sys/prctl.h>
  33 #include <sys/shm.h>
  34 #include <sys/socket.h>
  35 #include <sys/stat.h>
  36 #include <sys/types.h>
  37 #include <sys/un.h>
  38 #include <unistd.h>
  39 #include <utmpx.h>
  40
  41 #if HAVE_PAM
  42 #include <security/pam_appl.h>
  43 #endif
  44
  45 #if HAVE_SELINUX
  46 #include <selinux/selinux.h>
  47 #endif
  48
  49 #if HAVE_SECCOMP
  50 #include <seccomp.h>
  51 #endif
  52
  53 #if HAVE_APPARMOR
  54 #include <sys/apparmor.h>
  55 #endif
  56
  57 #include "sd-messages.h"
  58
  59 #include "af-list.h"
  60 #include "alloc-util.h"
  61 #if HAVE_APPARMOR
  62 #include "apparmor-util.h"
  63 #endif
  64 #include "async.h"
  65 #include "barrier.h"
  66 #include "cap-list.h"
  67 #include "capability-util.h"
  68 #include "chown-recursive.h"
  69 #include "def.h"
  70 #include "env-util.h"
  71 #include "errno-list.h"
  72 #include "execute.h"
  73 #include "exit-status.h"
  74 #include "fd-util.h"
  75 #include "fileio.h"
  76 #include "format-util.h"
  77 #include "fs-util.h"
  78 #include "glob-util.h"
  79 #include "io-util.h"
  80 #include "ioprio.h"
  81 #include "label.h"
  82 #include "log.h"
  83 #include "macro.h"
  84 #include "missing.h"
  85 #include "mkdir.h"
  86 #include "namespace.h"
  87 #include "parse-util.h"
  88 #include "path-util.h"
  89 #include "process-util.h"
  90 #include "rlimit-util.h"
  91 #include "rm-rf.h"
  92 #if HAVE_SECCOMP
  93 #include "seccomp-util.h"
  94 #endif
  95 #include "securebits.h"
  96 #include "securebits-util.h"
  97 #include "selinux-util.h"
  98 #include "signal-util.h"
  99 #include "smack-util.h"
 100 #include "special.h"
 101 #include "string-table.h"
 102 #include "string-util.h"
 103 #include "strv.h"
 104 #include "syslog-util.h"
 105 #include "terminal-util.h"
 106 #include "unit.h"
 107 #include "user-util.h"
 108 #include "util.h"
 109 #include "utmp-wtmp.h"
 110
 111 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 112 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 113
 114 /* This assumes there is a 'tty' group */
 115 #define TTY_MODE 0620
 116
 117 #define SNDBUF_SIZE (8*1024*1024)
 118
 119 static int shift_fds(int fds[], unsigned n_fds) {
 120         int start, restart_from;
 121
 122         if (n_fds <= 0)
 123                 return 0;
 124
 125         /* Modifies the fds array! (sorts it) */
 126
 127         assert(fds);
 128
 129         start = 0;
 130         for (;;) {
 131                 int i;
 132
 133                 restart_from = -1;
 134
 135                 for (i = start; i < (int) n_fds; i++) {
 136                         int nfd;
 137
 138                         /* Already at right index? */
 139                         if (fds[i] == i+3)
 140                                 continue;
 141
 142                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 143                         if (nfd < 0)
 144                                 return -errno;
 145
 146                         safe_close(fds[i]);
 147                         fds[i] = nfd;
 148
 149                         /* Hmm, the fd we wanted isn't free? Then
 150                          * let's remember that and try again from here */
 151                         if (nfd != i+3 && restart_from < 0)
 152                                 restart_from = i;
 153                 }
 154
 155                 if (restart_from < 0)
 156                         break;
 157
 158                 start = restart_from;
 159         }
 160
 161         return 0;
 162 }
 163
 164 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 165         unsigned i, n_fds;
 166         int r;
 167
 168         n_fds = n_storage_fds + n_socket_fds;
 169         if (n_fds <= 0)
 170                 return 0;
 171
 172         assert(fds);
 173
 174         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 175          * O_NONBLOCK only applies to socket activation though. */
 176
 177         for (i = 0; i < n_fds; i++) {
 178
 179                 if (i < n_socket_fds) {
 180                         r = fd_nonblock(fds[i], nonblock);
 181                         if (r < 0)
 182                                 return r;
 183                 }
 184
 185                 /* We unconditionally drop FD_CLOEXEC from the fds,
 186                  * since after all we want to pass these fds to our
 187                  * children */
 188
 189                 r = fd_cloexec(fds[i], false);
 190                 if (r < 0)
 191                         return r;
 192         }
 193
 194         return 0;
 195 }
 196
 197 static const char *exec_context_tty_path(const ExecContext *context) {
 198         assert(context);
 199
 200         if (context->stdio_as_fds)
 201                 return NULL;
 202
 203         if (context->tty_path)
 204                 return context->tty_path;
 205
 206         return "/dev/console";
 207 }
 208
 209 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 210         const char *path;
 211
 212         assert(context);
 213
 214         path = exec_context_tty_path(context);
 215
 216         if (context->tty_vhangup) {
 217                 if (p && p->stdin_fd >= 0)
 218                         (void) terminal_vhangup_fd(p->stdin_fd);
 219                 else if (path)
 220                         (void) terminal_vhangup(path);
 221         }
 222
 223         if (context->tty_reset) {
 224                 if (p && p->stdin_fd >= 0)
 225                         (void) reset_terminal_fd(p->stdin_fd, true);
 226                 else if (path)
 227                         (void) reset_terminal(path);
 228         }
 229
 230         if (context->tty_vt_disallocate && path)
 231                 (void) vt_disallocate(path);
 232 }
 233
 234 static bool is_terminal_input(ExecInput i) {
 235         return IN_SET(i,
 236                       EXEC_INPUT_TTY,
 237                       EXEC_INPUT_TTY_FORCE,
 238                       EXEC_INPUT_TTY_FAIL);
 239 }
 240
 241 static bool is_terminal_output(ExecOutput o) {
 242         return IN_SET(o,
 243                       EXEC_OUTPUT_TTY,
 244                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 245                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 246                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 247 }
 248
 249 static bool is_syslog_output(ExecOutput o) {
 250         return IN_SET(o,
 251                       EXEC_OUTPUT_SYSLOG,
 252                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 253 }
 254
 255 static bool is_kmsg_output(ExecOutput o) {
 256         return IN_SET(o,
 257                       EXEC_OUTPUT_KMSG,
 258                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 259 }
 260
 261 static bool exec_context_needs_term(const ExecContext *c) {
 262         assert(c);
 263
 264         /* Return true if the execution context suggests we should set $TERM to something useful. */
 265
 266         if (is_terminal_input(c->std_input))
 267                 return true;
 268
 269         if (is_terminal_output(c->std_output))
 270                 return true;
 271
 272         if (is_terminal_output(c->std_error))
 273                 return true;
 274
 275         return !!c->tty_path;
 276 }
 277
 278 static int open_null_as(int flags, int nfd) {
 279         int fd;
 280
 281         assert(nfd >= 0);
 282
 283         fd = open("/dev/null", flags|O_NOCTTY);
 284         if (fd < 0)
 285                 return -errno;
 286
 287         return move_fd(fd, nfd, false);
 288 }
 289
 290 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 291         static const union sockaddr_union sa = {
 292                 .un.sun_family = AF_UNIX,
 293                 .un.sun_path = "/run/systemd/journal/stdout",
 294         };
 295         uid_t olduid = UID_INVALID;
 296         gid_t oldgid = GID_INVALID;
 297         int r;
 298
 299         if (gid_is_valid(gid)) {
 300                 oldgid = getgid();
 301
 302                 if (setegid(gid) < 0)
 303                         return -errno;
 304         }
 305
 306         if (uid_is_valid(uid)) {
 307                 olduid = getuid();
 308
 309                 if (seteuid(uid) < 0) {
 310                         r = -errno;
 311                         goto restore_gid;
 312                 }
 313         }
 314
 315         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 316
 317         /* If we fail to restore the uid or gid, things will likely
 318            fail later on. This should only happen if an LSM interferes. */
 319
 320         if (uid_is_valid(uid))
 321                 (void) seteuid(olduid);
 322
 323  restore_gid:
 324         if (gid_is_valid(gid))
 325                 (void) setegid(oldgid);
 326
 327         return r;
 328 }
 329
 330 static int connect_logger_as(
 331                 Unit *unit,
 332                 const ExecContext *context,
 333                 const ExecParameters *params,
 334                 ExecOutput output,
 335                 const char *ident,
 336                 int nfd,
 337                 uid_t uid,
 338                 gid_t gid) {
 339
 340         int fd, r;
 341
 342         assert(context);
 343         assert(params);
 344         assert(output < _EXEC_OUTPUT_MAX);
 345         assert(ident);
 346         assert(nfd >= 0);
 347
 348         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 349         if (fd < 0)
 350                 return -errno;
 351
 352         r = connect_journal_socket(fd, uid, gid);
 353         if (r < 0)
 354                 return r;
 355
 356         if (shutdown(fd, SHUT_RD) < 0) {
 357                 safe_close(fd);
 358                 return -errno;
 359         }
 360
 361         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 362
 363         dprintf(fd,
 364                 "%s\n"
 365                 "%s\n"
 366                 "%i\n"
 367                 "%i\n"
 368                 "%i\n"
 369                 "%i\n"
 370                 "%i\n",
 371                 context->syslog_identifier ?: ident,
 372                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 373                 context->syslog_priority,
 374                 !!context->syslog_level_prefix,
 375                 is_syslog_output(output),
 376                 is_kmsg_output(output),
 377                 is_terminal_output(output));
 378
 379         return move_fd(fd, nfd, false);
 380 }
 381 static int open_terminal_as(const char *path, int flags, int nfd) {
 382         int fd;
 383
 384         assert(path);
 385         assert(nfd >= 0);
 386
 387         fd = open_terminal(path, flags | O_NOCTTY);
 388         if (fd < 0)
 389                 return fd;
 390
 391         return move_fd(fd, nfd, false);
 392 }
 393
 394 static int acquire_path(const char *path, int flags, mode_t mode) {
 395         union sockaddr_union sa = {
 396                 .sa.sa_family = AF_UNIX,
 397         };
 398         int fd, r;
 399
 400         assert(path);
 401
 402         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 403                 flags |= O_CREAT;
 404
 405         fd = open(path, flags|O_NOCTTY, mode);
 406         if (fd >= 0)
 407                 return fd;
 408
 409         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 410                 return -errno;
 411         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 412                 return -ENXIO;
 413
 414         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 415
 416         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 417         if (fd < 0)
 418                 return -errno;
 419
 420         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 421         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 422                 safe_close(fd);
 423                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 424                                                            * indication that his wasn't an AF_UNIX socket after all */
 425         }
 426
 427         if ((flags & O_ACCMODE) == O_RDONLY)
 428                 r = shutdown(fd, SHUT_WR);
 429         else if ((flags & O_ACCMODE) == O_WRONLY)
 430                 r = shutdown(fd, SHUT_RD);
 431         else
 432                 return fd;
 433         if (r < 0) {
 434                 safe_close(fd);
 435                 return -errno;
 436         }
 437
 438         return fd;
 439 }
 440
 441 static int fixup_input(
 442                 const ExecContext *context,
 443                 int socket_fd,
 444                 bool apply_tty_stdin) {
 445
 446         ExecInput std_input;
 447
 448         assert(context);
 449
 450         std_input = context->std_input;
 451
 452         if (is_terminal_input(std_input) && !apply_tty_stdin)
 453                 return EXEC_INPUT_NULL;
 454
 455         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 456                 return EXEC_INPUT_NULL;
 457
 458         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 459                 return EXEC_INPUT_NULL;
 460
 461         return std_input;
 462 }
 463
 464 static int fixup_output(ExecOutput std_output, int socket_fd) {
 465
 466         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 467                 return EXEC_OUTPUT_INHERIT;
 468
 469         return std_output;
 470 }
 471
 472 static int setup_input(
 473                 const ExecContext *context,
 474                 const ExecParameters *params,
 475                 int socket_fd,
 476                 int named_iofds[3]) {
 477
 478         ExecInput i;
 479
 480         assert(context);
 481         assert(params);
 482
 483         if (params->stdin_fd >= 0) {
 484                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 485                         return -errno;
 486
 487                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 488                 if (isatty(STDIN_FILENO)) {
 489                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 490                         (void) reset_terminal_fd(STDIN_FILENO, true);
 491                 }
 492
 493                 return STDIN_FILENO;
 494         }
 495
 496         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 497
 498         switch (i) {
 499
 500         case EXEC_INPUT_NULL:
 501                 return open_null_as(O_RDONLY, STDIN_FILENO);
 502
 503         case EXEC_INPUT_TTY:
 504         case EXEC_INPUT_TTY_FORCE:
 505         case EXEC_INPUT_TTY_FAIL: {
 506                 int fd;
 507
 508                 fd = acquire_terminal(exec_context_tty_path(context),
 509                                       i == EXEC_INPUT_TTY_FAIL,
 510                                       i == EXEC_INPUT_TTY_FORCE,
 511                                       false,
 512                                       USEC_INFINITY);
 513                 if (fd < 0)
 514                         return fd;
 515
 516                 return move_fd(fd, STDIN_FILENO, false);
 517         }
 518
 519         case EXEC_INPUT_SOCKET:
 520                 assert(socket_fd >= 0);
 521
 522                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 523
 524         case EXEC_INPUT_NAMED_FD:
 525                 assert(named_iofds[STDIN_FILENO] >= 0);
 526
 527                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 528                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 529
 530         case EXEC_INPUT_DATA: {
 531                 int fd;
 532
 533                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 534                 if (fd < 0)
 535                         return fd;
 536
 537                 return move_fd(fd, STDIN_FILENO, false);
 538         }
 539
 540         case EXEC_INPUT_FILE: {
 541                 bool rw;
 542                 int fd;
 543
 544                 assert(context->stdio_file[STDIN_FILENO]);
 545
 546                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 547                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 548
 549                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 550                 if (fd < 0)
 551                         return fd;
 552
 553                 return move_fd(fd, STDIN_FILENO, false);
 554         }
 555
 556         default:
 557                 assert_not_reached("Unknown input type");
 558         }
 559 }
 560
 561 static int setup_output(
 562                 Unit *unit,
 563                 const ExecContext *context,
 564                 const ExecParameters *params,
 565                 int fileno,
 566                 int socket_fd,
 567                 int named_iofds[3],
 568                 const char *ident,
 569                 uid_t uid,
 570                 gid_t gid,
 571                 dev_t *journal_stream_dev,
 572                 ino_t *journal_stream_ino) {
 573
 574         ExecOutput o;
 575         ExecInput i;
 576         int r;
 577
 578         assert(unit);
 579         assert(context);
 580         assert(params);
 581         assert(ident);
 582         assert(journal_stream_dev);
 583         assert(journal_stream_ino);
 584
 585         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 586
 587                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 588                         return -errno;
 589
 590                 return STDOUT_FILENO;
 591         }
 592
 593         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 594                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 595                         return -errno;
 596
 597                 return STDERR_FILENO;
 598         }
 599
 600         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 601         o = fixup_output(context->std_output, socket_fd);
 602
 603         if (fileno == STDERR_FILENO) {
 604                 ExecOutput e;
 605                 e = fixup_output(context->std_error, socket_fd);
 606
 607                 /* This expects the input and output are already set up */
 608
 609                 /* Don't change the stderr file descriptor if we inherit all
 610                  * the way and are not on a tty */
 611                 if (e == EXEC_OUTPUT_INHERIT &&
 612                     o == EXEC_OUTPUT_INHERIT &&
 613                     i == EXEC_INPUT_NULL &&
 614                     !is_terminal_input(context->std_input) &&
 615                     getppid () != 1)
 616                         return fileno;
 617
 618                 /* Duplicate from stdout if possible */
 619                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 620                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 621
 622                 o = e;
 623
 624         } else if (o == EXEC_OUTPUT_INHERIT) {
 625                 /* If input got downgraded, inherit the original value */
 626                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 627                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 628
 629                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 630                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 631                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 632
 633                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 634                 if (getppid() != 1)
 635                         return fileno;
 636
 637                 /* We need to open /dev/null here anew, to get the right access mode. */
 638                 return open_null_as(O_WRONLY, fileno);
 639         }
 640
 641         switch (o) {
 642
 643         case EXEC_OUTPUT_NULL:
 644                 return open_null_as(O_WRONLY, fileno);
 645
 646         case EXEC_OUTPUT_TTY:
 647                 if (is_terminal_input(i))
 648                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 649
 650                 /* We don't reset the terminal if this is just about output */
 651                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 652
 653         case EXEC_OUTPUT_SYSLOG:
 654         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 655         case EXEC_OUTPUT_KMSG:
 656         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 657         case EXEC_OUTPUT_JOURNAL:
 658         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 659                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 660                 if (r < 0) {
 661                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 662                         r = open_null_as(O_WRONLY, fileno);
 663                 } else {
 664                         struct stat st;
 665
 666                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 667                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 668                          * services to detect whether they are connected to the journal or not.
 669                          *
 670                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 671                          * about STDERR as that's usually the best way to do logging. */
 672
 673                         if (fstat(fileno, &st) >= 0 &&
 674                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 675                                 *journal_stream_dev = st.st_dev;
 676                                 *journal_stream_ino = st.st_ino;
 677                         }
 678                 }
 679                 return r;
 680
 681         case EXEC_OUTPUT_SOCKET:
 682                 assert(socket_fd >= 0);
 683
 684                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 685
 686         case EXEC_OUTPUT_NAMED_FD:
 687                 assert(named_iofds[fileno] >= 0);
 688
 689                 (void) fd_nonblock(named_iofds[fileno], false);
 690                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 691
 692         case EXEC_OUTPUT_FILE: {
 693                 bool rw;
 694                 int fd;
 695
 696                 assert(context->stdio_file[fileno]);
 697
 698                 rw = context->std_input == EXEC_INPUT_FILE &&
 699                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 700
 701                 if (rw)
 702                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 703
 704                 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
 705                 if (fd < 0)
 706                         return fd;
 707
 708                 return move_fd(fd, fileno, false);
 709         }
 710
 711         default:
 712                 assert_not_reached("Unknown error type");
 713         }
 714 }
 715
 716 static int chown_terminal(int fd, uid_t uid) {
 717         struct stat st;
 718
 719         assert(fd >= 0);
 720
 721         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 722         if (isatty(fd) < 1)
 723                 return 0;
 724
 725         /* This might fail. What matters are the results. */
 726         (void) fchown(fd, uid, -1);
 727         (void) fchmod(fd, TTY_MODE);
 728
 729         if (fstat(fd, &st) < 0)
 730                 return -errno;
 731
 732         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 733                 return -EPERM;
 734
 735         return 0;
 736 }
 737
 738 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 739         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 740         int r;
 741
 742         assert(_saved_stdin);
 743         assert(_saved_stdout);
 744
 745         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 746         if (saved_stdin < 0)
 747                 return -errno;
 748
 749         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 750         if (saved_stdout < 0)
 751                 return -errno;
 752
 753         fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
 754         if (fd < 0)
 755                 return fd;
 756
 757         r = chown_terminal(fd, getuid());
 758         if (r < 0)
 759                 return r;
 760
 761         r = reset_terminal_fd(fd, true);
 762         if (r < 0)
 763                 return r;
 764
 765         if (dup2(fd, STDIN_FILENO) < 0)
 766                 return -errno;
 767
 768         if (dup2(fd, STDOUT_FILENO) < 0)
 769                 return -errno;
 770
 771         if (fd >= 2)
 772                 safe_close(fd);
 773         fd = -1;
 774
 775         *_saved_stdin = saved_stdin;
 776         *_saved_stdout = saved_stdout;
 777
 778         saved_stdin = saved_stdout = -1;
 779
 780         return 0;
 781 }
 782
 783 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 784         assert(err < 0);
 785
 786         if (err == -ETIMEDOUT)
 787                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 788         else {
 789                 errno = -err;
 790                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 791         }
 792 }
 793
 794 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 795         _cleanup_close_ int fd = -1;
 796
 797         assert(vc);
 798
 799         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 800         if (fd < 0)
 801                 return;
 802
 803         write_confirm_error_fd(err, fd, u);
 804 }
 805
 806 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 807         int r = 0;
 808
 809         assert(saved_stdin);
 810         assert(saved_stdout);
 811
 812         release_terminal();
 813
 814         if (*saved_stdin >= 0)
 815                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 816                         r = -errno;
 817
 818         if (*saved_stdout >= 0)
 819                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 820                         r = -errno;
 821
 822         *saved_stdin = safe_close(*saved_stdin);
 823         *saved_stdout = safe_close(*saved_stdout);
 824
 825         return r;
 826 }
 827
 828 enum {
 829         CONFIRM_PRETEND_FAILURE = -1,
 830         CONFIRM_PRETEND_SUCCESS =  0,
 831         CONFIRM_EXECUTE = 1,
 832 };
 833
 834 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 835         int saved_stdout = -1, saved_stdin = -1, r;
 836         _cleanup_free_ char *e = NULL;
 837         char c;
 838
 839         /* For any internal errors, assume a positive response. */
 840         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 841         if (r < 0) {
 842                 write_confirm_error(r, vc, u);
 843                 return CONFIRM_EXECUTE;
 844         }
 845
 846         /* confirm_spawn might have been disabled while we were sleeping. */
 847         if (manager_is_confirm_spawn_disabled(u->manager)) {
 848                 r = 1;
 849                 goto restore_stdio;
 850         }
 851
 852         e = ellipsize(cmdline, 60, 100);
 853         if (!e) {
 854                 log_oom();
 855                 r = CONFIRM_EXECUTE;
 856                 goto restore_stdio;
 857         }
 858
 859         for (;;) {
 860                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 861                 if (r < 0) {
 862                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 863                         r = CONFIRM_EXECUTE;
 864                         goto restore_stdio;
 865                 }
 866
 867                 switch (c) {
 868                 case 'c':
 869                         printf("Resuming normal execution.\n");
 870                         manager_disable_confirm_spawn();
 871                         r = 1;
 872                         break;
 873                 case 'D':
 874                         unit_dump(u, stdout, "  ");
 875                         continue; /* ask again */
 876                 case 'f':
 877                         printf("Failing execution.\n");
 878                         r = CONFIRM_PRETEND_FAILURE;
 879                         break;
 880                 case 'h':
 881                         printf("  c - continue, proceed without asking anymore\n"
 882                                "  D - dump, show the state of the unit\n"
 883                                "  f - fail, don't execute the command and pretend it failed\n"
 884                                "  h - help\n"
 885                                "  i - info, show a short summary of the unit\n"
 886                                "  j - jobs, show jobs that are in progress\n"
 887                                "  s - skip, don't execute the command and pretend it succeeded\n"
 888                                "  y - yes, execute the command\n");
 889                         continue; /* ask again */
 890                 case 'i':
 891                         printf("  Description: %s\n"
 892                                "  Unit:        %s\n"
 893                                "  Command:     %s\n",
 894                                u->id, u->description, cmdline);
 895                         continue; /* ask again */
 896                 case 'j':
 897                         manager_dump_jobs(u->manager, stdout, "  ");
 898                         continue; /* ask again */
 899                 case 'n':
 900                         /* 'n' was removed in favor of 'f'. */
 901                         printf("Didn't understand 'n', did you mean 'f'?\n");
 902                         continue; /* ask again */
 903                 case 's':
 904                         printf("Skipping execution.\n");
 905                         r = CONFIRM_PRETEND_SUCCESS;
 906                         break;
 907                 case 'y':
 908                         r = CONFIRM_EXECUTE;
 909                         break;
 910                 default:
 911                         assert_not_reached("Unhandled choice");
 912                 }
 913                 break;
 914         }
 915
 916 restore_stdio:
 917         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 918         return r;
 919 }
 920
 921 static int get_fixed_user(const ExecContext *c, const char **user,
 922                           uid_t *uid, gid_t *gid,
 923                           const char **home, const char **shell) {
 924         int r;
 925         const char *name;
 926
 927         assert(c);
 928
 929         if (!c->user)
 930                 return 0;
 931
 932         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 933          * (i.e. are "/" or "/bin/nologin"). */
 934
 935         name = c->user;
 936         r = get_user_creds_clean(&name, uid, gid, home, shell);
 937         if (r < 0)
 938                 return r;
 939
 940         *user = name;
 941         return 0;
 942 }
 943
 944 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 945         int r;
 946         const char *name;
 947
 948         assert(c);
 949
 950         if (!c->group)
 951                 return 0;
 952
 953         name = c->group;
 954         r = get_group_creds(&name, gid);
 955         if (r < 0)
 956                 return r;
 957
 958         *group = name;
 959         return 0;
 960 }
 961
 962 static int get_supplementary_groups(const ExecContext *c, const char *user,
 963                                     const char *group, gid_t gid,
 964                                     gid_t **supplementary_gids, int *ngids) {
 965         char **i;
 966         int r, k = 0;
 967         int ngroups_max;
 968         bool keep_groups = false;
 969         gid_t *groups = NULL;
 970         _cleanup_free_ gid_t *l_gids = NULL;
 971
 972         assert(c);
 973
 974         /*
 975          * If user is given, then lookup GID and supplementary groups list.
 976          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 977          * here and as early as possible so we keep the list of supplementary
 978          * groups of the caller.
 979          */
 980         if (user && gid_is_valid(gid) && gid != 0) {
 981                 /* First step, initialize groups from /etc/groups */
 982                 if (initgroups(user, gid) < 0)
 983                         return -errno;
 984
 985                 keep_groups = true;
 986         }
 987
 988         if (strv_isempty(c->supplementary_groups))
 989                 return 0;
 990
 991         /*
 992          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 993          * be positive, otherwise fail.
 994          */
 995         errno = 0;
 996         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 997         if (ngroups_max <= 0) {
 998                 if (errno > 0)
 999                         return -errno;
1000                 else
1001                         return -EOPNOTSUPP; /* For all other values */
1002         }
1003
1004         l_gids = new(gid_t, ngroups_max);
1005         if (!l_gids)
1006                 return -ENOMEM;
1007
1008         if (keep_groups) {
1009                 /*
1010                  * Lookup the list of groups that the user belongs to, we
1011                  * avoid NSS lookups here too for gid=0.
1012                  */
1013                 k = ngroups_max;
1014                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1015                         return -EINVAL;
1016         } else
1017                 k = 0;
1018
1019         STRV_FOREACH(i, c->supplementary_groups) {
1020                 const char *g;
1021
1022                 if (k >= ngroups_max)
1023                         return -E2BIG;
1024
1025                 g = *i;
1026                 r = get_group_creds(&g, l_gids+k);
1027                 if (r < 0)
1028                         return r;
1029
1030                 k++;
1031         }
1032
1033         /*
1034          * Sets ngids to zero to drop all supplementary groups, happens
1035          * when we are under root and SupplementaryGroups= is empty.
1036          */
1037         if (k == 0) {
1038                 *ngids = 0;
1039                 return 0;
1040         }
1041
1042         /* Otherwise get the final list of supplementary groups */
1043         groups = memdup(l_gids, sizeof(gid_t) * k);
1044         if (!groups)
1045                 return -ENOMEM;
1046
1047         *supplementary_gids = groups;
1048         *ngids = k;
1049
1050         groups = NULL;
1051
1052         return 0;
1053 }
1054
/* Applies the group credentials to the calling process: first the supplementary groups (only if ngids is
 * positive), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, a negative errno-style value otherwise. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {

        /* An empty list is left alone rather than passed on. */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Finally switch all three GIDs in one go. */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1073
1074 static int enforce_user(const ExecContext *context, uid_t uid) {
1075         assert(context);
1076
1077         if (!uid_is_valid(uid))
1078                 return 0;
1079
1080         /* Sets (but doesn't look up) the uid and make sure we keep the
1081          * capabilities while doing so. */
1082
1083         if (context->capability_ambient_set != 0) {
1084
1085                 /* First step: If we need to keep capabilities but
1086                  * drop privileges we need to make sure we keep our
1087                  * caps, while we drop privileges. */
1088                 if (uid != 0) {
1089                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1090
1091                         if (prctl(PR_GET_SECUREBITS) != sb)
1092                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1093                                         return -errno;
1094                 }
1095         }
1096
1097         /* Second step: actually set the uids */
1098         if (setresuid(uid, uid, uid) < 0)
1099                 return -errno;
1100
1101         /* At this point we should have all necessary capabilities but
1102            are otherwise a normal user. However, the caps might got
1103            corrupted due to the setresuid() so we need clean them up
1104            later. This is done outside of this call. */
1105
1106         return 0;
1107 }
1108
1109 #if HAVE_PAM
1110
1111 static int null_conv(
1112                 int num_msg,
1113                 const struct pam_message **msg,
1114                 struct pam_response **resp,
1115                 void *appdata_ptr) {
1116
1117         /* We don't support conversations */
1118
1119         return PAM_CONV_ERR;
1120 }
1121
1122 #endif
1123
/* Opens a PAM session for the service that is about to be executed.
 *
 *   name   PAM service name, passed to pam_start()
 *   user   user name, passed to pam_start()
 *   uid    UID the helper child switches to before waiting
 *   gid    GID the helper child switches to before waiting
 *   tty    optional TTY, set as PAM_TTY if non-NULL
 *   env    in/out: every entry is exported to PAM with pam_putenv(); on success the list is freed and
 *          replaced by the one returned by pam_getenvlist()
 *   fds    file descriptors that are closed in the helper child
 *   n_fds  number of entries in fds
 *
 * Returns 0 on success (and always 0 when built without PAM support), a negative errno-style value on
 * failure. PAM errors do not map to errno and are reported as -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure through a positive error number in its
                                 * return value; it never returns a negative value and does not set
                                 * errno. On failure "sig" is not written, hence bail out before
                                 * looking at it. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment with the one PAM handed back */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1340
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of path. If
 * that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: two parentheses plus at most eight characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* For long names keep the tail: the end of the process name is usually more interesting, since
         * the first bit might just be "systemd-". */
        n = strlen(base);
        if (n > 8) {
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1371
1372 static bool context_has_address_families(const ExecContext *c) {
1373         assert(c);
1374
1375         return c->address_families_whitelist ||
1376                 !set_isempty(c->address_families);
1377 }
1378
1379 static bool context_has_syscall_filters(const ExecContext *c) {
1380         assert(c);
1381
1382         return c->syscall_whitelist ||
1383                 !hashmap_isempty(c->syscall_filter);
1384 }
1385
1386 static bool context_has_no_new_privileges(const ExecContext *c) {
1387         assert(c);
1388
1389         if (c->no_new_privileges)
1390                 return true;
1391
1392         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1393                 return false;
1394
1395         /* We need NNP if we have any form of seccomp and are unprivileged */
1396         return context_has_address_families(c) ||
1397                 c->memory_deny_write_execute ||
1398                 c->restrict_realtime ||
1399                 exec_context_restrict_namespaces_set(c) ||
1400                 c->protect_kernel_tunables ||
1401                 c->protect_kernel_modules ||
1402                 c->private_devices ||
1403                 context_has_syscall_filters(c) ||
1404                 !set_isempty(c->syscall_archs) ||
1405                 c->lock_personality;
1406 }
1407
1408 #if HAVE_SECCOMP
1409
1410 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1411
1412         if (is_seccomp_available())
1413                 return false;
1414
1415         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1416         return true;
1417 }
1418
1419 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1420         uint32_t negative_action, default_action, action;
1421         int r;
1422
1423         assert(u);
1424         assert(c);
1425
1426         if (!context_has_syscall_filters(c))
1427                 return 0;
1428
1429         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1430                 return 0;
1431
1432         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1433
1434         if (c->syscall_whitelist) {
1435                 default_action = negative_action;
1436                 action = SCMP_ACT_ALLOW;
1437         } else {
1438                 default_action = SCMP_ACT_ALLOW;
1439                 action = negative_action;
1440         }
1441
1442         if (needs_ambient_hack) {
1443                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1444                 if (r < 0)
1445                         return r;
1446         }
1447
1448         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1449 }
1450
1451 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1452         assert(u);
1453         assert(c);
1454
1455         if (set_isempty(c->syscall_archs))
1456                 return 0;
1457
1458         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1459                 return 0;
1460
1461         return seccomp_restrict_archs(c->syscall_archs);
1462 }
1463
1464 static int apply_address_families(const Unit* u, const ExecContext *c) {
1465         assert(u);
1466         assert(c);
1467
1468         if (!context_has_address_families(c))
1469                 return 0;
1470
1471         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1472                 return 0;
1473
1474         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1475 }
1476
1477 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1478         assert(u);
1479         assert(c);
1480
1481         if (!c->memory_deny_write_execute)
1482                 return 0;
1483
1484         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1485                 return 0;
1486
1487         return seccomp_memory_deny_write_execute();
1488 }
1489
1490 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1491         assert(u);
1492         assert(c);
1493
1494         if (!c->restrict_realtime)
1495                 return 0;
1496
1497         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1498                 return 0;
1499
1500         return seccomp_restrict_realtime();
1501 }
1502
1503 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1504         assert(u);
1505         assert(c);
1506
1507         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1508          * let's protect even those systems where this is left on in the kernel. */
1509
1510         if (!c->protect_kernel_tunables)
1511                 return 0;
1512
1513         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1514                 return 0;
1515
1516         return seccomp_protect_sysctl();
1517 }
1518
1519 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1520         assert(u);
1521         assert(c);
1522
1523         /* Turn off module syscalls on ProtectKernelModules=yes */
1524
1525         if (!c->protect_kernel_modules)
1526                 return 0;
1527
1528         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1529                 return 0;
1530
1531         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1532 }
1533
1534 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1535         assert(u);
1536         assert(c);
1537
1538         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1539
1540         if (!c->private_devices)
1541                 return 0;
1542
1543         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1544                 return 0;
1545
1546         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1547 }
1548
1549 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1550         assert(u);
1551         assert(c);
1552
1553         if (!exec_context_restrict_namespaces_set(c))
1554                 return 0;
1555
1556         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1557                 return 0;
1558
1559         return seccomp_restrict_namespaces(c->restrict_namespaces);
1560 }
1561
1562 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1563         unsigned long personality;
1564         int r;
1565
1566         assert(u);
1567         assert(c);
1568
1569         if (!c->lock_personality)
1570                 return 0;
1571
1572         if (skip_seccomp_unavailable(u, "LockPersonality="))
1573                 return 0;
1574
1575         personality = c->personality;
1576
1577         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1578         if (personality == PERSONALITY_INVALID) {
1579
1580                 r = opinionated_personality(&personality);
1581                 if (r < 0)
1582                         return r;
1583         }
1584
1585         return seccomp_lock_personality(personality);
1586 }
1587
1588 #endif
1589
1590 static void do_idle_pipe_dance(int idle_pipe[4]) {
1591         assert(idle_pipe);
1592
1593         idle_pipe[1] = safe_close(idle_pipe[1]);
1594         idle_pipe[2] = safe_close(idle_pipe[2]);
1595
1596         if (idle_pipe[0] >= 0) {
1597                 int r;
1598
1599                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1600
1601                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1602                         ssize_t n;
1603
1604                         /* Signal systemd that we are bored and want to continue. */
1605                         n = write(idle_pipe[3], "x", 1);
1606                         if (n > 0)
1607                                 /* Wait for systemd to react to the signal above. */
1608                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1609                 }
1610
1611                 idle_pipe[0] = safe_close(idle_pipe[0]);
1612
1613         }
1614
1615         idle_pipe[3] = safe_close(idle_pipe[3]);
1616 }
1617
1618 static int build_environment(
1619                 Unit *u,
1620                 const ExecContext *c,
1621                 const ExecParameters *p,
1622                 unsigned n_fds,
1623                 const char *home,
1624                 const char *username,
1625                 const char *shell,
1626                 dev_t journal_stream_dev,
1627                 ino_t journal_stream_ino,
1628                 char ***ret) {
1629
1630         _cleanup_strv_free_ char **our_env = NULL;
1631         unsigned n_env = 0;
1632         char *x;
1633
1634         assert(u);
1635         assert(c);
1636         assert(ret);
1637
1638         our_env = new0(char*, 14);
1639         if (!our_env)
1640                 return -ENOMEM;
1641
1642         if (n_fds > 0) {
1643                 _cleanup_free_ char *joined = NULL;
1644
1645                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1646                         return -ENOMEM;
1647                 our_env[n_env++] = x;
1648
1649                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1650                         return -ENOMEM;
1651                 our_env[n_env++] = x;
1652
1653                 joined = strv_join(p->fd_names, ":");
1654                 if (!joined)
1655                         return -ENOMEM;
1656
1657                 x = strjoin("LISTEN_FDNAMES=", joined);
1658                 if (!x)
1659                         return -ENOMEM;
1660                 our_env[n_env++] = x;
1661         }
1662
1663         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1664                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1665                         return -ENOMEM;
1666                 our_env[n_env++] = x;
1667
1668                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1669                         return -ENOMEM;
1670                 our_env[n_env++] = x;
1671         }
1672
1673         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1674          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1675          * check the database directly. */
1676         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1677                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1678                 if (!x)
1679                         return -ENOMEM;
1680                 our_env[n_env++] = x;
1681         }
1682
1683         if (home) {
1684                 x = strappend("HOME=", home);
1685                 if (!x)
1686                         return -ENOMEM;
1687                 our_env[n_env++] = x;
1688         }
1689
1690         if (username) {
1691                 x = strappend("LOGNAME=", username);
1692                 if (!x)
1693                         return -ENOMEM;
1694                 our_env[n_env++] = x;
1695
1696                 x = strappend("USER=", username);
1697                 if (!x)
1698                         return -ENOMEM;
1699                 our_env[n_env++] = x;
1700         }
1701
1702         if (shell) {
1703                 x = strappend("SHELL=", shell);
1704                 if (!x)
1705                         return -ENOMEM;
1706                 our_env[n_env++] = x;
1707         }
1708
1709         if (!sd_id128_is_null(u->invocation_id)) {
1710                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1711                         return -ENOMEM;
1712
1713                 our_env[n_env++] = x;
1714         }
1715
1716         if (exec_context_needs_term(c)) {
1717                 const char *tty_path, *term = NULL;
1718
1719                 tty_path = exec_context_tty_path(c);
1720
1721                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1722                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1723                  * passes to PID 1 ends up all the way in the console login shown. */
1724
1725                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1726                         term = getenv("TERM");
1727                 if (!term)
1728                         term = default_term_for_tty(tty_path);
1729
1730                 x = strappend("TERM=", term);
1731                 if (!x)
1732                         return -ENOMEM;
1733                 our_env[n_env++] = x;
1734         }
1735
1736         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1737                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1738                         return -ENOMEM;
1739
1740                 our_env[n_env++] = x;
1741         }
1742
1743         our_env[n_env++] = NULL;
1744         assert(n_env <= 12);
1745
1746         *ret = our_env;
1747         our_env = NULL;
1748
1749         return 0;
1750 }
1751
1752 static int build_pass_environment(const ExecContext *c, char ***ret) {
1753         _cleanup_strv_free_ char **pass_env = NULL;
1754         size_t n_env = 0, n_bufsize = 0;
1755         char **i;
1756
1757         STRV_FOREACH(i, c->pass_environment) {
1758                 _cleanup_free_ char *x = NULL;
1759                 char *v;
1760
1761                 v = getenv(*i);
1762                 if (!v)
1763                         continue;
1764                 x = strjoin(*i, "=", v);
1765                 if (!x)
1766                         return -ENOMEM;
1767
1768                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1769                         return -ENOMEM;
1770
1771                 pass_env[n_env++] = x;
1772                 pass_env[n_env] = NULL;
1773                 x = NULL;
1774         }
1775
1776         *ret = pass_env;
1777         pass_env = NULL;
1778
1779         return 0;
1780 }
1781
1782 static bool exec_needs_mount_namespace(
1783                 const ExecContext *context,
1784                 const ExecParameters *params,
1785                 ExecRuntime *runtime) {
1786
1787         assert(context);
1788         assert(params);
1789
1790         if (context->root_image)
1791                 return true;
1792
1793         if (!strv_isempty(context->read_write_paths) ||
1794             !strv_isempty(context->read_only_paths) ||
1795             !strv_isempty(context->inaccessible_paths))
1796                 return true;
1797
1798         if (context->n_bind_mounts > 0 ||
1799             !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1800             !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1801             !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1802             !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1803             !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1804                 return true;
1805
1806         if (context->mount_flags != 0)
1807                 return true;
1808
1809         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1810                 return true;
1811
1812         if (context->private_devices ||
1813             context->protect_system != PROTECT_SYSTEM_NO ||
1814             context->protect_home != PROTECT_HOME_NO ||
1815             context->protect_kernel_tunables ||
1816             context->protect_kernel_modules ||
1817             context->protect_control_groups)
1818                 return true;
1819
1820         if (context->mount_apivfs && (context->root_image || context->root_directory))
1821                 return true;
1822
1823         return false;
1824 }
1825
1826 static int setup_private_users(uid_t uid, gid_t gid) {
1827         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1828         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1829         _cleanup_close_ int unshare_ready_fd = -1;
1830         _cleanup_(sigkill_waitp) pid_t pid = 0;
1831         uint64_t c = 1;
1832         siginfo_t si;
1833         ssize_t n;
1834         int r;
1835
1836         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1837          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1838          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1839          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1840          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1841          * continues execution normally. */
1842
1843         if (uid != 0 && uid_is_valid(uid)) {
1844                 r = asprintf(&uid_map,
1845                              "0 0 1\n"                      /* Map root → root */
1846                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1847                              uid, uid);
1848                 if (r < 0)
1849                         return -ENOMEM;
1850         } else {
1851                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1852                 if (!uid_map)
1853                         return -ENOMEM;
1854         }
1855
1856         if (gid != 0 && gid_is_valid(gid)) {
1857                 r = asprintf(&gid_map,
1858                              "0 0 1\n"                      /* Map root → root */
1859                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1860                              gid, gid);
1861                 if (r < 0)
1862                         return -ENOMEM;
1863         } else {
1864                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1865                 if (!gid_map)
1866                         return -ENOMEM;
1867         }
1868
1869         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1870          * namespace. */
1871         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1872         if (unshare_ready_fd < 0)
1873                 return -errno;
1874
1875         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1876          * failed. */
1877         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1878                 return -errno;
1879
1880         pid = fork();
1881         if (pid < 0)
1882                 return -errno;
1883
1884         if (pid == 0) {
1885                 _cleanup_close_ int fd = -1;
1886                 const char *a;
1887                 pid_t ppid;
1888
1889                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1890                  * here, after the parent opened its own user namespace. */
1891
1892                 ppid = getppid();
1893                 errno_pipe[0] = safe_close(errno_pipe[0]);
1894
1895                 /* Wait until the parent unshared the user namespace */
1896                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1897                         r = -errno;
1898                         goto child_fail;
1899                 }
1900
1901                 /* Disable the setgroups() system call in the child user namespace, for good. */
1902                 a = procfs_file_alloca(ppid, "setgroups");
1903                 fd = open(a, O_WRONLY|O_CLOEXEC);
1904                 if (fd < 0) {
1905                         if (errno != ENOENT) {
1906                                 r = -errno;
1907                                 goto child_fail;
1908                         }
1909
1910                         /* If the file is missing the kernel is too old, let's continue anyway. */
1911                 } else {
1912                         if (write(fd, "deny\n", 5) < 0) {
1913                                 r = -errno;
1914                                 goto child_fail;
1915                         }
1916
1917                         fd = safe_close(fd);
1918                 }
1919
1920                 /* First write the GID map */
1921                 a = procfs_file_alloca(ppid, "gid_map");
1922                 fd = open(a, O_WRONLY|O_CLOEXEC);
1923                 if (fd < 0) {
1924                         r = -errno;
1925                         goto child_fail;
1926                 }
1927                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1928                         r = -errno;
1929                         goto child_fail;
1930                 }
1931                 fd = safe_close(fd);
1932
1933                 /* The write the UID map */
1934                 a = procfs_file_alloca(ppid, "uid_map");
1935                 fd = open(a, O_WRONLY|O_CLOEXEC);
1936                 if (fd < 0) {
1937                         r = -errno;
1938                         goto child_fail;
1939                 }
1940                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1941                         r = -errno;
1942                         goto child_fail;
1943                 }
1944
1945                 _exit(EXIT_SUCCESS);
1946
1947         child_fail:
1948                 (void) write(errno_pipe[1], &r, sizeof(r));
1949                 _exit(EXIT_FAILURE);
1950         }
1951
1952         errno_pipe[1] = safe_close(errno_pipe[1]);
1953
1954         if (unshare(CLONE_NEWUSER) < 0)
1955                 return -errno;
1956
1957         /* Let the child know that the namespace is ready now */
1958         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1959                 return -errno;
1960
1961         /* Try to read an error code from the child */
1962         n = read(errno_pipe[0], &r, sizeof(r));
1963         if (n < 0)
1964                 return -errno;
1965         if (n == sizeof(r)) { /* an error code was sent to us */
1966                 if (r < 0)
1967                         return r;
1968                 return -EIO;
1969         }
1970         if (n != 0) /* on success we should have read 0 bytes */
1971                 return -EIO;
1972
1973         r = wait_for_terminate(pid, &si);
1974         if (r < 0)
1975                 return r;
1976         pid = 0;
1977
1978         /* If something strange happened with the child, let's consider this fatal, too */
1979         if (si.si_code != CLD_EXITED || si.si_status != 0)
1980                 return -EIO;
1981
1982         return 0;
1983 }
1984
1985 static int setup_exec_directory(
1986                 const ExecContext *context,
1987                 const ExecParameters *params,
1988                 uid_t uid,
1989                 gid_t gid,
1990                 ExecDirectoryType type,
1991                 int *exit_status) {
1992
1993         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1994                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1995                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1996                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1997                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1998                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1999         };
2000         char **rt;
2001         int r;
2002
2003         assert(context);
2004         assert(params);
2005         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2006         assert(exit_status);
2007
2008         if (!params->prefix[type])
2009                 return 0;
2010
2011         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2012                 if (!uid_is_valid(uid))
2013                         uid = 0;
2014                 if (!gid_is_valid(gid))
2015                         gid = 0;
2016         }
2017
2018         STRV_FOREACH(rt, context->directories[type].paths) {
2019                 _cleanup_free_ char *p = NULL, *pp = NULL;
2020                 const char *effective;
2021
2022                 p = strjoin(params->prefix[type], "/", *rt);
2023                 if (!p) {
2024                         r = -ENOMEM;
2025                         goto fail;
2026                 }
2027
2028                 r = mkdir_parents_label(p, 0755);
2029                 if (r < 0)
2030                         goto fail;
2031
2032                 if (context->dynamic_user &&
2033                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2034                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2035
2036                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2037                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2038                          * whose UID is later on reused. To lock this down we use the same trick used by container
2039                          * managers to prohibit host users to get access to files of the same UID in containers: we
2040                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2041                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2042                          * to make this directory permeable for the service itself.
2043                          *
2044                          * Specifically: for a service which wants a special directory "foo/" we first create a
2045                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2046                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2047                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2048                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2049                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2050                          * disabling the access boundary for the service and making sure it only gets access to the
2051                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2052                          *
2053                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2054                          * owned by the service itself.
2055                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2056                          * files or sockets with other services. */
2057
2058                         private_root = strjoin(params->prefix[type], "/private");
2059                         if (!private_root) {
2060                                 r = -ENOMEM;
2061                                 goto fail;
2062                         }
2063
2064                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2065                         r = mkdir_safe_label(private_root, 0700, 0, 0, false);
2066                         if (r < 0)
2067                                 goto fail;
2068
2069                         pp = strjoin(private_root, "/", *rt);
2070                         if (!pp) {
2071                                 r = -ENOMEM;
2072                                 goto fail;
2073                         }
2074
2075                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2076                         r = mkdir_parents_label(pp, 0755);
2077                         if (r < 0)
2078                                 goto fail;
2079
2080                         /* Finally, create the actual directory for the service */
2081                         r = mkdir_label(pp, context->directories[type].mode);
2082                         if (r < 0 && r != -EEXIST)
2083                                 goto fail;
2084
2085                         parent = dirname_malloc(p);
2086                         if (!parent) {
2087                                 r = -ENOMEM;
2088                                 goto fail;
2089                         }
2090
2091                         r = path_make_relative(parent, pp, &relative);
2092                         if (r < 0)
2093                                 goto fail;
2094
2095                         /* And link it up from the original place */
2096                         r = symlink_idempotent(relative, p);
2097                         if (r < 0)
2098                                 goto fail;
2099
2100                         effective = pp;
2101
2102                 } else {
2103                         r = mkdir_label(p, context->directories[type].mode);
2104                         if (r < 0 && r != -EEXIST)
2105                                 goto fail;
2106
2107                         effective = p;
2108                 }
2109
2110                 /* First lock down the access mode */
2111                 if (chmod(effective, context->directories[type].mode) < 0) {
2112                         r = -errno;
2113                         goto fail;
2114                 }
2115
2116                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2117                  * a service, and shall not be writable. */
2118                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2119                         continue;
2120
2121                 /* Then, change the ownership of the whole tree, if necessary */
2122                 r = path_chown_recursive(effective, uid, gid);
2123                 if (r < 0)
2124                         goto fail;
2125         }
2126
2127         return 0;
2128
2129 fail:
2130         *exit_status = exit_status_table[type];
2131         return r;
2132 }
2133
2134 static int setup_smack(
2135                 const ExecContext *context,
2136                 const ExecCommand *command) {
2137
2138         int r;
2139
2140         assert(context);
2141         assert(command);
2142
2143         if (context->smack_process_label) {
2144                 r = mac_smack_apply_pid(0, context->smack_process_label);
2145                 if (r < 0)
2146                         return r;
2147         }
2148 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2149         else {
2150                 _cleanup_free_ char *exec_label = NULL;
2151
2152                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2153                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2154                         return r;
2155
2156                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2157                 if (r < 0)
2158                         return r;
2159         }
2160 #endif
2161
2162         return 0;
2163 }
2164
2165 static int compile_bind_mounts(
2166                 const ExecContext *context,
2167                 const ExecParameters *params,
2168                 BindMount **ret_bind_mounts,
2169                 unsigned *ret_n_bind_mounts,
2170                 char ***ret_empty_directories) {
2171
2172         _cleanup_strv_free_ char **empty_directories = NULL;
2173         BindMount *bind_mounts;
2174         unsigned n, h = 0, i;
2175         ExecDirectoryType t;
2176         int r;
2177
2178         assert(context);
2179         assert(params);
2180         assert(ret_bind_mounts);
2181         assert(ret_n_bind_mounts);
2182         assert(ret_empty_directories);
2183
2184         n = context->n_bind_mounts;
2185         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2186                 if (!params->prefix[t])
2187                         continue;
2188
2189                 n += strv_length(context->directories[t].paths);
2190         }
2191
2192         if (n <= 0) {
2193                 *ret_bind_mounts = NULL;
2194                 *ret_n_bind_mounts = 0;
2195                 *ret_empty_directories = NULL;
2196                 return 0;
2197         }
2198
2199         bind_mounts = new(BindMount, n);
2200         if (!bind_mounts)
2201                 return -ENOMEM;
2202
2203         for (i = 0; i < context->n_bind_mounts; i++) {
2204                 BindMount *item = context->bind_mounts + i;
2205                 char *s, *d;
2206
2207                 s = strdup(item->source);
2208                 if (!s) {
2209                         r = -ENOMEM;
2210                         goto finish;
2211                 }
2212
2213                 d = strdup(item->destination);
2214                 if (!d) {
2215                         free(s);
2216                         r = -ENOMEM;
2217                         goto finish;
2218                 }
2219
2220                 bind_mounts[h++] = (BindMount) {
2221                         .source = s,
2222                         .destination = d,
2223                         .read_only = item->read_only,
2224                         .recursive = item->recursive,
2225                         .ignore_enoent = item->ignore_enoent,
2226                 };
2227         }
2228
2229         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2230                 char **suffix;
2231
2232                 if (!params->prefix[t])
2233                         continue;
2234
2235                 if (strv_isempty(context->directories[t].paths))
2236                         continue;
2237
2238                 if (context->dynamic_user &&
2239                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2240                         char *private_root;
2241
2242                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2243                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2244                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2245
2246                         private_root = strjoin(params->prefix[t], "/private");
2247                         if (!private_root) {
2248                                 r = -ENOMEM;
2249                                 goto finish;
2250                         }
2251
2252                         r = strv_consume(&empty_directories, private_root);
2253                         if (r < 0) {
2254                                 r = -ENOMEM;
2255                                 goto finish;
2256                         }
2257                 }
2258
2259                 STRV_FOREACH(suffix, context->directories[t].paths) {
2260                         char *s, *d;
2261
2262                         if (context->dynamic_user &&
2263                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2264                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2265                         else
2266                                 s = strjoin(params->prefix[t], "/", *suffix);
2267                         if (!s) {
2268                                 r = -ENOMEM;
2269                                 goto finish;
2270                         }
2271
2272                         d = strdup(s);
2273                         if (!d) {
2274                                 free(s);
2275                                 r = -ENOMEM;
2276                                 goto finish;
2277                         }
2278
2279                         bind_mounts[h++] = (BindMount) {
2280                                 .source = s,
2281                                 .destination = d,
2282                                 .read_only = false,
2283                                 .recursive = true,
2284                                 .ignore_enoent = false,
2285                         };
2286                 }
2287         }
2288
2289         assert(h == n);
2290
2291         *ret_bind_mounts = bind_mounts;
2292         *ret_n_bind_mounts = n;
2293         *ret_empty_directories = empty_directories;
2294
2295         empty_directories = NULL;
2296
2297         return (int) n;
2298
2299 finish:
2300         bind_mount_free_many(bind_mounts, h);
2301         return r;
2302 }
2303
2304 static int apply_mount_namespace(
2305                 Unit *u,
2306                 ExecCommand *command,
2307                 const ExecContext *context,
2308                 const ExecParameters *params,
2309                 ExecRuntime *runtime) {
2310
2311         _cleanup_strv_free_ char **empty_directories = NULL;
2312         char *tmp = NULL, *var = NULL;
2313         const char *root_dir = NULL, *root_image = NULL;
2314         NamespaceInfo ns_info = {
2315                 .ignore_protect_paths = false,
2316                 .private_dev = context->private_devices,
2317                 .protect_control_groups = context->protect_control_groups,
2318                 .protect_kernel_tunables = context->protect_kernel_tunables,
2319                 .protect_kernel_modules = context->protect_kernel_modules,
2320                 .mount_apivfs = context->mount_apivfs,
2321         };
2322         bool needs_sandboxing;
2323         BindMount *bind_mounts = NULL;
2324         unsigned n_bind_mounts = 0;
2325         int r;
2326
2327         assert(context);
2328
2329         /* The runtime struct only contains the parent of the private /tmp,
2330          * which is non-accessible to world users. Inside of it there's a /tmp
2331          * that is sticky, and that's the one we want to use here. */
2332
2333         if (context->private_tmp && runtime) {
2334                 if (runtime->tmp_dir)
2335                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2336                 if (runtime->var_tmp_dir)
2337                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2338         }
2339
2340         if (params->flags & EXEC_APPLY_CHROOT) {
2341                 root_image = context->root_image;
2342
2343                 if (!root_image)
2344                         root_dir = context->root_directory;
2345         }
2346
2347         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2348         if (r < 0)
2349                 return r;
2350
2351         /*
2352          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2353          * sandbox info, otherwise enforce it, don't ignore protected paths and
2354          * fail if we are enable to apply the sandbox inside the mount namespace.
2355          */
2356         if (!context->dynamic_user && root_dir)
2357                 ns_info.ignore_protect_paths = true;
2358
2359         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2360
2361         r = setup_namespace(root_dir, root_image,
2362                             &ns_info, context->read_write_paths,
2363                             needs_sandboxing ? context->read_only_paths : NULL,
2364                             needs_sandboxing ? context->inaccessible_paths : NULL,
2365                             empty_directories,
2366                             bind_mounts,
2367                             n_bind_mounts,
2368                             tmp,
2369                             var,
2370                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2371                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2372                             context->mount_flags,
2373                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2374
2375         bind_mount_free_many(bind_mounts, n_bind_mounts);
2376
2377         /* If we couldn't set up the namespace this is probably due to a
2378          * missing capability. In this case, silently proceeed. */
2379         if (IN_SET(r, -EPERM, -EACCES)) {
2380                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2381                 return 0;
2382         }
2383
2384         return r;
2385 }
2386
2387 static int apply_working_directory(
2388                 const ExecContext *context,
2389                 const ExecParameters *params,
2390                 const char *home,
2391                 const bool needs_mount_ns,
2392                 int *exit_status) {
2393
2394         const char *d, *wd;
2395
2396         assert(context);
2397         assert(exit_status);
2398
2399         if (context->working_directory_home) {
2400
2401                 if (!home) {
2402                         *exit_status = EXIT_CHDIR;
2403                         return -ENXIO;
2404                 }
2405
2406                 wd = home;
2407
2408         } else if (context->working_directory)
2409                 wd = context->working_directory;
2410         else
2411                 wd = "/";
2412
2413         if (params->flags & EXEC_APPLY_CHROOT) {
2414                 if (!needs_mount_ns && context->root_directory)
2415                         if (chroot(context->root_directory) < 0) {
2416                                 *exit_status = EXIT_CHROOT;
2417                                 return -errno;
2418                         }
2419
2420                 d = wd;
2421         } else
2422                 d = prefix_roota(context->root_directory, wd);
2423
2424         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2425                 *exit_status = EXIT_CHDIR;
2426                 return -errno;
2427         }
2428
2429         return 0;
2430 }
2431
2432 static int setup_keyring(
2433                 Unit *u,
2434                 const ExecContext *context,
2435                 const ExecParameters *p,
2436                 uid_t uid, gid_t gid) {
2437
2438         key_serial_t keyring;
2439         int r;
2440
2441         assert(u);
2442         assert(context);
2443         assert(p);
2444
2445         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2446          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2447          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2448          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2449          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2450          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2451
2452         if (!(p->flags & EXEC_NEW_KEYRING))
2453                 return 0;
2454
2455         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2456                 return 0;
2457
2458         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2459         if (keyring == -1) {
2460                 if (errno == ENOSYS)
2461                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2462                 else if (IN_SET(errno, EACCES, EPERM))
2463                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2464                 else if (errno == EDQUOT)
2465                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2466                 else
2467                         return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2468
2469                 return 0;
2470         }
2471
2472         /* Populate they keyring with the invocation ID by default. */
2473         if (!sd_id128_is_null(u->invocation_id)) {
2474                 key_serial_t key;
2475
2476                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2477                 if (key == -1)
2478                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2479                 else {
2480                         if (keyctl(KEYCTL_SETPERM, key,
2481                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2482                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2483                                 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2484                 }
2485         }
2486
2487         /* And now, make the keyring owned by the service's user */
2488         if (uid_is_valid(uid) || gid_is_valid(gid))
2489                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2490                         return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2491
2492         /* When requested link the user keyring into the session keyring. */
2493         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2494                 uid_t saved_uid;
2495                 gid_t saved_gid;
2496
2497                 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2498                  * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2499                  * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2500
2501                 saved_uid = getuid();
2502                 saved_gid = getgid();
2503
2504                 if (gid_is_valid(gid) && gid != saved_gid) {
2505                         if (setregid(gid, -1) < 0)
2506                                 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2507                 }
2508
2509                 if (uid_is_valid(uid) && uid != saved_uid) {
2510                         if (setreuid(uid, -1) < 0) {
2511                                 (void) setregid(saved_gid, -1);
2512                                 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2513                         }
2514                 }
2515
2516                 if (keyctl(KEYCTL_LINK,
2517                            KEY_SPEC_USER_KEYRING,
2518                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2519
2520                         r = -errno;
2521
2522                         (void) setreuid(saved_uid, -1);
2523                         (void) setregid(saved_gid, -1);
2524
2525                         return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2526                 }
2527
2528                 if (uid_is_valid(uid) && uid != saved_uid) {
2529                         if (setreuid(saved_uid, -1) < 0) {
2530                                 (void) setregid(saved_gid, -1);
2531                                 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2532                         }
2533                 }
2534
2535                 if (gid_is_valid(gid) && gid != saved_gid) {
2536                         if (setregid(saved_gid, -1) < 0)
2537                                 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2538                 }
2539         }
2540
2541         return 0;
2542 }
2543
/* Copies the valid (i.e. non-negative) file descriptors of a two-element socket pair to the end of array[],
 * bumping the element counter *n once for each fd stored. A NULL pair is accepted and ignored. The caller has
 * to make sure array[] has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair == NULL)
                return;

        for (k = 0; k < 2; k++) {
                /* Negative values mark unset slots, skip them */
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                *n += 1;
        }
}
2556
2557 static int close_remaining_fds(
2558                 const ExecParameters *params,
2559                 ExecRuntime *runtime,
2560                 DynamicCreds *dcreds,
2561                 int user_lookup_fd,
2562                 int socket_fd,
2563                 int *fds, unsigned n_fds) {
2564
2565         unsigned n_dont_close = 0;
2566         int dont_close[n_fds + 12];
2567
2568         assert(params);
2569
2570         if (params->stdin_fd >= 0)
2571                 dont_close[n_dont_close++] = params->stdin_fd;
2572         if (params->stdout_fd >= 0)
2573                 dont_close[n_dont_close++] = params->stdout_fd;
2574         if (params->stderr_fd >= 0)
2575                 dont_close[n_dont_close++] = params->stderr_fd;
2576
2577         if (socket_fd >= 0)
2578                 dont_close[n_dont_close++] = socket_fd;
2579         if (n_fds > 0) {
2580                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2581                 n_dont_close += n_fds;
2582         }
2583
2584         if (runtime)
2585                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2586
2587         if (dcreds) {
2588                 if (dcreds->user)
2589                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2590                 if (dcreds->group)
2591                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2592         }
2593
2594         if (user_lookup_fd >= 0)
2595                 dont_close[n_dont_close++] = user_lookup_fd;
2596
2597         return close_all_fds(dont_close, n_dont_close);
2598 }
2599
2600 static int send_user_lookup(
2601                 Unit *unit,
2602                 int user_lookup_fd,
2603                 uid_t uid,
2604                 gid_t gid) {
2605
2606         assert(unit);
2607
2608         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2609          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2610          * specified. */
2611
2612         if (user_lookup_fd < 0)
2613                 return 0;
2614
2615         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2616                 return 0;
2617
2618         if (writev(user_lookup_fd,
2619                (struct iovec[]) {
2620                            IOVEC_INIT(&uid, sizeof(uid)),
2621                            IOVEC_INIT(&gid, sizeof(gid)),
2622                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2623                 return -errno;
2624
2625         return 0;
2626 }
2627
2628 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2629         int r;
2630
2631         assert(c);
2632         assert(home);
2633         assert(buf);
2634
2635         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2636
2637         if (*home)
2638                 return 0;
2639
2640         if (!c->working_directory_home)
2641                 return 0;
2642
2643         if (uid == 0) {
2644                 /* Hardcode /root as home directory for UID 0 */
2645                 *home = "/root";
2646                 return 1;
2647         }
2648
2649         r = get_home_dir(buf);
2650         if (r < 0)
2651                 return r;
2652
2653         *home = *buf;
2654         return 1;
2655 }
2656
2657 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2658         _cleanup_strv_free_ char ** list = NULL;
2659         ExecDirectoryType t;
2660         int r;
2661
2662         assert(c);
2663         assert(p);
2664         assert(ret);
2665
2666         assert(c->dynamic_user);
2667
2668         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2669          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2670          * directories. */
2671
2672         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2673                 char **i;
2674
2675                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2676                         continue;
2677
2678                 if (!p->prefix[t])
2679                         continue;
2680
2681                 STRV_FOREACH(i, c->directories[t].paths) {
2682                         char *e;
2683
2684                         if (t == EXEC_DIRECTORY_RUNTIME)
2685                                 e = strjoin(p->prefix[t], "/", *i);
2686                         else
2687                                 e = strjoin(p->prefix[t], "/private/", *i);
2688                         if (!e)
2689                                 return -ENOMEM;
2690
2691                         r = strv_consume(&list, e);
2692                         if (r < 0)
2693                                 return r;
2694                 }
2695         }
2696
2697         *ret = list;
2698         list = NULL;
2699
2700         return 0;
2701 }
2702
2703 static int exec_child(
2704                 Unit *unit,
2705                 ExecCommand *command,
2706                 const ExecContext *context,
2707                 const ExecParameters *params,
2708                 ExecRuntime *runtime,
2709                 DynamicCreds *dcreds,
2710                 char **argv,
2711                 int socket_fd,
2712                 int named_iofds[3],
2713                 int *fds,
2714                 unsigned n_storage_fds,
2715                 unsigned n_socket_fds,
2716                 char **files_env,
2717                 int user_lookup_fd,
2718                 int *exit_status) {
2719
2720         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2721         _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2722         _cleanup_free_ gid_t *supplementary_gids = NULL;
2723         const char *username = NULL, *groupname = NULL;
2724         const char *home = NULL, *shell = NULL;
2725         dev_t journal_stream_dev = 0;
2726         ino_t journal_stream_ino = 0;
2727         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2728                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2729                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2730                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2731 #if HAVE_SELINUX
2732         bool use_selinux = false;
2733 #endif
2734 #if ENABLE_SMACK
2735         bool use_smack = false;
2736 #endif
2737 #if HAVE_APPARMOR
2738         bool use_apparmor = false;
2739 #endif
2740         uid_t uid = UID_INVALID;
2741         gid_t gid = GID_INVALID;
2742         int i, r, ngids = 0;
2743         unsigned n_fds;
2744         ExecDirectoryType dt;
2745         int secure_bits;
2746
2747         assert(unit);
2748         assert(command);
2749         assert(context);
2750         assert(params);
2751         assert(exit_status);
2752
2753         rename_process_from_path(command->path);
2754
2755         /* We reset exactly these signals, since they are the
2756          * only ones we set to SIG_IGN in the main daemon. All
2757          * others we leave untouched because we set them to
2758          * SIG_DFL or a valid handler initially, both of which
2759          * will be demoted to SIG_DFL. */
2760         (void) default_signals(SIGNALS_CRASH_HANDLER,
2761                                SIGNALS_IGNORE, -1);
2762
2763         if (context->ignore_sigpipe)
2764                 (void) ignore_signals(SIGPIPE, -1);
2765
2766         r = reset_signal_mask();
2767         if (r < 0) {
2768                 *exit_status = EXIT_SIGNAL_MASK;
2769                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2770         }
2771
2772         if (params->idle_pipe)
2773                 do_idle_pipe_dance(params->idle_pipe);
2774
2775         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2776          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2777          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2778          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2779
2780         log_forget_fds();
2781         log_set_open_when_needed(true);
2782
2783         /* In case anything used libc syslog(), close this here, too */
2784         closelog();
2785
2786         n_fds = n_storage_fds + n_socket_fds;
2787         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2788         if (r < 0) {
2789                 *exit_status = EXIT_FDS;
2790                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2791         }
2792
2793         if (!context->same_pgrp)
2794                 if (setsid() < 0) {
2795                         *exit_status = EXIT_SETSID;
2796                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2797                 }
2798
2799         exec_context_tty_reset(context, params);
2800
2801         if (unit_shall_confirm_spawn(unit)) {
2802                 const char *vc = params->confirm_spawn;
2803                 _cleanup_free_ char *cmdline = NULL;
2804
2805                 cmdline = exec_command_line(argv);
2806                 if (!cmdline) {
2807                         *exit_status = EXIT_MEMORY;
2808                         return log_oom();
2809                 }
2810
2811                 r = ask_for_confirmation(vc, unit, cmdline);
2812                 if (r != CONFIRM_EXECUTE) {
2813                         if (r == CONFIRM_PRETEND_SUCCESS) {
2814                                 *exit_status = EXIT_SUCCESS;
2815                                 return 0;
2816                         }
2817                         *exit_status = EXIT_CONFIRM;
2818                         log_unit_error(unit, "Execution cancelled by the user");
2819                         return -ECANCELED;
2820                 }
2821         }
2822
2823         if (context->dynamic_user && dcreds) {
2824                 _cleanup_strv_free_ char **suggested_paths = NULL;
2825
2826                 /* Make sure we bypass our own NSS module for any NSS checks */
2827                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2828                         *exit_status = EXIT_USER;
2829                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2830                 }
2831
2832                 r = compile_suggested_paths(context, params, &suggested_paths);
2833                 if (r < 0) {
2834                         *exit_status = EXIT_MEMORY;
2835                         return log_oom();
2836                 }
2837
2838                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2839                 if (r < 0) {
2840                         *exit_status = EXIT_USER;
2841                         if (r == -EILSEQ) {
2842                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2843                                 return -EOPNOTSUPP;
2844                         }
2845                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2846                 }
2847
2848                 if (!uid_is_valid(uid)) {
2849                         *exit_status = EXIT_USER;
2850                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2851                         return -ESRCH;
2852                 }
2853
2854                 if (!gid_is_valid(gid)) {
2855                         *exit_status = EXIT_USER;
2856                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2857                         return -ESRCH;
2858                 }
2859
2860                 if (dcreds->user)
2861                         username = dcreds->user->name;
2862
2863         } else {
2864                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2865                 if (r < 0) {
2866                         *exit_status = EXIT_USER;
2867                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2868                 }
2869
2870                 r = get_fixed_group(context, &groupname, &gid);
2871                 if (r < 0) {
2872                         *exit_status = EXIT_GROUP;
2873                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2874                 }
2875         }
2876
2877         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2878         r = get_supplementary_groups(context, username, groupname, gid,
2879                                      &supplementary_gids, &ngids);
2880         if (r < 0) {
2881                 *exit_status = EXIT_GROUP;
2882                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2883         }
2884
2885         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2886         if (r < 0) {
2887                 *exit_status = EXIT_USER;
2888                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2889         }
2890
2891         user_lookup_fd = safe_close(user_lookup_fd);
2892
2893         r = acquire_home(context, uid, &home, &home_buffer);
2894         if (r < 0) {
2895                 *exit_status = EXIT_CHDIR;
2896                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2897         }
2898
2899         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2900          * must sure to drop O_NONBLOCK */
2901         if (socket_fd >= 0)
2902                 (void) fd_nonblock(socket_fd, false);
2903
2904         r = setup_input(context, params, socket_fd, named_iofds);
2905         if (r < 0) {
2906                 *exit_status = EXIT_STDIN;
2907                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2908         }
2909
2910         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2911         if (r < 0) {
2912                 *exit_status = EXIT_STDOUT;
2913                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2914         }
2915
2916         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2917         if (r < 0) {
2918                 *exit_status = EXIT_STDERR;
2919                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2920         }
2921
2922         if (params->cgroup_path) {
2923                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2924                 if (r < 0) {
2925                         *exit_status = EXIT_CGROUP;
2926                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2927                 }
2928         }
2929
2930         if (context->oom_score_adjust_set) {
2931                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2932
2933                 /* When we can't make this change due to EPERM, then
2934                  * let's silently skip over it. User namespaces
2935                  * prohibit write access to this file, and we
2936                  * shouldn't trip up over that. */
2937
2938                 sprintf(t, "%i", context->oom_score_adjust);
2939                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2940                 if (IN_SET(r, -EPERM, -EACCES))
2941                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2942                 else if (r < 0) {
2943                         *exit_status = EXIT_OOM_ADJUST;
2944                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2945                 }
2946         }
2947
2948         if (context->nice_set)
2949                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2950                         *exit_status = EXIT_NICE;
2951                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2952                 }
2953
2954         if (context->cpu_sched_set) {
2955                 struct sched_param param = {
2956                         .sched_priority = context->cpu_sched_priority,
2957                 };
2958
2959                 r = sched_setscheduler(0,
2960                                        context->cpu_sched_policy |
2961                                        (context->cpu_sched_reset_on_fork ?
2962                                         SCHED_RESET_ON_FORK : 0),
2963                                        &param);
2964                 if (r < 0) {
2965                         *exit_status = EXIT_SETSCHEDULER;
2966                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2967                 }
2968         }
2969
2970         if (context->cpuset)
2971                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2972                         *exit_status = EXIT_CPUAFFINITY;
2973                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2974                 }
2975
2976         if (context->ioprio_set)
2977                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2978                         *exit_status = EXIT_IOPRIO;
2979                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2980                 }
2981
2982         if (context->timer_slack_nsec != NSEC_INFINITY)
2983                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2984                         *exit_status = EXIT_TIMERSLACK;
2985                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2986                 }
2987
2988         if (context->personality != PERSONALITY_INVALID) {
2989                 r = safe_personality(context->personality);
2990                 if (r < 0) {
2991                         *exit_status = EXIT_PERSONALITY;
2992                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2993                 }
2994         }
2995
2996         if (context->utmp_id)
2997                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2998                                       context->tty_path,
2999                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3000                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3001                                       USER_PROCESS,
3002                                       username);
3003
3004         if (context->user) {
3005                 r = chown_terminal(STDIN_FILENO, uid);
3006                 if (r < 0) {
3007                         *exit_status = EXIT_STDIN;
3008                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3009                 }
3010         }
3011
3012         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3013          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3014          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3015          * touch a single hierarchy too. */
3016         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3017                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3018                 if (r < 0) {
3019                         *exit_status = EXIT_CGROUP;
3020                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3021                 }
3022         }
3023
3024         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3025                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3026                 if (r < 0)
3027                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3028         }
3029
3030         r = build_environment(
3031                         unit,
3032                         context,
3033                         params,
3034                         n_fds,
3035                         home,
3036                         username,
3037                         shell,
3038                         journal_stream_dev,
3039                         journal_stream_ino,
3040                         &our_env);
3041         if (r < 0) {
3042                 *exit_status = EXIT_MEMORY;
3043                 return log_oom();
3044         }
3045
3046         r = build_pass_environment(context, &pass_env);
3047         if (r < 0) {
3048                 *exit_status = EXIT_MEMORY;
3049                 return log_oom();
3050         }
3051
3052         accum_env = strv_env_merge(5,
3053                                    params->environment,
3054                                    our_env,
3055                                    pass_env,
3056                                    context->environment,
3057                                    files_env,
3058                                    NULL);
3059         if (!accum_env) {
3060                 *exit_status = EXIT_MEMORY;
3061                 return log_oom();
3062         }
3063         accum_env = strv_env_clean(accum_env);
3064
3065         (void) umask(context->umask);
3066
3067         r = setup_keyring(unit, context, params, uid, gid);
3068         if (r < 0) {
3069                 *exit_status = EXIT_KEYRING;
3070                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3071         }
3072
3073         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3074         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3075
3076         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3077         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3078
3079         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3080         if (needs_ambient_hack)
3081                 needs_setuid = false;
3082         else
3083                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3084
3085         if (needs_sandboxing) {
3086                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3087                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3088                  * impacting our own code paths. */
3089
3090 #if HAVE_SELINUX
3091                 use_selinux = mac_selinux_use();
3092 #endif
3093 #if ENABLE_SMACK
3094                 use_smack = mac_smack_use();
3095 #endif
3096 #if HAVE_APPARMOR
3097                 use_apparmor = mac_apparmor_use();
3098 #endif
3099         }
3100
3101         if (needs_setuid) {
3102                 if (context->pam_name && username) {
3103                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3104                         if (r < 0) {
3105                                 *exit_status = EXIT_PAM;
3106                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3107                         }
3108                 }
3109         }
3110
3111         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3112                 if (ns_type_supported(NAMESPACE_NET)) {
3113                         r = setup_netns(runtime->netns_storage_socket);
3114                         if (r < 0) {
3115                                 *exit_status = EXIT_NETWORK;
3116                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3117                         }
3118                 } else
3119                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3120         }
3121
3122         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3123         if (needs_mount_namespace) {
3124                 r = apply_mount_namespace(unit, command, context, params, runtime);
3125                 if (r < 0) {
3126                         *exit_status = EXIT_NAMESPACE;
3127                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3128                 }
3129         }
3130
3131         /* Apply just after mount namespace setup */
3132         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3133         if (r < 0)
3134                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3135
3136         /* Drop groups as early as possbile */
3137         if (needs_setuid) {
3138                 r = enforce_groups(gid, supplementary_gids, ngids);
3139                 if (r < 0) {
3140                         *exit_status = EXIT_GROUP;
3141                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3142                 }
3143         }
3144
3145         if (needs_sandboxing) {
3146 #if HAVE_SELINUX
3147                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3148                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3149                         if (r < 0) {
3150                                 *exit_status = EXIT_SELINUX_CONTEXT;
3151                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3152                         }
3153                 }
3154 #endif
3155
3156                 if (context->private_users) {
3157                         r = setup_private_users(uid, gid);
3158                         if (r < 0) {
3159                                 *exit_status = EXIT_USER;
3160                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3161                         }
3162                 }
3163         }
3164
3165         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3166          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3167          * was needed to upload the policy and can now be closed as well. */
3168         r = close_all_fds(fds, n_fds);
3169         if (r >= 0)
3170                 r = shift_fds(fds, n_fds);
3171         if (r >= 0)
3172                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3173         if (r < 0) {
3174                 *exit_status = EXIT_FDS;
3175                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3176         }
3177
3178         secure_bits = context->secure_bits;
3179
3180         if (needs_sandboxing) {
3181                 uint64_t bset;
3182
3183                 for (i = 0; i < _RLIMIT_MAX; i++) {
3184
3185                         if (!context->rlimit[i])
3186                                 continue;
3187
3188                         r = setrlimit_closest(i, context->rlimit[i]);
3189                         if (r < 0) {
3190                                 *exit_status = EXIT_LIMITS;
3191                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3192                         }
3193                 }
3194
3195                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3196                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3197                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3198                                 *exit_status = EXIT_LIMITS;
3199                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3200                         }
3201                 }
3202
3203 #if ENABLE_SMACK
3204                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3205                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3206                 if (use_smack) {
3207                         r = setup_smack(context, command);
3208                         if (r < 0) {
3209                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3210                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3211                         }
3212                 }
3213 #endif
3214
3215                 bset = context->capability_bounding_set;
3216                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3217                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3218                  * instead of us doing that */
3219                 if (needs_ambient_hack)
3220                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3221                                 (UINT64_C(1) << CAP_SETUID) |
3222                                 (UINT64_C(1) << CAP_SETGID);
3223
3224                 if (!cap_test_all(bset)) {
3225                         r = capability_bounding_set_drop(bset, false);
3226                         if (r < 0) {
3227                                 *exit_status = EXIT_CAPABILITIES;
3228                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3229                         }
3230                 }
3231
3232                 /* This is done before enforce_user, but ambient set
3233                  * does not survive over setresuid() if keep_caps is not set. */
3234                 if (!needs_ambient_hack &&
3235                     context->capability_ambient_set != 0) {
3236                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3237                         if (r < 0) {
3238                                 *exit_status = EXIT_CAPABILITIES;
3239                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3240                         }
3241                 }
3242         }
3243
3244         if (needs_setuid) {
3245                 if (context->user) {
3246                         r = enforce_user(context, uid);
3247                         if (r < 0) {
3248                                 *exit_status = EXIT_USER;
3249                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3250                         }
3251
3252                         if (!needs_ambient_hack &&
3253                             context->capability_ambient_set != 0) {
3254
3255                                 /* Fix the ambient capabilities after user change. */
3256                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3257                                 if (r < 0) {
3258                                         *exit_status = EXIT_CAPABILITIES;
3259                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3260                                 }
3261
3262                                 /* If we were asked to change user and ambient capabilities
3263                                  * were requested, we had to add keep-caps to the securebits
3264                                  * so that we would maintain the inherited capability set
3265                                  * through the setresuid(). Make sure that the bit is added
3266                                  * also to the context secure_bits so that we don't try to
3267                                  * drop the bit away next. */
3268
3269                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3270                         }
3271                 }
3272         }
3273
3274         if (needs_sandboxing) {
3275                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3276                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3277                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3278                  * are restricted. */
3279
3280 #if HAVE_SELINUX
3281                 if (use_selinux) {
3282                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3283
3284                         if (exec_context) {
3285                                 r = setexeccon(exec_context);
3286                                 if (r < 0) {
3287                                         *exit_status = EXIT_SELINUX_CONTEXT;
3288                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3289                                 }
3290                         }
3291                 }
3292 #endif
3293
3294 #if HAVE_APPARMOR
3295                 if (use_apparmor && context->apparmor_profile) {
3296                         r = aa_change_onexec(context->apparmor_profile);
3297                         if (r < 0 && !context->apparmor_profile_ignore) {
3298                                 *exit_status = EXIT_APPARMOR_PROFILE;
3299                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3300                         }
3301                 }
3302 #endif
3303
3304                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3305                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3306                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3307                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3308                                 *exit_status = EXIT_SECUREBITS;
3309                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3310                         }
3311
3312                 if (context_has_no_new_privileges(context))
3313                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3314                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3315                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3316                         }
3317
3318 #if HAVE_SECCOMP
3319                 r = apply_address_families(unit, context);
3320                 if (r < 0) {
3321                         *exit_status = EXIT_ADDRESS_FAMILIES;
3322                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3323                 }
3324
3325                 r = apply_memory_deny_write_execute(unit, context);
3326                 if (r < 0) {
3327                         *exit_status = EXIT_SECCOMP;
3328                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3329                 }
3330
3331                 r = apply_restrict_realtime(unit, context);
3332                 if (r < 0) {
3333                         *exit_status = EXIT_SECCOMP;
3334                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3335                 }
3336
3337                 r = apply_restrict_namespaces(unit, context);
3338                 if (r < 0) {
3339                         *exit_status = EXIT_SECCOMP;
3340                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3341                 }
3342
3343                 r = apply_protect_sysctl(unit, context);
3344                 if (r < 0) {
3345                         *exit_status = EXIT_SECCOMP;
3346                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3347                 }
3348
3349                 r = apply_protect_kernel_modules(unit, context);
3350                 if (r < 0) {
3351                         *exit_status = EXIT_SECCOMP;
3352                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3353                 }
3354
3355                 r = apply_private_devices(unit, context);
3356                 if (r < 0) {
3357                         *exit_status = EXIT_SECCOMP;
3358                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3359                 }
3360
3361                 r = apply_syscall_archs(unit, context);
3362                 if (r < 0) {
3363                         *exit_status = EXIT_SECCOMP;
3364                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3365                 }
3366
3367                 r = apply_lock_personality(unit, context);
3368                 if (r < 0) {
3369                         *exit_status = EXIT_SECCOMP;
3370                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3371                 }
3372
3373                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3374                  * by the filter as little as possible. */
3375                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3376                 if (r < 0) {
3377                         *exit_status = EXIT_SECCOMP;
3378                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3379                 }
3380 #endif
3381         }
3382
3383         if (!strv_isempty(context->unset_environment)) {
3384                 char **ee = NULL;
3385
3386                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3387                 if (!ee) {
3388                         *exit_status = EXIT_MEMORY;
3389                         return log_oom();
3390                 }
3391
3392                 strv_free(accum_env);
3393                 accum_env = ee;
3394         }
3395
3396         final_argv = replace_env_argv(argv, accum_env);
3397         if (!final_argv) {
3398                 *exit_status = EXIT_MEMORY;
3399                 return log_oom();
3400         }
3401
3402         if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3403                 _cleanup_free_ char *line;
3404
3405                 line = exec_command_line(final_argv);
3406                 if (line) {
3407                         log_struct(LOG_DEBUG,
3408                                    "EXECUTABLE=%s", command->path,
3409                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3410                                    LOG_UNIT_ID(unit),
3411                                    LOG_UNIT_INVOCATION_ID(unit),
3412                                    NULL);
3413                 }
3414         }
3415
3416         execve(command->path, final_argv, accum_env);
3417
3418         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3419
3420                 log_struct_errno(LOG_INFO, errno,
3421                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3422                                  LOG_UNIT_ID(unit),
3423                                  LOG_UNIT_INVOCATION_ID(unit),
3424                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3425                                                   command->path),
3426                                  "EXECUTABLE=%s", command->path,
3427                                  NULL);
3428
3429                 return 0;
3430         }
3431
3432         *exit_status = EXIT_EXEC;
3433         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3434 }
3435
3436 int exec_spawn(Unit *unit,
3437                ExecCommand *command,
3438                const ExecContext *context,
3439                const ExecParameters *params,
3440                ExecRuntime *runtime,
3441                DynamicCreds *dcreds,
3442                pid_t *ret) {
3443
3444         _cleanup_strv_free_ char **files_env = NULL;
3445         int *fds = NULL;
3446         unsigned n_storage_fds = 0, n_socket_fds = 0;
3447         _cleanup_free_ char *line = NULL;
3448         int socket_fd, r;
3449         int named_iofds[3] = { -1, -1, -1 };
3450         char **argv;
3451         pid_t pid;
3452
3453         assert(unit);
3454         assert(command);
3455         assert(context);
3456         assert(ret);
3457         assert(params);
3458         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3459
3460         if (context->std_input == EXEC_INPUT_SOCKET ||
3461             context->std_output == EXEC_OUTPUT_SOCKET ||
3462             context->std_error == EXEC_OUTPUT_SOCKET) {
3463
3464                 if (params->n_socket_fds > 1) {
3465                         log_unit_error(unit, "Got more than one socket.");
3466                         return -EINVAL;
3467                 }
3468
3469                 if (params->n_socket_fds == 0) {
3470                         log_unit_error(unit, "Got no socket.");
3471                         return -EINVAL;
3472                 }
3473
3474                 socket_fd = params->fds[0];
3475         } else {
3476                 socket_fd = -1;
3477                 fds = params->fds;
3478                 n_storage_fds = params->n_storage_fds;
3479                 n_socket_fds = params->n_socket_fds;
3480         }
3481
3482         r = exec_context_named_iofds(unit, context, params, named_iofds);
3483         if (r < 0)
3484                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3485
3486         r = exec_context_load_environment(unit, context, &files_env);
3487         if (r < 0)
3488                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3489
3490         argv = params->argv ?: command->argv;
3491         line = exec_command_line(argv);
3492         if (!line)
3493                 return log_oom();
3494
3495         log_struct(LOG_DEBUG,
3496                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3497                    "EXECUTABLE=%s", command->path,
3498                    LOG_UNIT_ID(unit),
3499                    LOG_UNIT_INVOCATION_ID(unit),
3500                    NULL);
3501
3502         pid = fork();
3503         if (pid < 0)
3504                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3505
3506         if (pid == 0) {
3507                 int exit_status = EXIT_SUCCESS;
3508
3509                 r = exec_child(unit,
3510                                command,
3511                                context,
3512                                params,
3513                                runtime,
3514                                dcreds,
3515                                argv,
3516                                socket_fd,
3517                                named_iofds,
3518                                fds,
3519                                n_storage_fds,
3520                                n_socket_fds,
3521                                files_env,
3522                                unit->manager->user_lookup_fds[1],
3523                                &exit_status);
3524
3525                 if (r < 0) {
3526                         log_struct_errno(LOG_ERR, r,
3527                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3528                                          LOG_UNIT_ID(unit),
3529                                          LOG_UNIT_INVOCATION_ID(unit),
3530                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3531                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3532                                                           command->path),
3533                                          "EXECUTABLE=%s", command->path,
3534                                          NULL);
3535                 }
3536
3537                 _exit(exit_status);
3538         }
3539
3540         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3541
3542         /* We add the new process to the cgroup both in the child (so
3543          * that we can be sure that no user code is ever executed
3544          * outside of the cgroup) and in the parent (so that we can be
3545          * sure that when we kill the cgroup the process will be
3546          * killed too). */
3547         if (params->cgroup_path)
3548                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3549
3550         exec_status_start(&command->exec_status, pid);
3551
3552         *ret = pid;
3553         return 0;
3554 }
3555
3556 void exec_context_init(ExecContext *c) {
3557         ExecDirectoryType i;
3558
3559         assert(c);
3560
3561         c->umask = 0022;
3562         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3563         c->cpu_sched_policy = SCHED_OTHER;
3564         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3565         c->syslog_level_prefix = true;
3566         c->ignore_sigpipe = true;
3567         c->timer_slack_nsec = NSEC_INFINITY;
3568         c->personality = PERSONALITY_INVALID;
3569         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3570                 c->directories[i].mode = 0755;
3571         c->capability_bounding_set = CAP_ALL;
3572         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3573         c->log_level_max = -1;
3574 }
3575
3576 void exec_context_done(ExecContext *c) {
3577         ExecDirectoryType i;
3578         size_t l;
3579
3580         assert(c);
3581
3582         c->environment = strv_free(c->environment);
3583         c->environment_files = strv_free(c->environment_files);
3584         c->pass_environment = strv_free(c->pass_environment);
3585         c->unset_environment = strv_free(c->unset_environment);
3586
3587         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3588                 c->rlimit[l] = mfree(c->rlimit[l]);
3589
3590         for (l = 0; l < 3; l++) {
3591                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3592                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3593         }
3594
3595         c->working_directory = mfree(c->working_directory);
3596         c->root_directory = mfree(c->root_directory);
3597         c->root_image = mfree(c->root_image);
3598         c->tty_path = mfree(c->tty_path);
3599         c->syslog_identifier = mfree(c->syslog_identifier);
3600         c->user = mfree(c->user);
3601         c->group = mfree(c->group);
3602
3603         c->supplementary_groups = strv_free(c->supplementary_groups);
3604
3605         c->pam_name = mfree(c->pam_name);
3606
3607         c->read_only_paths = strv_free(c->read_only_paths);
3608         c->read_write_paths = strv_free(c->read_write_paths);
3609         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3610
3611         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3612
3613         if (c->cpuset)
3614                 CPU_FREE(c->cpuset);
3615
3616         c->utmp_id = mfree(c->utmp_id);
3617         c->selinux_context = mfree(c->selinux_context);
3618         c->apparmor_profile = mfree(c->apparmor_profile);
3619         c->smack_process_label = mfree(c->smack_process_label);
3620
3621         c->syscall_filter = hashmap_free(c->syscall_filter);
3622         c->syscall_archs = set_free(c->syscall_archs);
3623         c->address_families = set_free(c->address_families);
3624
3625         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3626                 c->directories[i].paths = strv_free(c->directories[i].paths);
3627
3628         c->log_level_max = -1;
3629
3630         exec_context_free_log_extra_fields(c);
3631
3632         c->stdin_data = mfree(c->stdin_data);
3633         c->stdin_data_size = 0;
3634 }
3635
3636 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3637         char **i;
3638
3639         assert(c);
3640
3641         if (!runtime_prefix)
3642                 return 0;
3643
3644         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3645                 _cleanup_free_ char *p;
3646
3647                 p = strjoin(runtime_prefix, "/", *i);
3648                 if (!p)
3649                         return -ENOMEM;
3650
3651                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3652                  * next. */
3653                 (void) rm_rf(p, REMOVE_ROOT);
3654         }
3655
3656         return 0;
3657 }
3658
3659 void exec_command_done(ExecCommand *c) {
3660         assert(c);
3661
3662         c->path = mfree(c->path);
3663
3664         c->argv = strv_free(c->argv);
3665 }
3666
3667 void exec_command_done_array(ExecCommand *c, unsigned n) {
3668         unsigned i;
3669
3670         for (i = 0; i < n; i++)
3671                 exec_command_done(c+i);
3672 }
3673
3674 ExecCommand* exec_command_free_list(ExecCommand *c) {
3675         ExecCommand *i;
3676
3677         while ((i = c)) {
3678                 LIST_REMOVE(command, c, i);
3679                 exec_command_done(i);
3680                 free(i);
3681         }
3682
3683         return NULL;
3684 }
3685
3686 void exec_command_free_array(ExecCommand **c, unsigned n) {
3687         unsigned i;
3688
3689         for (i = 0; i < n; i++)
3690                 c[i] = exec_command_free_list(c[i]);
3691 }
3692
3693 typedef struct InvalidEnvInfo {
3694         Unit *unit;
3695         const char *path;
3696 } InvalidEnvInfo;
3697
3698 static void invalid_env(const char *p, void *userdata) {
3699         InvalidEnvInfo *info = userdata;
3700
3701         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3702 }
3703
3704 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3705         assert(c);
3706
3707         switch (fd_index) {
3708
3709         case STDIN_FILENO:
3710                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3711                         return NULL;
3712
3713                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3714
3715         case STDOUT_FILENO:
3716                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3717                         return NULL;
3718
3719                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3720
3721         case STDERR_FILENO:
3722                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3723                         return NULL;
3724
3725                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3726
3727         default:
3728                 return NULL;
3729         }
3730 }
3731
3732 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3733         unsigned i, targets;
3734         const char* stdio_fdname[3];
3735         unsigned n_fds;
3736
3737         assert(c);
3738         assert(p);
3739
3740         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3741                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3742                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3743
3744         for (i = 0; i < 3; i++)
3745                 stdio_fdname[i] = exec_context_fdname(c, i);
3746
3747         n_fds = p->n_storage_fds + p->n_socket_fds;
3748
3749         for (i = 0; i < n_fds  && targets > 0; i++)
3750                 if (named_iofds[STDIN_FILENO] < 0 &&
3751                     c->std_input == EXEC_INPUT_NAMED_FD &&
3752                     stdio_fdname[STDIN_FILENO] &&
3753                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3754
3755                         named_iofds[STDIN_FILENO] = p->fds[i];
3756                         targets--;
3757
3758                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3759                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3760                            stdio_fdname[STDOUT_FILENO] &&
3761                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3762
3763                         named_iofds[STDOUT_FILENO] = p->fds[i];
3764                         targets--;
3765
3766                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3767                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3768                            stdio_fdname[STDERR_FILENO] &&
3769                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3770
3771                         named_iofds[STDERR_FILENO] = p->fds[i];
3772                         targets--;
3773                 }
3774
3775         return targets == 0 ? 0 : -ENOENT;
3776 }
3777
3778 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3779         char **i, **r = NULL;
3780
3781         assert(c);
3782         assert(l);
3783
3784         STRV_FOREACH(i, c->environment_files) {
3785                 char *fn;
3786                 int k;
3787                 unsigned n;
3788                 bool ignore = false;
3789                 char **p;
3790                 _cleanup_globfree_ glob_t pglob = {};
3791
3792                 fn = *i;
3793
3794                 if (fn[0] == '-') {
3795                         ignore = true;
3796                         fn++;
3797                 }
3798
3799                 if (!path_is_absolute(fn)) {
3800                         if (ignore)
3801                                 continue;
3802
3803                         strv_free(r);
3804                         return -EINVAL;
3805                 }
3806
3807                 /* Filename supports globbing, take all matching files */
3808                 k = safe_glob(fn, 0, &pglob);
3809                 if (k < 0) {
3810                         if (ignore)
3811                                 continue;
3812
3813                         strv_free(r);
3814                         return k;
3815                 }
3816
3817                 /* When we don't match anything, -ENOENT should be returned */
3818                 assert(pglob.gl_pathc > 0);
3819
3820                 for (n = 0; n < pglob.gl_pathc; n++) {
3821                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3822                         if (k < 0) {
3823                                 if (ignore)
3824                                         continue;
3825
3826                                 strv_free(r);
3827                                 return k;
3828                         }
3829                         /* Log invalid environment variables with filename */
3830                         if (p) {
3831                                 InvalidEnvInfo info = {
3832                                         .unit = unit,
3833                                         .path = pglob.gl_pathv[n]
3834                                 };
3835
3836                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3837                         }
3838
3839                         if (r == NULL)
3840                                 r = p;
3841                         else {
3842                                 char **m;
3843
3844                                 m = strv_env_merge(2, r, p);
3845                                 strv_free(r);
3846                                 strv_free(p);
3847                                 if (!m)
3848                                         return -ENOMEM;
3849
3850                                 r = m;
3851                         }
3852                 }
3853         }
3854
3855         *l = r;
3856
3857         return 0;
3858 }
3859
3860 static bool tty_may_match_dev_console(const char *tty) {
3861         _cleanup_free_ char *active = NULL;
3862         char *console;
3863
3864         if (!tty)
3865                 return true;
3866
3867         tty = skip_dev_prefix(tty);
3868
3869         /* trivial identity? */
3870         if (streq(tty, "console"))
3871                 return true;
3872
3873         console = resolve_dev_console(&active);
3874         /* if we could not resolve, assume it may */
3875         if (!console)
3876                 return true;
3877
3878         /* "tty0" means the active VC, so it may be the same sometimes */
3879         return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3880 }
3881
3882 bool exec_context_may_touch_console(ExecContext *ec) {
3883
3884         return (ec->tty_reset ||
3885                 ec->tty_vhangup ||
3886                 ec->tty_vt_disallocate ||
3887                 is_terminal_input(ec->std_input) ||
3888                 is_terminal_output(ec->std_output) ||
3889                 is_terminal_output(ec->std_error)) &&
3890                tty_may_match_dev_console(exec_context_tty_path(ec));
3891 }
3892
3893 static void strv_fprintf(FILE *f, char **l) {
3894         char **g;
3895
3896         assert(f);
3897
3898         STRV_FOREACH(g, l)
3899                 fprintf(f, " %s", *g);
3900 }
3901
3902 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3903         ExecDirectoryType dt;
3904         char **e, **d;
3905         unsigned i;
3906         int r;
3907
3908         assert(c);
3909         assert(f);
3910
3911         prefix = strempty(prefix);
3912
3913         fprintf(f,
3914                 "%sUMask: %04o\n"
3915                 "%sWorkingDirectory: %s\n"
3916                 "%sRootDirectory: %s\n"
3917                 "%sNonBlocking: %s\n"
3918                 "%sPrivateTmp: %s\n"
3919                 "%sPrivateDevices: %s\n"
3920                 "%sProtectKernelTunables: %s\n"
3921                 "%sProtectKernelModules: %s\n"
3922                 "%sProtectControlGroups: %s\n"
3923                 "%sPrivateNetwork: %s\n"
3924                 "%sPrivateUsers: %s\n"
3925                 "%sProtectHome: %s\n"
3926                 "%sProtectSystem: %s\n"
3927                 "%sMountAPIVFS: %s\n"
3928                 "%sIgnoreSIGPIPE: %s\n"
3929                 "%sMemoryDenyWriteExecute: %s\n"
3930                 "%sRestrictRealtime: %s\n"
3931                 "%sKeyringMode: %s\n",
3932                 prefix, c->umask,
3933                 prefix, c->working_directory ? c->working_directory : "/",
3934                 prefix, c->root_directory ? c->root_directory : "/",
3935                 prefix, yes_no(c->non_blocking),
3936                 prefix, yes_no(c->private_tmp),
3937                 prefix, yes_no(c->private_devices),
3938                 prefix, yes_no(c->protect_kernel_tunables),
3939                 prefix, yes_no(c->protect_kernel_modules),
3940                 prefix, yes_no(c->protect_control_groups),
3941                 prefix, yes_no(c->private_network),
3942                 prefix, yes_no(c->private_users),
3943                 prefix, protect_home_to_string(c->protect_home),
3944                 prefix, protect_system_to_string(c->protect_system),
3945                 prefix, yes_no(c->mount_apivfs),
3946                 prefix, yes_no(c->ignore_sigpipe),
3947                 prefix, yes_no(c->memory_deny_write_execute),
3948                 prefix, yes_no(c->restrict_realtime),
3949                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3950
3951         if (c->root_image)
3952                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3953
3954         STRV_FOREACH(e, c->environment)
3955                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3956
3957         STRV_FOREACH(e, c->environment_files)
3958                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3959
3960         STRV_FOREACH(e, c->pass_environment)
3961                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3962
3963         STRV_FOREACH(e, c->unset_environment)
3964                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3965
3966         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3967
3968         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3969                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3970
3971                 STRV_FOREACH(d, c->directories[dt].paths)
3972                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3973         }
3974
3975         if (c->nice_set)
3976                 fprintf(f,
3977                         "%sNice: %i\n",
3978                         prefix, c->nice);
3979
3980         if (c->oom_score_adjust_set)
3981                 fprintf(f,
3982                         "%sOOMScoreAdjust: %i\n",
3983                         prefix, c->oom_score_adjust);
3984
3985         for (i = 0; i < RLIM_NLIMITS; i++)
3986                 if (c->rlimit[i]) {
3987                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3988                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3989                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3990                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3991                 }
3992
3993         if (c->ioprio_set) {
3994                 _cleanup_free_ char *class_str = NULL;
3995
3996                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3997                 if (r >= 0)
3998                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3999
4000                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4001         }
4002
4003         if (c->cpu_sched_set) {
4004                 _cleanup_free_ char *policy_str = NULL;
4005
4006                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4007                 if (r >= 0)
4008                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4009
4010                 fprintf(f,
4011                         "%sCPUSchedulingPriority: %i\n"
4012                         "%sCPUSchedulingResetOnFork: %s\n",
4013                         prefix, c->cpu_sched_priority,
4014                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4015         }
4016
4017         if (c->cpuset) {
4018                 fprintf(f, "%sCPUAffinity:", prefix);
4019                 for (i = 0; i < c->cpuset_ncpus; i++)
4020                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4021                                 fprintf(f, " %u", i);
4022                 fputs("\n", f);
4023         }
4024
4025         if (c->timer_slack_nsec != NSEC_INFINITY)
4026                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4027
4028         fprintf(f,
4029                 "%sStandardInput: %s\n"
4030                 "%sStandardOutput: %s\n"
4031                 "%sStandardError: %s\n",
4032                 prefix, exec_input_to_string(c->std_input),
4033                 prefix, exec_output_to_string(c->std_output),
4034                 prefix, exec_output_to_string(c->std_error));
4035
4036         if (c->std_input == EXEC_INPUT_NAMED_FD)
4037                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4038         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4039                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4040         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4041                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4042
4043         if (c->std_input == EXEC_INPUT_FILE)
4044                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4045         if (c->std_output == EXEC_OUTPUT_FILE)
4046                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4047         if (c->std_error == EXEC_OUTPUT_FILE)
4048                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4049
4050         if (c->tty_path)
4051                 fprintf(f,
4052                         "%sTTYPath: %s\n"
4053                         "%sTTYReset: %s\n"
4054                         "%sTTYVHangup: %s\n"
4055                         "%sTTYVTDisallocate: %s\n",
4056                         prefix, c->tty_path,
4057                         prefix, yes_no(c->tty_reset),
4058                         prefix, yes_no(c->tty_vhangup),
4059                         prefix, yes_no(c->tty_vt_disallocate));
4060
4061         if (IN_SET(c->std_output,
4062                    EXEC_OUTPUT_SYSLOG,
4063                    EXEC_OUTPUT_KMSG,
4064                    EXEC_OUTPUT_JOURNAL,
4065                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4066                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4067                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4068             IN_SET(c->std_error,
4069                    EXEC_OUTPUT_SYSLOG,
4070                    EXEC_OUTPUT_KMSG,
4071                    EXEC_OUTPUT_JOURNAL,
4072                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4073                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4074                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4075
4076                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4077
4078                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4079                 if (r >= 0)
4080                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4081
4082                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4083                 if (r >= 0)
4084                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4085         }
4086
4087         if (c->log_level_max >= 0) {
4088                 _cleanup_free_ char *t = NULL;
4089
4090                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4091
4092                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4093         }
4094
4095         if (c->n_log_extra_fields > 0) {
4096                 size_t j;
4097
4098                 for (j = 0; j < c->n_log_extra_fields; j++) {
4099                         fprintf(f, "%sLogExtraFields: ", prefix);
4100                         fwrite(c->log_extra_fields[j].iov_base,
4101                                1, c->log_extra_fields[j].iov_len,
4102                                f);
4103                         fputc('\n', f);
4104                 }
4105         }
4106
4107         if (c->secure_bits) {
4108                 _cleanup_free_ char *str = NULL;
4109
4110                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4111                 if (r >= 0)
4112                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4113         }
4114
4115         if (c->capability_bounding_set != CAP_ALL) {
4116                 _cleanup_free_ char *str = NULL;
4117
4118                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4119                 if (r >= 0)
4120                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4121         }
4122
4123         if (c->capability_ambient_set != 0) {
4124                 _cleanup_free_ char *str = NULL;
4125
4126                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4127                 if (r >= 0)
4128                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4129         }
4130
4131         if (c->user)
4132                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4133         if (c->group)
4134                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4135
4136         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4137
4138         if (!strv_isempty(c->supplementary_groups)) {
4139                 fprintf(f, "%sSupplementaryGroups:", prefix);
4140                 strv_fprintf(f, c->supplementary_groups);
4141                 fputs("\n", f);
4142         }
4143
4144         if (c->pam_name)
4145                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4146
4147         if (strv_length(c->read_write_paths) > 0) {
4148                 fprintf(f, "%sReadWritePaths:", prefix);
4149                 strv_fprintf(f, c->read_write_paths);
4150                 fputs("\n", f);
4151         }
4152
4153         if (strv_length(c->read_only_paths) > 0) {
4154                 fprintf(f, "%sReadOnlyPaths:", prefix);
4155                 strv_fprintf(f, c->read_only_paths);
4156                 fputs("\n", f);
4157         }
4158
4159         if (strv_length(c->inaccessible_paths) > 0) {
4160                 fprintf(f, "%sInaccessiblePaths:", prefix);
4161                 strv_fprintf(f, c->inaccessible_paths);
4162                 fputs("\n", f);
4163         }
4164
4165         if (c->n_bind_mounts > 0)
4166                 for (i = 0; i < c->n_bind_mounts; i++) {
4167                         fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4168                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4169                                 c->bind_mounts[i].source,
4170                                 c->bind_mounts[i].destination,
4171                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4172                 }
4173
4174         if (c->utmp_id)
4175                 fprintf(f,
4176                         "%sUtmpIdentifier: %s\n",
4177                         prefix, c->utmp_id);
4178
4179         if (c->selinux_context)
4180                 fprintf(f,
4181                         "%sSELinuxContext: %s%s\n",
4182                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4183
4184         if (c->apparmor_profile)
4185                 fprintf(f,
4186                         "%sAppArmorProfile: %s%s\n",
4187                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4188
4189         if (c->smack_process_label)
4190                 fprintf(f,
4191                         "%sSmackProcessLabel: %s%s\n",
4192                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4193
4194         if (c->personality != PERSONALITY_INVALID)
4195                 fprintf(f,
4196                         "%sPersonality: %s\n",
4197                         prefix, strna(personality_to_string(c->personality)));
4198
4199         fprintf(f,
4200                 "%sLockPersonality: %s\n",
4201                 prefix, yes_no(c->lock_personality));
4202
4203         if (c->syscall_filter) {
4204 #if HAVE_SECCOMP
4205                 Iterator j;
4206                 void *id, *val;
4207                 bool first = true;
4208 #endif
4209
4210                 fprintf(f,
4211                         "%sSystemCallFilter: ",
4212                         prefix);
4213
4214                 if (!c->syscall_whitelist)
4215                         fputc('~', f);
4216
4217 #if HAVE_SECCOMP
4218                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4219                         _cleanup_free_ char *name = NULL;
4220                         const char *errno_name = NULL;
4221                         int num = PTR_TO_INT(val);
4222
4223                         if (first)
4224                                 first = false;
4225                         else
4226                                 fputc(' ', f);
4227
4228                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4229                         fputs(strna(name), f);
4230
4231                         if (num >= 0) {
4232                                 errno_name = errno_to_name(num);
4233                                 if (errno_name)
4234                                         fprintf(f, ":%s", errno_name);
4235                                 else
4236                                         fprintf(f, ":%d", num);
4237                         }
4238                 }
4239 #endif
4240
4241                 fputc('\n', f);
4242         }
4243
4244         if (c->syscall_archs) {
4245 #if HAVE_SECCOMP
4246                 Iterator j;
4247                 void *id;
4248 #endif
4249
4250                 fprintf(f,
4251                         "%sSystemCallArchitectures:",
4252                         prefix);
4253
4254 #if HAVE_SECCOMP
4255                 SET_FOREACH(id, c->syscall_archs, j)
4256                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4257 #endif
4258                 fputc('\n', f);
4259         }
4260
4261         if (exec_context_restrict_namespaces_set(c)) {
4262                 _cleanup_free_ char *s = NULL;
4263
4264                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4265                 if (r >= 0)
4266                         fprintf(f, "%sRestrictNamespaces: %s\n",
4267                                 prefix, s);
4268         }
4269
4270         if (c->syscall_errno > 0) {
4271                 const char *errno_name;
4272
4273                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4274
4275                 errno_name = errno_to_name(c->syscall_errno);
4276                 if (errno_name)
4277                         fprintf(f, "%s\n", errno_name);
4278                 else
4279                         fprintf(f, "%d\n", c->syscall_errno);
4280         }
4281
4282         if (c->apparmor_profile)
4283                 fprintf(f,
4284                         "%sAppArmorProfile: %s%s\n",
4285                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4286 }
4287
4288 bool exec_context_maintains_privileges(ExecContext *c) {
4289         assert(c);
4290
4291         /* Returns true if the process forked off would run under
4292          * an unchanged UID or as root. */
4293
4294         if (!c->user)
4295                 return true;
4296
4297         if (streq(c->user, "root") || streq(c->user, "0"))
4298                 return true;
4299
4300         return false;
4301 }
4302
4303 int exec_context_get_effective_ioprio(ExecContext *c) {
4304         int p;
4305
4306         assert(c);
4307
4308         if (c->ioprio_set)
4309                 return c->ioprio;
4310
4311         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4312         if (p < 0)
4313                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4314
4315         return p;
4316 }
4317
4318 void exec_context_free_log_extra_fields(ExecContext *c) {
4319         size_t l;
4320
4321         assert(c);
4322
4323         for (l = 0; l < c->n_log_extra_fields; l++)
4324                 free(c->log_extra_fields[l].iov_base);
4325         c->log_extra_fields = mfree(c->log_extra_fields);
4326         c->n_log_extra_fields = 0;
4327 }
4328
4329 void exec_status_start(ExecStatus *s, pid_t pid) {
4330         assert(s);
4331
4332         zero(*s);
4333         s->pid = pid;
4334         dual_timestamp_get(&s->start_timestamp);
4335 }
4336
4337 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4338         assert(s);
4339
4340         if (s->pid && s->pid != pid)
4341                 zero(*s);
4342
4343         s->pid = pid;
4344         dual_timestamp_get(&s->exit_timestamp);
4345
4346         s->code = code;
4347         s->status = status;
4348
4349         if (context) {
4350                 if (context->utmp_id)
4351                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4352
4353                 exec_context_tty_reset(context, NULL);
4354         }
4355 }
4356
4357 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4358         char buf[FORMAT_TIMESTAMP_MAX];
4359
4360         assert(s);
4361         assert(f);
4362
4363         if (s->pid <= 0)
4364                 return;
4365
4366         prefix = strempty(prefix);
4367
4368         fprintf(f,
4369                 "%sPID: "PID_FMT"\n",
4370                 prefix, s->pid);
4371
4372         if (dual_timestamp_is_set(&s->start_timestamp))
4373                 fprintf(f,
4374                         "%sStart Timestamp: %s\n",
4375                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4376
4377         if (dual_timestamp_is_set(&s->exit_timestamp))
4378                 fprintf(f,
4379                         "%sExit Timestamp: %s\n"
4380                         "%sExit Code: %s\n"
4381                         "%sExit Status: %i\n",
4382                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4383                         prefix, sigchld_code_to_string(s->code),
4384                         prefix, s->status);
4385 }
4386
4387 char *exec_command_line(char **argv) {
4388         size_t k;
4389         char *n, *p, **a;
4390         bool first = true;
4391
4392         assert(argv);
4393
4394         k = 1;
4395         STRV_FOREACH(a, argv)
4396                 k += strlen(*a)+3;
4397
4398         n = new(char, k);
4399         if (!n)
4400                 return NULL;
4401
4402         p = n;
4403         STRV_FOREACH(a, argv) {
4404
4405                 if (!first)
4406                         *(p++) = ' ';
4407                 else
4408                         first = false;
4409
4410                 if (strpbrk(*a, WHITESPACE)) {
4411                         *(p++) = '\'';
4412                         p = stpcpy(p, *a);
4413                         *(p++) = '\'';
4414                 } else
4415                         p = stpcpy(p, *a);
4416
4417         }
4418
4419         *p = 0;
4420
4421         /* FIXME: this doesn't really handle arguments that have
4422          * spaces and ticks in them */
4423
4424         return n;
4425 }
4426
4427 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4428         _cleanup_free_ char *cmd = NULL;
4429         const char *prefix2;
4430
4431         assert(c);
4432         assert(f);
4433
4434         prefix = strempty(prefix);
4435         prefix2 = strjoina(prefix, "\t");
4436
4437         cmd = exec_command_line(c->argv);
4438         fprintf(f,
4439                 "%sCommand Line: %s\n",
4440                 prefix, cmd ? cmd : strerror(ENOMEM));
4441
4442         exec_status_dump(&c->exec_status, f, prefix2);
4443 }
4444
4445 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4446         assert(f);
4447
4448         prefix = strempty(prefix);
4449
4450         LIST_FOREACH(command, c, c)
4451                 exec_command_dump(c, f, prefix);
4452 }
4453
4454 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4455         ExecCommand *end;
4456
4457         assert(l);
4458         assert(e);
4459
4460         if (*l) {
4461                 /* It's kind of important, that we keep the order here */
4462                 LIST_FIND_TAIL(command, *l, end);
4463                 LIST_INSERT_AFTER(command, *l, end, e);
4464         } else
4465               *l = e;
4466 }
4467
4468 int exec_command_set(ExecCommand *c, const char *path, ...) {
4469         va_list ap;
4470         char **l, *p;
4471
4472         assert(c);
4473         assert(path);
4474
4475         va_start(ap, path);
4476         l = strv_new_ap(path, ap);
4477         va_end(ap);
4478
4479         if (!l)
4480                 return -ENOMEM;
4481
4482         p = strdup(path);
4483         if (!p) {
4484                 strv_free(l);
4485                 return -ENOMEM;
4486         }
4487
4488         free(c->path);
4489         c->path = p;
4490
4491         strv_free(c->argv);
4492         c->argv = l;
4493
4494         return 0;
4495 }
4496
4497 int exec_command_append(ExecCommand *c, const char *path, ...) {
4498         _cleanup_strv_free_ char **l = NULL;
4499         va_list ap;
4500         int r;
4501
4502         assert(c);
4503         assert(path);
4504
4505         va_start(ap, path);
4506         l = strv_new_ap(path, ap);
4507         va_end(ap);
4508
4509         if (!l)
4510                 return -ENOMEM;
4511
4512         r = strv_extend_strv(&c->argv, l, false);
4513         if (r < 0)
4514                 return r;
4515
4516         return 0;
4517 }
4518
4519
4520 static int exec_runtime_allocate(ExecRuntime **rt) {
4521
4522         if (*rt)
4523                 return 0;
4524
4525         *rt = new0(ExecRuntime, 1);
4526         if (!*rt)
4527                 return -ENOMEM;
4528
4529         (*rt)->n_ref = 1;
4530         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4531
4532         return 0;
4533 }
4534
4535 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4536         int r;
4537
4538         assert(rt);
4539         assert(c);
4540         assert(id);
4541
4542         if (*rt)
4543                 return 1;
4544
4545         if (!c->private_network && !c->private_tmp)
4546                 return 0;
4547
4548         r = exec_runtime_allocate(rt);
4549         if (r < 0)
4550                 return r;
4551
4552         if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4553                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4554                         return -errno;
4555         }
4556
4557         if (c->private_tmp && !(*rt)->tmp_dir) {
4558                 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4559                 if (r < 0)
4560                         return r;
4561         }
4562
4563         return 1;
4564 }
4565
4566 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4567         assert(r);
4568         assert(r->n_ref > 0);
4569
4570         r->n_ref++;
4571         return r;
4572 }
4573
4574 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4575
4576         if (!r)
4577                 return NULL;
4578
4579         assert(r->n_ref > 0);
4580
4581         r->n_ref--;
4582         if (r->n_ref > 0)
4583                 return NULL;
4584
4585         free(r->tmp_dir);
4586         free(r->var_tmp_dir);
4587         safe_close_pair(r->netns_storage_socket);
4588         return mfree(r);
4589 }
4590
4591 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4592         assert(u);
4593         assert(f);
4594         assert(fds);
4595
4596         if (!rt)
4597                 return 0;
4598
4599         if (rt->tmp_dir)
4600                 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4601
4602         if (rt->var_tmp_dir)
4603                 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4604
4605         if (rt->netns_storage_socket[0] >= 0) {
4606                 int copy;
4607
4608                 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4609                 if (copy < 0)
4610                         return copy;
4611
4612                 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4613         }
4614
4615         if (rt->netns_storage_socket[1] >= 0) {
4616                 int copy;
4617
4618                 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4619                 if (copy < 0)
4620                         return copy;
4621
4622                 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4623         }
4624
4625         return 0;
4626 }
4627
4628 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4629         int r;
4630
4631         assert(rt);
4632         assert(key);
4633         assert(value);
4634
4635         if (streq(key, "tmp-dir")) {
4636                 char *copy;
4637
4638                 r = exec_runtime_allocate(rt);
4639                 if (r < 0)
4640                         return log_oom();
4641
4642                 copy = strdup(value);
4643                 if (!copy)
4644                         return log_oom();
4645
4646                 free((*rt)->tmp_dir);
4647                 (*rt)->tmp_dir = copy;
4648
4649         } else if (streq(key, "var-tmp-dir")) {
4650                 char *copy;
4651
4652                 r = exec_runtime_allocate(rt);
4653                 if (r < 0)
4654                         return log_oom();
4655
4656                 copy = strdup(value);
4657                 if (!copy)
4658                         return log_oom();
4659
4660                 free((*rt)->var_tmp_dir);
4661                 (*rt)->var_tmp_dir = copy;
4662
4663         } else if (streq(key, "netns-socket-0")) {
4664                 int fd;
4665
4666                 r = exec_runtime_allocate(rt);
4667                 if (r < 0)
4668                         return log_oom();
4669
4670                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4671                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4672                 else {
4673                         safe_close((*rt)->netns_storage_socket[0]);
4674                         (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4675                 }
4676         } else if (streq(key, "netns-socket-1")) {
4677                 int fd;
4678
4679                 r = exec_runtime_allocate(rt);
4680                 if (r < 0)
4681                         return log_oom();
4682
4683                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4684                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4685                 else {
4686                         safe_close((*rt)->netns_storage_socket[1]);
4687                         (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4688                 }
4689         } else
4690                 return 0;
4691
4692         return 1;
4693 }
4694
4695 static void *remove_tmpdir_thread(void *p) {
4696         _cleanup_free_ char *path = p;
4697
4698         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4699         return NULL;
4700 }
4701
4702 void exec_runtime_destroy(ExecRuntime *rt) {
4703         int r;
4704
4705         if (!rt)
4706                 return;
4707
4708         /* If there are multiple users of this, let's leave the stuff around */
4709         if (rt->n_ref > 1)
4710                 return;
4711
4712         if (rt->tmp_dir) {
4713                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4714
4715                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4716                 if (r < 0) {
4717                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4718                         free(rt->tmp_dir);
4719                 }
4720
4721                 rt->tmp_dir = NULL;
4722         }
4723
4724         if (rt->var_tmp_dir) {
4725                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4726
4727                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4728                 if (r < 0) {
4729                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4730                         free(rt->var_tmp_dir);
4731                 }
4732
4733                 rt->var_tmp_dir = NULL;
4734         }
4735
4736         safe_close_pair(rt->netns_storage_socket);
4737 }
4738
4739 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4740         [EXEC_INPUT_NULL] = "null",
4741         [EXEC_INPUT_TTY] = "tty",
4742         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4743         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4744         [EXEC_INPUT_SOCKET] = "socket",
4745         [EXEC_INPUT_NAMED_FD] = "fd",
4746         [EXEC_INPUT_DATA] = "data",
4747         [EXEC_INPUT_FILE] = "file",
4748 };
4749
4750 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4751
4752 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4753         [EXEC_OUTPUT_INHERIT] = "inherit",
4754         [EXEC_OUTPUT_NULL] = "null",
4755         [EXEC_OUTPUT_TTY] = "tty",
4756         [EXEC_OUTPUT_SYSLOG] = "syslog",
4757         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4758         [EXEC_OUTPUT_KMSG] = "kmsg",
4759         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4760         [EXEC_OUTPUT_JOURNAL] = "journal",
4761         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4762         [EXEC_OUTPUT_SOCKET] = "socket",
4763         [EXEC_OUTPUT_NAMED_FD] = "fd",
4764         [EXEC_OUTPUT_FILE] = "file",
4765 };
4766
4767 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4768
4769 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4770         [EXEC_UTMP_INIT] = "init",
4771         [EXEC_UTMP_LOGIN] = "login",
4772         [EXEC_UTMP_USER] = "user",
4773 };
4774
4775 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4776
4777 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4778         [EXEC_PRESERVE_NO] = "no",
4779         [EXEC_PRESERVE_YES] = "yes",
4780         [EXEC_PRESERVE_RESTART] = "restart",
4781 };
4782
4783 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4784
4785 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4786         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4787         [EXEC_DIRECTORY_STATE] = "StateDirectory",
4788         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4789         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4790         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4791 };
4792
4793 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4794
4795 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4796         [EXEC_KEYRING_INHERIT] = "inherit",
4797         [EXEC_KEYRING_PRIVATE] = "private",
4798         [EXEC_KEYRING_SHARED] = "shared",
4799 };
4800
4801 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);