src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <errno.h>
  22 #include <fcntl.h>
  23 #include <glob.h>
  24 #include <grp.h>
  25 #include <poll.h>
  26 #include <signal.h>
  27 #include <string.h>
  28 #include <sys/capability.h>
  29 #include <sys/eventfd.h>
  30 #include <sys/mman.h>
  31 #include <sys/personality.h>
  32 #include <sys/prctl.h>
  33 #include <sys/shm.h>
  34 #include <sys/socket.h>
  35 #include <sys/stat.h>
  36 #include <sys/types.h>
  37 #include <sys/un.h>
  38 #include <unistd.h>
  39 #include <utmpx.h>
  40
  41 #if HAVE_PAM
  42 #include <security/pam_appl.h>
  43 #endif
  44
  45 #if HAVE_SELINUX
  46 #include <selinux/selinux.h>
  47 #endif
  48
  49 #if HAVE_SECCOMP
  50 #include <seccomp.h>
  51 #endif
  52
  53 #if HAVE_APPARMOR
  54 #include <sys/apparmor.h>
  55 #endif
  56
  57 #include "sd-messages.h"
  58
  59 #include "af-list.h"
  60 #include "alloc-util.h"
  61 #if HAVE_APPARMOR
  62 #include "apparmor-util.h"
  63 #endif
  64 #include "async.h"
  65 #include "barrier.h"
  66 #include "cap-list.h"
  67 #include "capability-util.h"
  68 #include "chown-recursive.h"
  69 #include "cpu-set-util.h"
  70 #include "def.h"
  71 #include "env-util.h"
  72 #include "errno-list.h"
  73 #include "execute.h"
  74 #include "exit-status.h"
  75 #include "fd-util.h"
  76 #include "fileio.h"
  77 #include "format-util.h"
  78 #include "fs-util.h"
  79 #include "glob-util.h"
  80 #include "io-util.h"
  81 #include "ioprio.h"
  82 #include "label.h"
  83 #include "log.h"
  84 #include "macro.h"
  85 #include "manager.h"
  86 #include "missing.h"
  87 #include "mkdir.h"
  88 #include "namespace.h"
  89 #include "parse-util.h"
  90 #include "path-util.h"
  91 #include "process-util.h"
  92 #include "rlimit-util.h"
  93 #include "rm-rf.h"
  94 #if HAVE_SECCOMP
  95 #include "seccomp-util.h"
  96 #endif
  97 #include "securebits.h"
  98 #include "securebits-util.h"
  99 #include "selinux-util.h"
 100 #include "signal-util.h"
 101 #include "smack-util.h"
 102 #include "special.h"
 103 #include "stat-util.h"
 104 #include "string-table.h"
 105 #include "string-util.h"
 106 #include "strv.h"
 107 #include "syslog-util.h"
 108 #include "terminal-util.h"
 109 #include "unit.h"
 110 #include "user-util.h"
 111 #include "util.h"
 112 #include "utmp-wtmp.h"
 113
 114 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 115 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 116
 117 /* This assumes there is a 'tty' group */
 118 #define TTY_MODE 0620
 119
 120 #define SNDBUF_SIZE (8*1024*1024)
 121
 122 static int shift_fds(int fds[], unsigned n_fds) {
 123         int start, restart_from;
 124
 125         if (n_fds <= 0)
 126                 return 0;
 127
 128         /* Modifies the fds array! (sorts it) */
 129
 130         assert(fds);
 131
 132         start = 0;
 133         for (;;) {
 134                 int i;
 135
 136                 restart_from = -1;
 137
 138                 for (i = start; i < (int) n_fds; i++) {
 139                         int nfd;
 140
 141                         /* Already at right index? */
 142                         if (fds[i] == i+3)
 143                                 continue;
 144
 145                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 146                         if (nfd < 0)
 147                                 return -errno;
 148
 149                         safe_close(fds[i]);
 150                         fds[i] = nfd;
 151
 152                         /* Hmm, the fd we wanted isn't free? Then
 153                          * let's remember that and try again from here */
 154                         if (nfd != i+3 && restart_from < 0)
 155                                 restart_from = i;
 156                 }
 157
 158                 if (restart_from < 0)
 159                         break;
 160
 161                 start = restart_from;
 162         }
 163
 164         return 0;
 165 }
 166
 167 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 168         unsigned i, n_fds;
 169         int r;
 170
 171         n_fds = n_storage_fds + n_socket_fds;
 172         if (n_fds <= 0)
 173                 return 0;
 174
 175         assert(fds);
 176
 177         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 178          * O_NONBLOCK only applies to socket activation though. */
 179
 180         for (i = 0; i < n_fds; i++) {
 181
 182                 if (i < n_socket_fds) {
 183                         r = fd_nonblock(fds[i], nonblock);
 184                         if (r < 0)
 185                                 return r;
 186                 }
 187
 188                 /* We unconditionally drop FD_CLOEXEC from the fds,
 189                  * since after all we want to pass these fds to our
 190                  * children */
 191
 192                 r = fd_cloexec(fds[i], false);
 193                 if (r < 0)
 194                         return r;
 195         }
 196
 197         return 0;
 198 }
 199
 200 static const char *exec_context_tty_path(const ExecContext *context) {
 201         assert(context);
 202
 203         if (context->stdio_as_fds)
 204                 return NULL;
 205
 206         if (context->tty_path)
 207                 return context->tty_path;
 208
 209         return "/dev/console";
 210 }
 211
 212 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 213         const char *path;
 214
 215         assert(context);
 216
 217         path = exec_context_tty_path(context);
 218
 219         if (context->tty_vhangup) {
 220                 if (p && p->stdin_fd >= 0)
 221                         (void) terminal_vhangup_fd(p->stdin_fd);
 222                 else if (path)
 223                         (void) terminal_vhangup(path);
 224         }
 225
 226         if (context->tty_reset) {
 227                 if (p && p->stdin_fd >= 0)
 228                         (void) reset_terminal_fd(p->stdin_fd, true);
 229                 else if (path)
 230                         (void) reset_terminal(path);
 231         }
 232
 233         if (context->tty_vt_disallocate && path)
 234                 (void) vt_disallocate(path);
 235 }
 236
 237 static bool is_terminal_input(ExecInput i) {
 238         return IN_SET(i,
 239                       EXEC_INPUT_TTY,
 240                       EXEC_INPUT_TTY_FORCE,
 241                       EXEC_INPUT_TTY_FAIL);
 242 }
 243
 244 static bool is_terminal_output(ExecOutput o) {
 245         return IN_SET(o,
 246                       EXEC_OUTPUT_TTY,
 247                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 248                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 249                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 250 }
 251
 252 static bool is_syslog_output(ExecOutput o) {
 253         return IN_SET(o,
 254                       EXEC_OUTPUT_SYSLOG,
 255                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 256 }
 257
 258 static bool is_kmsg_output(ExecOutput o) {
 259         return IN_SET(o,
 260                       EXEC_OUTPUT_KMSG,
 261                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 262 }
 263
 264 static bool exec_context_needs_term(const ExecContext *c) {
 265         assert(c);
 266
 267         /* Return true if the execution context suggests we should set $TERM to something useful. */
 268
 269         if (is_terminal_input(c->std_input))
 270                 return true;
 271
 272         if (is_terminal_output(c->std_output))
 273                 return true;
 274
 275         if (is_terminal_output(c->std_error))
 276                 return true;
 277
 278         return !!c->tty_path;
 279 }
 280
 281 static int open_null_as(int flags, int nfd) {
 282         int fd;
 283
 284         assert(nfd >= 0);
 285
 286         fd = open("/dev/null", flags|O_NOCTTY);
 287         if (fd < 0)
 288                 return -errno;
 289
 290         return move_fd(fd, nfd, false);
 291 }
 292
 293 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 294         static const union sockaddr_union sa = {
 295                 .un.sun_family = AF_UNIX,
 296                 .un.sun_path = "/run/systemd/journal/stdout",
 297         };
 298         uid_t olduid = UID_INVALID;
 299         gid_t oldgid = GID_INVALID;
 300         int r;
 301
 302         if (gid_is_valid(gid)) {
 303                 oldgid = getgid();
 304
 305                 if (setegid(gid) < 0)
 306                         return -errno;
 307         }
 308
 309         if (uid_is_valid(uid)) {
 310                 olduid = getuid();
 311
 312                 if (seteuid(uid) < 0) {
 313                         r = -errno;
 314                         goto restore_gid;
 315                 }
 316         }
 317
 318         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 319
 320         /* If we fail to restore the uid or gid, things will likely
 321            fail later on. This should only happen if an LSM interferes. */
 322
 323         if (uid_is_valid(uid))
 324                 (void) seteuid(olduid);
 325
 326  restore_gid:
 327         if (gid_is_valid(gid))
 328                 (void) setegid(oldgid);
 329
 330         return r;
 331 }
 332
 333 static int connect_logger_as(
 334                 const Unit *unit,
 335                 const ExecContext *context,
 336                 const ExecParameters *params,
 337                 ExecOutput output,
 338                 const char *ident,
 339                 int nfd,
 340                 uid_t uid,
 341                 gid_t gid) {
 342
 343         int fd, r;
 344
 345         assert(context);
 346         assert(params);
 347         assert(output < _EXEC_OUTPUT_MAX);
 348         assert(ident);
 349         assert(nfd >= 0);
 350
 351         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 352         if (fd < 0)
 353                 return -errno;
 354
 355         r = connect_journal_socket(fd, uid, gid);
 356         if (r < 0)
 357                 return r;
 358
 359         if (shutdown(fd, SHUT_RD) < 0) {
 360                 safe_close(fd);
 361                 return -errno;
 362         }
 363
 364         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 365
 366         dprintf(fd,
 367                 "%s\n"
 368                 "%s\n"
 369                 "%i\n"
 370                 "%i\n"
 371                 "%i\n"
 372                 "%i\n"
 373                 "%i\n",
 374                 context->syslog_identifier ?: ident,
 375                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 376                 context->syslog_priority,
 377                 !!context->syslog_level_prefix,
 378                 is_syslog_output(output),
 379                 is_kmsg_output(output),
 380                 is_terminal_output(output));
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384 static int open_terminal_as(const char *path, int flags, int nfd) {
 385         int fd;
 386
 387         assert(path);
 388         assert(nfd >= 0);
 389
 390         fd = open_terminal(path, flags | O_NOCTTY);
 391         if (fd < 0)
 392                 return fd;
 393
 394         return move_fd(fd, nfd, false);
 395 }
 396
 397 static int acquire_path(const char *path, int flags, mode_t mode) {
 398         union sockaddr_union sa = {
 399                 .sa.sa_family = AF_UNIX,
 400         };
 401         int fd, r;
 402
 403         assert(path);
 404
 405         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 406                 flags |= O_CREAT;
 407
 408         fd = open(path, flags|O_NOCTTY, mode);
 409         if (fd >= 0)
 410                 return fd;
 411
 412         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 413                 return -errno;
 414         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 415                 return -ENXIO;
 416
 417         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 418
 419         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 420         if (fd < 0)
 421                 return -errno;
 422
 423         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 424         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 425                 safe_close(fd);
 426                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 427                                                            * indication that his wasn't an AF_UNIX socket after all */
 428         }
 429
 430         if ((flags & O_ACCMODE) == O_RDONLY)
 431                 r = shutdown(fd, SHUT_WR);
 432         else if ((flags & O_ACCMODE) == O_WRONLY)
 433                 r = shutdown(fd, SHUT_RD);
 434         else
 435                 return fd;
 436         if (r < 0) {
 437                 safe_close(fd);
 438                 return -errno;
 439         }
 440
 441         return fd;
 442 }
 443
 444 static int fixup_input(
 445                 const ExecContext *context,
 446                 int socket_fd,
 447                 bool apply_tty_stdin) {
 448
 449         ExecInput std_input;
 450
 451         assert(context);
 452
 453         std_input = context->std_input;
 454
 455         if (is_terminal_input(std_input) && !apply_tty_stdin)
 456                 return EXEC_INPUT_NULL;
 457
 458         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 459                 return EXEC_INPUT_NULL;
 460
 461         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 462                 return EXEC_INPUT_NULL;
 463
 464         return std_input;
 465 }
 466
 467 static int fixup_output(ExecOutput std_output, int socket_fd) {
 468
 469         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 470                 return EXEC_OUTPUT_INHERIT;
 471
 472         return std_output;
 473 }
 474
 475 static int setup_input(
 476                 const ExecContext *context,
 477                 const ExecParameters *params,
 478                 int socket_fd,
 479                 int named_iofds[3]) {
 480
 481         ExecInput i;
 482
 483         assert(context);
 484         assert(params);
 485
 486         if (params->stdin_fd >= 0) {
 487                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 488                         return -errno;
 489
 490                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 491                 if (isatty(STDIN_FILENO)) {
 492                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 493                         (void) reset_terminal_fd(STDIN_FILENO, true);
 494                 }
 495
 496                 return STDIN_FILENO;
 497         }
 498
 499         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 500
 501         switch (i) {
 502
 503         case EXEC_INPUT_NULL:
 504                 return open_null_as(O_RDONLY, STDIN_FILENO);
 505
 506         case EXEC_INPUT_TTY:
 507         case EXEC_INPUT_TTY_FORCE:
 508         case EXEC_INPUT_TTY_FAIL: {
 509                 int fd;
 510
 511                 fd = acquire_terminal(exec_context_tty_path(context),
 512                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 513                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 514                                                                   ACQUIRE_TERMINAL_WAIT,
 515                                       USEC_INFINITY);
 516                 if (fd < 0)
 517                         return fd;
 518
 519                 return move_fd(fd, STDIN_FILENO, false);
 520         }
 521
 522         case EXEC_INPUT_SOCKET:
 523                 assert(socket_fd >= 0);
 524
 525                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 526
 527         case EXEC_INPUT_NAMED_FD:
 528                 assert(named_iofds[STDIN_FILENO] >= 0);
 529
 530                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 531                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 532
 533         case EXEC_INPUT_DATA: {
 534                 int fd;
 535
 536                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 537                 if (fd < 0)
 538                         return fd;
 539
 540                 return move_fd(fd, STDIN_FILENO, false);
 541         }
 542
 543         case EXEC_INPUT_FILE: {
 544                 bool rw;
 545                 int fd;
 546
 547                 assert(context->stdio_file[STDIN_FILENO]);
 548
 549                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 550                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 551
 552                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 553                 if (fd < 0)
 554                         return fd;
 555
 556                 return move_fd(fd, STDIN_FILENO, false);
 557         }
 558
 559         default:
 560                 assert_not_reached("Unknown input type");
 561         }
 562 }
 563
 564 static int setup_output(
 565                 const Unit *unit,
 566                 const ExecContext *context,
 567                 const ExecParameters *params,
 568                 int fileno,
 569                 int socket_fd,
 570                 int named_iofds[3],
 571                 const char *ident,
 572                 uid_t uid,
 573                 gid_t gid,
 574                 dev_t *journal_stream_dev,
 575                 ino_t *journal_stream_ino) {
 576
 577         ExecOutput o;
 578         ExecInput i;
 579         int r;
 580
 581         assert(unit);
 582         assert(context);
 583         assert(params);
 584         assert(ident);
 585         assert(journal_stream_dev);
 586         assert(journal_stream_ino);
 587
 588         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 589
 590                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 591                         return -errno;
 592
 593                 return STDOUT_FILENO;
 594         }
 595
 596         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 597                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 598                         return -errno;
 599
 600                 return STDERR_FILENO;
 601         }
 602
 603         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 604         o = fixup_output(context->std_output, socket_fd);
 605
 606         if (fileno == STDERR_FILENO) {
 607                 ExecOutput e;
 608                 e = fixup_output(context->std_error, socket_fd);
 609
 610                 /* This expects the input and output are already set up */
 611
 612                 /* Don't change the stderr file descriptor if we inherit all
 613                  * the way and are not on a tty */
 614                 if (e == EXEC_OUTPUT_INHERIT &&
 615                     o == EXEC_OUTPUT_INHERIT &&
 616                     i == EXEC_INPUT_NULL &&
 617                     !is_terminal_input(context->std_input) &&
 618                     getppid () != 1)
 619                         return fileno;
 620
 621                 /* Duplicate from stdout if possible */
 622                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 623                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 624
 625                 o = e;
 626
 627         } else if (o == EXEC_OUTPUT_INHERIT) {
 628                 /* If input got downgraded, inherit the original value */
 629                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 630                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 631
 632                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 633                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 634                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 635
 636                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 637                 if (getppid() != 1)
 638                         return fileno;
 639
 640                 /* We need to open /dev/null here anew, to get the right access mode. */
 641                 return open_null_as(O_WRONLY, fileno);
 642         }
 643
 644         switch (o) {
 645
 646         case EXEC_OUTPUT_NULL:
 647                 return open_null_as(O_WRONLY, fileno);
 648
 649         case EXEC_OUTPUT_TTY:
 650                 if (is_terminal_input(i))
 651                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 652
 653                 /* We don't reset the terminal if this is just about output */
 654                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 655
 656         case EXEC_OUTPUT_SYSLOG:
 657         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 658         case EXEC_OUTPUT_KMSG:
 659         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 660         case EXEC_OUTPUT_JOURNAL:
 661         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 662                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 663                 if (r < 0) {
 664                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 665                         r = open_null_as(O_WRONLY, fileno);
 666                 } else {
 667                         struct stat st;
 668
 669                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 670                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 671                          * services to detect whether they are connected to the journal or not.
 672                          *
 673                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 674                          * about STDERR as that's usually the best way to do logging. */
 675
 676                         if (fstat(fileno, &st) >= 0 &&
 677                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 678                                 *journal_stream_dev = st.st_dev;
 679                                 *journal_stream_ino = st.st_ino;
 680                         }
 681                 }
 682                 return r;
 683
 684         case EXEC_OUTPUT_SOCKET:
 685                 assert(socket_fd >= 0);
 686
 687                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 688
 689         case EXEC_OUTPUT_NAMED_FD:
 690                 assert(named_iofds[fileno] >= 0);
 691
 692                 (void) fd_nonblock(named_iofds[fileno], false);
 693                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 694
 695         case EXEC_OUTPUT_FILE: {
 696                 bool rw;
 697                 int fd;
 698
 699                 assert(context->stdio_file[fileno]);
 700
 701                 rw = context->std_input == EXEC_INPUT_FILE &&
 702                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 703
 704                 if (rw)
 705                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 706
 707                 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
 708                 if (fd < 0)
 709                         return fd;
 710
 711                 return move_fd(fd, fileno, false);
 712         }
 713
 714         default:
 715                 assert_not_reached("Unknown error type");
 716         }
 717 }
 718
 719 static int chown_terminal(int fd, uid_t uid) {
 720         struct stat st;
 721
 722         assert(fd >= 0);
 723
 724         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 725         if (isatty(fd) < 1)
 726                 return 0;
 727
 728         /* This might fail. What matters are the results. */
 729         (void) fchown(fd, uid, -1);
 730         (void) fchmod(fd, TTY_MODE);
 731
 732         if (fstat(fd, &st) < 0)
 733                 return -errno;
 734
 735         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 736                 return -EPERM;
 737
 738         return 0;
 739 }
 740
 741 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 742         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 743         int r;
 744
 745         assert(_saved_stdin);
 746         assert(_saved_stdout);
 747
 748         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 749         if (saved_stdin < 0)
 750                 return -errno;
 751
 752         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 753         if (saved_stdout < 0)
 754                 return -errno;
 755
 756         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 757         if (fd < 0)
 758                 return fd;
 759
 760         r = chown_terminal(fd, getuid());
 761         if (r < 0)
 762                 return r;
 763
 764         r = reset_terminal_fd(fd, true);
 765         if (r < 0)
 766                 return r;
 767
 768         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 769         fd = -1;
 770         if (r < 0)
 771                 return r;
 772
 773         *_saved_stdin = saved_stdin;
 774         *_saved_stdout = saved_stdout;
 775
 776         saved_stdin = saved_stdout = -1;
 777
 778         return 0;
 779 }
 780
 781 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 782         assert(err < 0);
 783
 784         if (err == -ETIMEDOUT)
 785                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 786         else {
 787                 errno = -err;
 788                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 789         }
 790 }
 791
 792 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 793         _cleanup_close_ int fd = -1;
 794
 795         assert(vc);
 796
 797         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 798         if (fd < 0)
 799                 return;
 800
 801         write_confirm_error_fd(err, fd, u);
 802 }
 803
 804 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 805         int r = 0;
 806
 807         assert(saved_stdin);
 808         assert(saved_stdout);
 809
 810         release_terminal();
 811
 812         if (*saved_stdin >= 0)
 813                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 814                         r = -errno;
 815
 816         if (*saved_stdout >= 0)
 817                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 818                         r = -errno;
 819
 820         *saved_stdin = safe_close(*saved_stdin);
 821         *saved_stdout = safe_close(*saved_stdout);
 822
 823         return r;
 824 }
 825
 826 enum {
 827         CONFIRM_PRETEND_FAILURE = -1,
 828         CONFIRM_PRETEND_SUCCESS =  0,
 829         CONFIRM_EXECUTE = 1,
 830 };
 831
 832 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 833         int saved_stdout = -1, saved_stdin = -1, r;
 834         _cleanup_free_ char *e = NULL;
 835         char c;
 836
 837         /* For any internal errors, assume a positive response. */
 838         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 839         if (r < 0) {
 840                 write_confirm_error(r, vc, u);
 841                 return CONFIRM_EXECUTE;
 842         }
 843
 844         /* confirm_spawn might have been disabled while we were sleeping. */
 845         if (manager_is_confirm_spawn_disabled(u->manager)) {
 846                 r = 1;
 847                 goto restore_stdio;
 848         }
 849
 850         e = ellipsize(cmdline, 60, 100);
 851         if (!e) {
 852                 log_oom();
 853                 r = CONFIRM_EXECUTE;
 854                 goto restore_stdio;
 855         }
 856
 857         for (;;) {
 858                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 859                 if (r < 0) {
 860                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 861                         r = CONFIRM_EXECUTE;
 862                         goto restore_stdio;
 863                 }
 864
 865                 switch (c) {
 866                 case 'c':
 867                         printf("Resuming normal execution.\n");
 868                         manager_disable_confirm_spawn();
 869                         r = 1;
 870                         break;
 871                 case 'D':
 872                         unit_dump(u, stdout, "  ");
 873                         continue; /* ask again */
 874                 case 'f':
 875                         printf("Failing execution.\n");
 876                         r = CONFIRM_PRETEND_FAILURE;
 877                         break;
 878                 case 'h':
 879                         printf("  c - continue, proceed without asking anymore\n"
 880                                "  D - dump, show the state of the unit\n"
 881                                "  f - fail, don't execute the command and pretend it failed\n"
 882                                "  h - help\n"
 883                                "  i - info, show a short summary of the unit\n"
 884                                "  j - jobs, show jobs that are in progress\n"
 885                                "  s - skip, don't execute the command and pretend it succeeded\n"
 886                                "  y - yes, execute the command\n");
 887                         continue; /* ask again */
 888                 case 'i':
 889                         printf("  Description: %s\n"
 890                                "  Unit:        %s\n"
 891                                "  Command:     %s\n",
 892                                u->id, u->description, cmdline);
 893                         continue; /* ask again */
 894                 case 'j':
 895                         manager_dump_jobs(u->manager, stdout, "  ");
 896                         continue; /* ask again */
 897                 case 'n':
 898                         /* 'n' was removed in favor of 'f'. */
 899                         printf("Didn't understand 'n', did you mean 'f'?\n");
 900                         continue; /* ask again */
 901                 case 's':
 902                         printf("Skipping execution.\n");
 903                         r = CONFIRM_PRETEND_SUCCESS;
 904                         break;
 905                 case 'y':
 906                         r = CONFIRM_EXECUTE;
 907                         break;
 908                 default:
 909                         assert_not_reached("Unhandled choice");
 910                 }
 911                 break;
 912         }
 913
 914 restore_stdio:
 915         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 916         return r;
 917 }
 918
 919 static int get_fixed_user(const ExecContext *c, const char **user,
 920                           uid_t *uid, gid_t *gid,
 921                           const char **home, const char **shell) {
 922         int r;
 923         const char *name;
 924
 925         assert(c);
 926
 927         if (!c->user)
 928                 return 0;
 929
 930         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 931          * (i.e. are "/" or "/bin/nologin"). */
 932
 933         name = c->user;
 934         r = get_user_creds_clean(&name, uid, gid, home, shell);
 935         if (r < 0)
 936                 return r;
 937
 938         *user = name;
 939         return 0;
 940 }
 941
 942 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 943         int r;
 944         const char *name;
 945
 946         assert(c);
 947
 948         if (!c->group)
 949                 return 0;
 950
 951         name = c->group;
 952         r = get_group_creds(&name, gid);
 953         if (r < 0)
 954                 return r;
 955
 956         *group = name;
 957         return 0;
 958 }
 959
 960 static int get_supplementary_groups(const ExecContext *c, const char *user,
 961                                     const char *group, gid_t gid,
 962                                     gid_t **supplementary_gids, int *ngids) {
 963         char **i;
 964         int r, k = 0;
 965         int ngroups_max;
 966         bool keep_groups = false;
 967         gid_t *groups = NULL;
 968         _cleanup_free_ gid_t *l_gids = NULL;
 969
 970         assert(c);
 971
 972         /*
 973          * If user is given, then lookup GID and supplementary groups list.
 974          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 975          * here and as early as possible so we keep the list of supplementary
 976          * groups of the caller.
 977          */
 978         if (user && gid_is_valid(gid) && gid != 0) {
 979                 /* First step, initialize groups from /etc/groups */
 980                 if (initgroups(user, gid) < 0)
 981                         return -errno;
 982
 983                 keep_groups = true;
 984         }
 985
 986         if (strv_isempty(c->supplementary_groups))
 987                 return 0;
 988
 989         /*
 990          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 991          * be positive, otherwise fail.
 992          */
 993         errno = 0;
 994         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 995         if (ngroups_max <= 0) {
 996                 if (errno > 0)
 997                         return -errno;
 998                 else
 999                         return -EOPNOTSUPP; /* For all other values */
1000         }
1001
1002         l_gids = new(gid_t, ngroups_max);
1003         if (!l_gids)
1004                 return -ENOMEM;
1005
1006         if (keep_groups) {
1007                 /*
1008                  * Lookup the list of groups that the user belongs to, we
1009                  * avoid NSS lookups here too for gid=0.
1010                  */
1011                 k = ngroups_max;
1012                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1013                         return -EINVAL;
1014         } else
1015                 k = 0;
1016
1017         STRV_FOREACH(i, c->supplementary_groups) {
1018                 const char *g;
1019
1020                 if (k >= ngroups_max)
1021                         return -E2BIG;
1022
1023                 g = *i;
1024                 r = get_group_creds(&g, l_gids+k);
1025                 if (r < 0)
1026                         return r;
1027
1028                 k++;
1029         }
1030
1031         /*
1032          * Sets ngids to zero to drop all supplementary groups, happens
1033          * when we are under root and SupplementaryGroups= is empty.
1034          */
1035         if (k == 0) {
1036                 *ngids = 0;
1037                 return 0;
1038         }
1039
1040         /* Otherwise get the final list of supplementary groups */
1041         groups = memdup(l_gids, sizeof(gid_t) * k);
1042         if (!groups)
1043                 return -ENOMEM;
1044
1045         *supplementary_gids = groups;
1046         *ngids = k;
1047
1048         groups = NULL;
1049
1050         return 0;
1051 }
1052
/* Applies the supplementary group list (if ngids > 0) and then, if gid is valid, switches the real,
 * effective and saved GID to it. Returns 0 on success or a negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= handling, only if there is anything to set */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Now switch all three GIDs */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1071
1072 static int enforce_user(const ExecContext *context, uid_t uid) {
1073         assert(context);
1074
1075         if (!uid_is_valid(uid))
1076                 return 0;
1077
1078         /* Sets (but doesn't look up) the uid and make sure we keep the
1079          * capabilities while doing so. */
1080
1081         if (context->capability_ambient_set != 0) {
1082
1083                 /* First step: If we need to keep capabilities but
1084                  * drop privileges we need to make sure we keep our
1085                  * caps, while we drop privileges. */
1086                 if (uid != 0) {
1087                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1088
1089                         if (prctl(PR_GET_SECUREBITS) != sb)
1090                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1091                                         return -errno;
1092                 }
1093         }
1094
1095         /* Second step: actually set the uids */
1096         if (setresuid(uid, uid, uid) < 0)
1097                 return -errno;
1098
1099         /* At this point we should have all necessary capabilities but
1100            are otherwise a normal user. However, the caps might got
1101            corrupted due to the setresuid() so we need clean them up
1102            later. This is done outside of this call. */
1103
1104         return 0;
1105 }
1106
1107 #if HAVE_PAM
1108
1109 static int null_conv(
1110                 int num_msg,
1111                 const struct pam_message **msg,
1112                 struct pam_response **resp,
1113                 void *appdata_ptr) {
1114
1115         /* We don't support conversations */
1116
1117         return PAM_CONV_ERR;
1118 }
1119
1120 #endif
1121
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process has died.
 *
 * name, user: PAM service name and user name (must not be NULL).
 * uid, gid:   credentials the helper process drops to.
 * tty:        optional TTY to register as PAM_TTY.
 * env:        in: environment exported to PAM; out (on success): replaced by the environment list
 *             returned by PAM, the previous list is freed.
 * fds, n_fds: file descriptors that are closed in the helper process.
 *
 * Returns 0 on success, and also when built without PAM support. On failure returns a negative
 * errno-style value; PAM-level errors are reported as -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number; it neither returns a negative value nor sets errno. Hence
                                 * check the return value directly, so that 'sig' is only looked at
                                 * when it was actually filled in. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment PAM gave us to the caller, replacing the old one */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1330
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * 'path'. At most the final 8 characters of that component are used so that the result fits in 10
 * characters (i.e. the length of "/sbin/init") and looks pretty in /bin/ps. If the component is
 * empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 chars + ')' + NUL */
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* Prefer the tail of the name: the beginning is frequently just "systemd-" */
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1361
1362 static bool context_has_address_families(const ExecContext *c) {
1363         assert(c);
1364
1365         return c->address_families_whitelist ||
1366                 !set_isempty(c->address_families);
1367 }
1368
1369 static bool context_has_syscall_filters(const ExecContext *c) {
1370         assert(c);
1371
1372         return c->syscall_whitelist ||
1373                 !hashmap_isempty(c->syscall_filter);
1374 }
1375
/* Decides whether the no-new-privileges flag is needed for this context.
 *
 * It is needed if explicitly requested. Otherwise, if we hold CAP_SYS_ADMIN, it is not needed. If
 * we are unprivileged, it is needed as soon as any of the seccomp-based settings is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* Privileged: seccomp can be installed without NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* Unprivileged: any form of seccomp requires NNP */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1397
1398 #if HAVE_SECCOMP
1399
1400 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1401
1402         if (is_seccomp_available())
1403                 return false;
1404
1405         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1406         return true;
1407 }
1408
1409 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1410         uint32_t negative_action, default_action, action;
1411         int r;
1412
1413         assert(u);
1414         assert(c);
1415
1416         if (!context_has_syscall_filters(c))
1417                 return 0;
1418
1419         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1420                 return 0;
1421
1422         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1423
1424         if (c->syscall_whitelist) {
1425                 default_action = negative_action;
1426                 action = SCMP_ACT_ALLOW;
1427         } else {
1428                 default_action = SCMP_ACT_ALLOW;
1429                 action = negative_action;
1430         }
1431
1432         if (needs_ambient_hack) {
1433                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1434                 if (r < 0)
1435                         return r;
1436         }
1437
1438         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1439 }
1440
1441 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1442         assert(u);
1443         assert(c);
1444
1445         if (set_isempty(c->syscall_archs))
1446                 return 0;
1447
1448         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1449                 return 0;
1450
1451         return seccomp_restrict_archs(c->syscall_archs);
1452 }
1453
1454 static int apply_address_families(const Unit* u, const ExecContext *c) {
1455         assert(u);
1456         assert(c);
1457
1458         if (!context_has_address_families(c))
1459                 return 0;
1460
1461         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1462                 return 0;
1463
1464         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1465 }
1466
1467 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1468         assert(u);
1469         assert(c);
1470
1471         if (!c->memory_deny_write_execute)
1472                 return 0;
1473
1474         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1475                 return 0;
1476
1477         return seccomp_memory_deny_write_execute();
1478 }
1479
1480 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1481         assert(u);
1482         assert(c);
1483
1484         if (!c->restrict_realtime)
1485                 return 0;
1486
1487         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1488                 return 0;
1489
1490         return seccomp_restrict_realtime();
1491 }
1492
1493 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1494         assert(u);
1495         assert(c);
1496
1497         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1498          * let's protect even those systems where this is left on in the kernel. */
1499
1500         if (!c->protect_kernel_tunables)
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1504                 return 0;
1505
1506         return seccomp_protect_sysctl();
1507 }
1508
1509 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1510         assert(u);
1511         assert(c);
1512
1513         /* Turn off module syscalls on ProtectKernelModules=yes */
1514
1515         if (!c->protect_kernel_modules)
1516                 return 0;
1517
1518         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1519                 return 0;
1520
1521         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1525         assert(u);
1526         assert(c);
1527
1528         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1529
1530         if (!c->private_devices)
1531                 return 0;
1532
1533         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1534                 return 0;
1535
1536         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1537 }
1538
1539 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1540         assert(u);
1541         assert(c);
1542
1543         if (!exec_context_restrict_namespaces_set(c))
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1547                 return 0;
1548
1549         return seccomp_restrict_namespaces(c->restrict_namespaces);
1550 }
1551
1552 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1553         unsigned long personality;
1554         int r;
1555
1556         assert(u);
1557         assert(c);
1558
1559         if (!c->lock_personality)
1560                 return 0;
1561
1562         if (skip_seccomp_unavailable(u, "LockPersonality="))
1563                 return 0;
1564
1565         personality = c->personality;
1566
1567         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1568         if (personality == PERSONALITY_INVALID) {
1569
1570                 r = opinionated_personality(&personality);
1571                 if (r < 0)
1572                         return r;
1573         }
1574
1575         return seccomp_lock_personality(personality);
1576 }
1577
1578 #endif
1579
1580 static void do_idle_pipe_dance(int idle_pipe[4]) {
1581         assert(idle_pipe);
1582
1583         idle_pipe[1] = safe_close(idle_pipe[1]);
1584         idle_pipe[2] = safe_close(idle_pipe[2]);
1585
1586         if (idle_pipe[0] >= 0) {
1587                 int r;
1588
1589                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1590
1591                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1592                         ssize_t n;
1593
1594                         /* Signal systemd that we are bored and want to continue. */
1595                         n = write(idle_pipe[3], "x", 1);
1596                         if (n > 0)
1597                                 /* Wait for systemd to react to the signal above. */
1598                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1599                 }
1600
1601                 idle_pipe[0] = safe_close(idle_pipe[0]);
1602
1603         }
1604
1605         idle_pipe[3] = safe_close(idle_pipe[3]);
1606 }
1607
1608 static int build_environment(
1609                 const Unit *u,
1610                 const ExecContext *c,
1611                 const ExecParameters *p,
1612                 unsigned n_fds,
1613                 const char *home,
1614                 const char *username,
1615                 const char *shell,
1616                 dev_t journal_stream_dev,
1617                 ino_t journal_stream_ino,
1618                 char ***ret) {
1619
1620         _cleanup_strv_free_ char **our_env = NULL;
1621         unsigned n_env = 0;
1622         char *x;
1623
1624         assert(u);
1625         assert(c);
1626         assert(ret);
1627
1628         our_env = new0(char*, 14);
1629         if (!our_env)
1630                 return -ENOMEM;
1631
1632         if (n_fds > 0) {
1633                 _cleanup_free_ char *joined = NULL;
1634
1635                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1636                         return -ENOMEM;
1637                 our_env[n_env++] = x;
1638
1639                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1640                         return -ENOMEM;
1641                 our_env[n_env++] = x;
1642
1643                 joined = strv_join(p->fd_names, ":");
1644                 if (!joined)
1645                         return -ENOMEM;
1646
1647                 x = strjoin("LISTEN_FDNAMES=", joined);
1648                 if (!x)
1649                         return -ENOMEM;
1650                 our_env[n_env++] = x;
1651         }
1652
1653         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1654                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1655                         return -ENOMEM;
1656                 our_env[n_env++] = x;
1657
1658                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1659                         return -ENOMEM;
1660                 our_env[n_env++] = x;
1661         }
1662
1663         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1664          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1665          * check the database directly. */
1666         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1667                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1668                 if (!x)
1669                         return -ENOMEM;
1670                 our_env[n_env++] = x;
1671         }
1672
1673         if (home) {
1674                 x = strappend("HOME=", home);
1675                 if (!x)
1676                         return -ENOMEM;
1677                 our_env[n_env++] = x;
1678         }
1679
1680         if (username) {
1681                 x = strappend("LOGNAME=", username);
1682                 if (!x)
1683                         return -ENOMEM;
1684                 our_env[n_env++] = x;
1685
1686                 x = strappend("USER=", username);
1687                 if (!x)
1688                         return -ENOMEM;
1689                 our_env[n_env++] = x;
1690         }
1691
1692         if (shell) {
1693                 x = strappend("SHELL=", shell);
1694                 if (!x)
1695                         return -ENOMEM;
1696                 our_env[n_env++] = x;
1697         }
1698
1699         if (!sd_id128_is_null(u->invocation_id)) {
1700                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1701                         return -ENOMEM;
1702
1703                 our_env[n_env++] = x;
1704         }
1705
1706         if (exec_context_needs_term(c)) {
1707                 const char *tty_path, *term = NULL;
1708
1709                 tty_path = exec_context_tty_path(c);
1710
1711                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1712                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1713                  * passes to PID 1 ends up all the way in the console login shown. */
1714
1715                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1716                         term = getenv("TERM");
1717                 if (!term)
1718                         term = default_term_for_tty(tty_path);
1719
1720                 x = strappend("TERM=", term);
1721                 if (!x)
1722                         return -ENOMEM;
1723                 our_env[n_env++] = x;
1724         }
1725
1726         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1727                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1728                         return -ENOMEM;
1729
1730                 our_env[n_env++] = x;
1731         }
1732
1733         our_env[n_env++] = NULL;
1734         assert(n_env <= 12);
1735
1736         *ret = our_env;
1737         our_env = NULL;
1738
1739         return 0;
1740 }
1741
1742 static int build_pass_environment(const ExecContext *c, char ***ret) {
1743         _cleanup_strv_free_ char **pass_env = NULL;
1744         size_t n_env = 0, n_bufsize = 0;
1745         char **i;
1746
1747         STRV_FOREACH(i, c->pass_environment) {
1748                 _cleanup_free_ char *x = NULL;
1749                 char *v;
1750
1751                 v = getenv(*i);
1752                 if (!v)
1753                         continue;
1754                 x = strjoin(*i, "=", v);
1755                 if (!x)
1756                         return -ENOMEM;
1757
1758                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1759                         return -ENOMEM;
1760
1761                 pass_env[n_env++] = x;
1762                 pass_env[n_env] = NULL;
1763                 x = NULL;
1764         }
1765
1766         *ret = pass_env;
1767         pass_env = NULL;
1768
1769         return 0;
1770 }
1771
1772 static bool exec_needs_mount_namespace(
1773                 const ExecContext *context,
1774                 const ExecParameters *params,
1775                 const ExecRuntime *runtime) {
1776
1777         assert(context);
1778         assert(params);
1779
1780         if (context->root_image)
1781                 return true;
1782
1783         if (!strv_isempty(context->read_write_paths) ||
1784             !strv_isempty(context->read_only_paths) ||
1785             !strv_isempty(context->inaccessible_paths))
1786                 return true;
1787
1788         if (context->n_bind_mounts > 0)
1789                 return true;
1790
1791         if (context->n_temporary_filesystems > 0)
1792                 return true;
1793
1794         if (context->mount_flags != 0)
1795                 return true;
1796
1797         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1798                 return true;
1799
1800         if (context->private_devices ||
1801             context->protect_system != PROTECT_SYSTEM_NO ||
1802             context->protect_home != PROTECT_HOME_NO ||
1803             context->protect_kernel_tunables ||
1804             context->protect_kernel_modules ||
1805             context->protect_control_groups)
1806                 return true;
1807
1808         if (context->mount_apivfs && (context->root_image || context->root_directory))
1809                 return true;
1810
1811         if (context->dynamic_user &&
1812             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1813              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1814              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1815                 return true;
1816
1817         return false;
1818 }
1819
1820 static int setup_private_users(uid_t uid, gid_t gid) {
1821         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1822         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1823         _cleanup_close_ int unshare_ready_fd = -1;
1824         _cleanup_(sigkill_waitp) pid_t pid = 0;
1825         uint64_t c = 1;
1826         ssize_t n;
1827         int r;
1828
1829         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1830          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1831          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1832          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1833          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1834          * continues execution normally. */
1835
1836         if (uid != 0 && uid_is_valid(uid)) {
1837                 r = asprintf(&uid_map,
1838                              "0 0 1\n"                      /* Map root → root */
1839                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1840                              uid, uid);
1841                 if (r < 0)
1842                         return -ENOMEM;
1843         } else {
1844                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1845                 if (!uid_map)
1846                         return -ENOMEM;
1847         }
1848
1849         if (gid != 0 && gid_is_valid(gid)) {
1850                 r = asprintf(&gid_map,
1851                              "0 0 1\n"                      /* Map root → root */
1852                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1853                              gid, gid);
1854                 if (r < 0)
1855                         return -ENOMEM;
1856         } else {
1857                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1858                 if (!gid_map)
1859                         return -ENOMEM;
1860         }
1861
1862         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1863          * namespace. */
1864         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1865         if (unshare_ready_fd < 0)
1866                 return -errno;
1867
1868         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1869          * failed. */
1870         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1871                 return -errno;
1872
1873         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1874         if (r < 0)
1875                 return r;
1876         if (r == 0) {
1877                 _cleanup_close_ int fd = -1;
1878                 const char *a;
1879                 pid_t ppid;
1880
1881                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1882                  * here, after the parent opened its own user namespace. */
1883
1884                 ppid = getppid();
1885                 errno_pipe[0] = safe_close(errno_pipe[0]);
1886
1887                 /* Wait until the parent unshared the user namespace */
1888                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1889                         r = -errno;
1890                         goto child_fail;
1891                 }
1892
1893                 /* Disable the setgroups() system call in the child user namespace, for good. */
1894                 a = procfs_file_alloca(ppid, "setgroups");
1895                 fd = open(a, O_WRONLY|O_CLOEXEC);
1896                 if (fd < 0) {
1897                         if (errno != ENOENT) {
1898                                 r = -errno;
1899                                 goto child_fail;
1900                         }
1901
1902                         /* If the file is missing the kernel is too old, let's continue anyway. */
1903                 } else {
1904                         if (write(fd, "deny\n", 5) < 0) {
1905                                 r = -errno;
1906                                 goto child_fail;
1907                         }
1908
1909                         fd = safe_close(fd);
1910                 }
1911
1912                 /* First write the GID map */
1913                 a = procfs_file_alloca(ppid, "gid_map");
1914                 fd = open(a, O_WRONLY|O_CLOEXEC);
1915                 if (fd < 0) {
1916                         r = -errno;
1917                         goto child_fail;
1918                 }
1919                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1920                         r = -errno;
1921                         goto child_fail;
1922                 }
1923                 fd = safe_close(fd);
1924
1925                 /* The write the UID map */
1926                 a = procfs_file_alloca(ppid, "uid_map");
1927                 fd = open(a, O_WRONLY|O_CLOEXEC);
1928                 if (fd < 0) {
1929                         r = -errno;
1930                         goto child_fail;
1931                 }
1932                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1933                         r = -errno;
1934                         goto child_fail;
1935                 }
1936
1937                 _exit(EXIT_SUCCESS);
1938
1939         child_fail:
1940                 (void) write(errno_pipe[1], &r, sizeof(r));
1941                 _exit(EXIT_FAILURE);
1942         }
1943
1944         errno_pipe[1] = safe_close(errno_pipe[1]);
1945
1946         if (unshare(CLONE_NEWUSER) < 0)
1947                 return -errno;
1948
1949         /* Let the child know that the namespace is ready now */
1950         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1951                 return -errno;
1952
1953         /* Try to read an error code from the child */
1954         n = read(errno_pipe[0], &r, sizeof(r));
1955         if (n < 0)
1956                 return -errno;
1957         if (n == sizeof(r)) { /* an error code was sent to us */
1958                 if (r < 0)
1959                         return r;
1960                 return -EIO;
1961         }
1962         if (n != 0) /* on success we should have read 0 bytes */
1963                 return -EIO;
1964
1965         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1966         pid = 0;
1967         if (r < 0)
1968                 return r;
1969         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1970                 return -EIO;
1971
1972         return 0;
1973 }
1974
1975 static int setup_exec_directory(
1976                 const ExecContext *context,
1977                 const ExecParameters *params,
1978                 uid_t uid,
1979                 gid_t gid,
1980                 ExecDirectoryType type,
1981                 int *exit_status) {
1982
1983         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1984                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1985                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1986                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1987                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1988                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1989         };
1990         char **rt;
1991         int r;
1992
1993         assert(context);
1994         assert(params);
1995         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1996         assert(exit_status);
1997
1998         if (!params->prefix[type])
1999                 return 0;
2000
2001         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2002                 if (!uid_is_valid(uid))
2003                         uid = 0;
2004                 if (!gid_is_valid(gid))
2005                         gid = 0;
2006         }
2007
2008         STRV_FOREACH(rt, context->directories[type].paths) {
2009                 _cleanup_free_ char *p = NULL, *pp = NULL;
2010
2011                 p = strjoin(params->prefix[type], "/", *rt);
2012                 if (!p) {
2013                         r = -ENOMEM;
2014                         goto fail;
2015                 }
2016
2017                 r = mkdir_parents_label(p, 0755);
2018                 if (r < 0)
2019                         goto fail;
2020
2021                 if (context->dynamic_user &&
2022                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2023                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2024
2025                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2026                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2027                          * whose UID is later on reused. To lock this down we use the same trick used by container
2028                          * managers to prohibit host users to get access to files of the same UID in containers: we
2029                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2030                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2031                          * to make this directory permeable for the service itself.
2032                          *
2033                          * Specifically: for a service which wants a special directory "foo/" we first create a
2034                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2035                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2036                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2037                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2038                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2039                          * disabling the access boundary for the service and making sure it only gets access to the
2040                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2041                          *
2042                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2043                          * owned by the service itself.
2044                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2045                          * files or sockets with other services. */
2046
2047                         private_root = strjoin(params->prefix[type], "/private");
2048                         if (!private_root) {
2049                                 r = -ENOMEM;
2050                                 goto fail;
2051                         }
2052
2053                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2054                         r = mkdir_safe_label(private_root, 0700, 0, 0, 0);
2055                         if (r < 0)
2056                                 goto fail;
2057
2058                         pp = strjoin(private_root, "/", *rt);
2059                         if (!pp) {
2060                                 r = -ENOMEM;
2061                                 goto fail;
2062                         }
2063
2064                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2065                         r = mkdir_parents_label(pp, 0755);
2066                         if (r < 0)
2067                                 goto fail;
2068
2069                         if (is_dir(p, false) > 0 &&
2070                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2071
2072                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2073                                  * it over. Most likely the service has been upgraded from one that didn't use
2074                                  * DynamicUser=1, to one that does. */
2075
2076                                 if (rename(p, pp) < 0) {
2077                                         r = -errno;
2078                                         goto fail;
2079                                 }
2080                         } else {
2081                                 /* Otherwise, create the actual directory for the service */
2082
2083                                 r = mkdir_label(pp, context->directories[type].mode);
2084                                 if (r < 0 && r != -EEXIST)
2085                                         goto fail;
2086                         }
2087
2088                         parent = dirname_malloc(p);
2089                         if (!parent) {
2090                                 r = -ENOMEM;
2091                                 goto fail;
2092                         }
2093
2094                         r = path_make_relative(parent, pp, &relative);
2095                         if (r < 0)
2096                                 goto fail;
2097
2098                         /* And link it up from the original place */
2099                         r = symlink_idempotent(relative, p);
2100                         if (r < 0)
2101                                 goto fail;
2102
2103                         /* Lock down the access mode */
2104                         if (chmod(pp, context->directories[type].mode) < 0) {
2105                                 r = -errno;
2106                                 goto fail;
2107                         }
2108                 } else {
2109                         r = mkdir_label(p, context->directories[type].mode);
2110                         if (r == -EEXIST)
2111                                 continue;
2112                         if (r < 0)
2113                                 goto fail;
2114                 }
2115
2116                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2117                  * a service, and shall not be writable. */
2118                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2119                         continue;
2120
2121                 /* Then, change the ownership of the whole tree, if necessary */
2122                 r = path_chown_recursive(pp ?: p, uid, gid);
2123                 if (r < 0)
2124                         goto fail;
2125         }
2126
2127         return 0;
2128
2129 fail:
2130         *exit_status = exit_status_table[type];
2131         return r;
2132 }
2133
#if ENABLE_SMACK
/* Applies a SMACK label via mac_smack_apply_pid() with PID 0 (presumably meaning the calling process — confirm
 * against that helper). The label configured in context->smack_process_label takes precedence. If none is
 * configured and SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 * command->path is used, falling back to the built-in default if the file carries none.
 *
 * Returns 0 on success (or if there was nothing to do), a negative error otherwise. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing support is fine, exec_label just stays NULL then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2166
2167 static int compile_bind_mounts(
2168                 const ExecContext *context,
2169                 const ExecParameters *params,
2170                 BindMount **ret_bind_mounts,
2171                 unsigned *ret_n_bind_mounts,
2172                 char ***ret_empty_directories) {
2173
2174         _cleanup_strv_free_ char **empty_directories = NULL;
2175         BindMount *bind_mounts;
2176         unsigned n, h = 0, i;
2177         ExecDirectoryType t;
2178         int r;
2179
2180         assert(context);
2181         assert(params);
2182         assert(ret_bind_mounts);
2183         assert(ret_n_bind_mounts);
2184         assert(ret_empty_directories);
2185
2186         n = context->n_bind_mounts;
2187         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2188                 if (!params->prefix[t])
2189                         continue;
2190
2191                 n += strv_length(context->directories[t].paths);
2192         }
2193
2194         if (n <= 0) {
2195                 *ret_bind_mounts = NULL;
2196                 *ret_n_bind_mounts = 0;
2197                 *ret_empty_directories = NULL;
2198                 return 0;
2199         }
2200
2201         bind_mounts = new(BindMount, n);
2202         if (!bind_mounts)
2203                 return -ENOMEM;
2204
2205         for (i = 0; i < context->n_bind_mounts; i++) {
2206                 BindMount *item = context->bind_mounts + i;
2207                 char *s, *d;
2208
2209                 s = strdup(item->source);
2210                 if (!s) {
2211                         r = -ENOMEM;
2212                         goto finish;
2213                 }
2214
2215                 d = strdup(item->destination);
2216                 if (!d) {
2217                         free(s);
2218                         r = -ENOMEM;
2219                         goto finish;
2220                 }
2221
2222                 bind_mounts[h++] = (BindMount) {
2223                         .source = s,
2224                         .destination = d,
2225                         .read_only = item->read_only,
2226                         .recursive = item->recursive,
2227                         .ignore_enoent = item->ignore_enoent,
2228                 };
2229         }
2230
2231         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2232                 char **suffix;
2233
2234                 if (!params->prefix[t])
2235                         continue;
2236
2237                 if (strv_isempty(context->directories[t].paths))
2238                         continue;
2239
2240                 if (context->dynamic_user &&
2241                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2242                         char *private_root;
2243
2244                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2245                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2246                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2247
2248                         private_root = strjoin(params->prefix[t], "/private");
2249                         if (!private_root) {
2250                                 r = -ENOMEM;
2251                                 goto finish;
2252                         }
2253
2254                         r = strv_consume(&empty_directories, private_root);
2255                         if (r < 0)
2256                                 goto finish;
2257                 }
2258
2259                 STRV_FOREACH(suffix, context->directories[t].paths) {
2260                         char *s, *d;
2261
2262                         if (context->dynamic_user &&
2263                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2264                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2265                         else
2266                                 s = strjoin(params->prefix[t], "/", *suffix);
2267                         if (!s) {
2268                                 r = -ENOMEM;
2269                                 goto finish;
2270                         }
2271
2272                         d = strdup(s);
2273                         if (!d) {
2274                                 free(s);
2275                                 r = -ENOMEM;
2276                                 goto finish;
2277                         }
2278
2279                         bind_mounts[h++] = (BindMount) {
2280                                 .source = s,
2281                                 .destination = d,
2282                                 .read_only = false,
2283                                 .recursive = true,
2284                                 .ignore_enoent = false,
2285                         };
2286                 }
2287         }
2288
2289         assert(h == n);
2290
2291         *ret_bind_mounts = bind_mounts;
2292         *ret_n_bind_mounts = n;
2293         *ret_empty_directories = empty_directories;
2294
2295         empty_directories = NULL;
2296
2297         return (int) n;
2298
2299 finish:
2300         bind_mount_free_many(bind_mounts, h);
2301         return r;
2302 }
2303
2304 static int apply_mount_namespace(
2305                 const Unit *u,
2306                 const ExecCommand *command,
2307                 const ExecContext *context,
2308                 const ExecParameters *params,
2309                 const ExecRuntime *runtime) {
2310
2311         _cleanup_strv_free_ char **empty_directories = NULL;
2312         char *tmp = NULL, *var = NULL;
2313         const char *root_dir = NULL, *root_image = NULL;
2314         NamespaceInfo ns_info = {
2315                 .ignore_protect_paths = false,
2316                 .private_dev = context->private_devices,
2317                 .protect_control_groups = context->protect_control_groups,
2318                 .protect_kernel_tunables = context->protect_kernel_tunables,
2319                 .protect_kernel_modules = context->protect_kernel_modules,
2320                 .mount_apivfs = context->mount_apivfs,
2321         };
2322         bool needs_sandboxing;
2323         BindMount *bind_mounts = NULL;
2324         unsigned n_bind_mounts = 0;
2325         int r;
2326
2327         assert(context);
2328
2329         /* The runtime struct only contains the parent of the private /tmp,
2330          * which is non-accessible to world users. Inside of it there's a /tmp
2331          * that is sticky, and that's the one we want to use here. */
2332
2333         if (context->private_tmp && runtime) {
2334                 if (runtime->tmp_dir)
2335                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2336                 if (runtime->var_tmp_dir)
2337                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2338         }
2339
2340         if (params->flags & EXEC_APPLY_CHROOT) {
2341                 root_image = context->root_image;
2342
2343                 if (!root_image)
2344                         root_dir = context->root_directory;
2345         }
2346
2347         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2348         if (r < 0)
2349                 return r;
2350
2351         /*
2352          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2353          * sandbox info, otherwise enforce it, don't ignore protected paths and
2354          * fail if we are enable to apply the sandbox inside the mount namespace.
2355          */
2356         if (!context->dynamic_user && root_dir)
2357                 ns_info.ignore_protect_paths = true;
2358
2359         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2360
2361         r = setup_namespace(root_dir, root_image,
2362                             &ns_info, context->read_write_paths,
2363                             needs_sandboxing ? context->read_only_paths : NULL,
2364                             needs_sandboxing ? context->inaccessible_paths : NULL,
2365                             empty_directories,
2366                             bind_mounts,
2367                             n_bind_mounts,
2368                             context->temporary_filesystems,
2369                             context->n_temporary_filesystems,
2370                             tmp,
2371                             var,
2372                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2373                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2374                             context->mount_flags,
2375                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2376
2377         bind_mount_free_many(bind_mounts, n_bind_mounts);
2378
2379         /* If we couldn't set up the namespace this is probably due to a
2380          * missing capability. In this case, silently proceeed. */
2381         if (IN_SET(r, -EPERM, -EACCES)) {
2382                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2383                 return 0;
2384         }
2385
2386         return r;
2387 }
2388
2389 static int apply_working_directory(
2390                 const ExecContext *context,
2391                 const ExecParameters *params,
2392                 const char *home,
2393                 const bool needs_mount_ns,
2394                 int *exit_status) {
2395
2396         const char *d, *wd;
2397
2398         assert(context);
2399         assert(exit_status);
2400
2401         if (context->working_directory_home) {
2402
2403                 if (!home) {
2404                         *exit_status = EXIT_CHDIR;
2405                         return -ENXIO;
2406                 }
2407
2408                 wd = home;
2409
2410         } else if (context->working_directory)
2411                 wd = context->working_directory;
2412         else
2413                 wd = "/";
2414
2415         if (params->flags & EXEC_APPLY_CHROOT) {
2416                 if (!needs_mount_ns && context->root_directory)
2417                         if (chroot(context->root_directory) < 0) {
2418                                 *exit_status = EXIT_CHROOT;
2419                                 return -errno;
2420                         }
2421
2422                 d = wd;
2423         } else
2424                 d = prefix_roota(context->root_directory, wd);
2425
2426         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2427                 *exit_status = EXIT_CHDIR;
2428                 return -errno;
2429         }
2430
2431         return 0;
2432 }
2433
2434 static int setup_keyring(
2435                 const Unit *u,
2436                 const ExecContext *context,
2437                 const ExecParameters *p,
2438                 uid_t uid, gid_t gid) {
2439
2440         key_serial_t keyring;
2441         int r;
2442
2443         assert(u);
2444         assert(context);
2445         assert(p);
2446
2447         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2448          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2449          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2450          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2451          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2452          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2453
2454         if (!(p->flags & EXEC_NEW_KEYRING))
2455                 return 0;
2456
2457         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2458                 return 0;
2459
2460         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2461         if (keyring == -1) {
2462                 if (errno == ENOSYS)
2463                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2464                 else if (IN_SET(errno, EACCES, EPERM))
2465                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2466                 else if (errno == EDQUOT)
2467                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2468                 else
2469                         return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2470
2471                 return 0;
2472         }
2473
2474         /* Populate they keyring with the invocation ID by default. */
2475         if (!sd_id128_is_null(u->invocation_id)) {
2476                 key_serial_t key;
2477
2478                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2479                 if (key == -1)
2480                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2481                 else {
2482                         if (keyctl(KEYCTL_SETPERM, key,
2483                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2484                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2485                                 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2486                 }
2487         }
2488
2489         /* And now, make the keyring owned by the service's user */
2490         if (uid_is_valid(uid) || gid_is_valid(gid))
2491                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2492                         return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2493
2494         /* When requested link the user keyring into the session keyring. */
2495         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2496                 uid_t saved_uid;
2497                 gid_t saved_gid;
2498
2499                 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2500                  * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2501                  * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2502
2503                 saved_uid = getuid();
2504                 saved_gid = getgid();
2505
2506                 if (gid_is_valid(gid) && gid != saved_gid) {
2507                         if (setregid(gid, -1) < 0)
2508                                 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2509                 }
2510
2511                 if (uid_is_valid(uid) && uid != saved_uid) {
2512                         if (setreuid(uid, -1) < 0) {
2513                                 (void) setregid(saved_gid, -1);
2514                                 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2515                         }
2516                 }
2517
2518                 if (keyctl(KEYCTL_LINK,
2519                            KEY_SPEC_USER_KEYRING,
2520                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2521
2522                         r = -errno;
2523
2524                         (void) setreuid(saved_uid, -1);
2525                         (void) setregid(saved_gid, -1);
2526
2527                         return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2528                 }
2529
2530                 if (uid_is_valid(uid) && uid != saved_uid) {
2531                         if (setreuid(saved_uid, -1) < 0) {
2532                                 (void) setregid(saved_gid, -1);
2533                                 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2534                         }
2535                 }
2536
2537                 if (gid_is_valid(gid) && gid != saved_gid) {
2538                         if (setregid(saved_gid, -1) < 0)
2539                                 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2540                 }
2541         }
2542
2543         return 0;
2544 }
2545
/* Appends every valid (i.e. non-negative) fd of the socket pair to array[], incrementing *n once per fd that
 * is stored. A NULL pair is accepted and leaves array[] and *n untouched. No bounds checking is done here, the
 * caller has to provide room for up to two additional entries. */
static void append_socket_pair(int *array, unsigned *n, const int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair)
                for (k = 0; k < 2; k++)
                        if (pair[k] >= 0)
                                array[(*n)++] = pair[k];
}
2558
2559 static int close_remaining_fds(
2560                 const ExecParameters *params,
2561                 const ExecRuntime *runtime,
2562                 const DynamicCreds *dcreds,
2563                 int user_lookup_fd,
2564                 int socket_fd,
2565                 int *fds, unsigned n_fds) {
2566
2567         unsigned n_dont_close = 0;
2568         int dont_close[n_fds + 12];
2569
2570         assert(params);
2571
2572         if (params->stdin_fd >= 0)
2573                 dont_close[n_dont_close++] = params->stdin_fd;
2574         if (params->stdout_fd >= 0)
2575                 dont_close[n_dont_close++] = params->stdout_fd;
2576         if (params->stderr_fd >= 0)
2577                 dont_close[n_dont_close++] = params->stderr_fd;
2578
2579         if (socket_fd >= 0)
2580                 dont_close[n_dont_close++] = socket_fd;
2581         if (n_fds > 0) {
2582                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2583                 n_dont_close += n_fds;
2584         }
2585
2586         if (runtime)
2587                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2588
2589         if (dcreds) {
2590                 if (dcreds->user)
2591                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2592                 if (dcreds->group)
2593                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2594         }
2595
2596         if (user_lookup_fd >= 0)
2597                 dont_close[n_dont_close++] = user_lookup_fd;
2598
2599         return close_all_fds(dont_close, n_dont_close);
2600 }
2601
2602 static int send_user_lookup(
2603                 Unit *unit,
2604                 int user_lookup_fd,
2605                 uid_t uid,
2606                 gid_t gid) {
2607
2608         assert(unit);
2609
2610         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2611          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2612          * specified. */
2613
2614         if (user_lookup_fd < 0)
2615                 return 0;
2616
2617         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2618                 return 0;
2619
2620         if (writev(user_lookup_fd,
2621                (struct iovec[]) {
2622                            IOVEC_INIT(&uid, sizeof(uid)),
2623                            IOVEC_INIT(&gid, sizeof(gid)),
2624                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2625                 return -errno;
2626
2627         return 0;
2628 }
2629
2630 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2631         int r;
2632
2633         assert(c);
2634         assert(home);
2635         assert(buf);
2636
2637         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2638
2639         if (*home)
2640                 return 0;
2641
2642         if (!c->working_directory_home)
2643                 return 0;
2644
2645         if (uid == 0) {
2646                 /* Hardcode /root as home directory for UID 0 */
2647                 *home = "/root";
2648                 return 1;
2649         }
2650
2651         r = get_home_dir(buf);
2652         if (r < 0)
2653                 return r;
2654
2655         *home = *buf;
2656         return 1;
2657 }
2658
2659 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2660         _cleanup_strv_free_ char ** list = NULL;
2661         ExecDirectoryType t;
2662         int r;
2663
2664         assert(c);
2665         assert(p);
2666         assert(ret);
2667
2668         assert(c->dynamic_user);
2669
2670         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2671          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2672          * directories. */
2673
2674         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2675                 char **i;
2676
2677                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2678                         continue;
2679
2680                 if (!p->prefix[t])
2681                         continue;
2682
2683                 STRV_FOREACH(i, c->directories[t].paths) {
2684                         char *e;
2685
2686                         if (t == EXEC_DIRECTORY_RUNTIME)
2687                                 e = strjoin(p->prefix[t], "/", *i);
2688                         else
2689                                 e = strjoin(p->prefix[t], "/private/", *i);
2690                         if (!e)
2691                                 return -ENOMEM;
2692
2693                         r = strv_consume(&list, e);
2694                         if (r < 0)
2695                                 return r;
2696                 }
2697         }
2698
2699         *ret = list;
2700         list = NULL;
2701
2702         return 0;
2703 }
2704
2705 static char *exec_command_line(char **argv);
2706
2707 static int exec_child(
2708                 Unit *unit,
2709                 const ExecCommand *command,
2710                 const ExecContext *context,
2711                 const ExecParameters *params,
2712                 ExecRuntime *runtime,
2713                 DynamicCreds *dcreds,
2714                 char **argv,
2715                 int socket_fd,
2716                 int named_iofds[3],
2717                 int *fds,
2718                 unsigned n_storage_fds,
2719                 unsigned n_socket_fds,
2720                 char **files_env,
2721                 int user_lookup_fd,
2722                 int *exit_status) {
2723
2724         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2725         _cleanup_free_ char *home_buffer = NULL;
2726         _cleanup_free_ gid_t *supplementary_gids = NULL;
2727         const char *username = NULL, *groupname = NULL;
2728         const char *home = NULL, *shell = NULL;
2729         dev_t journal_stream_dev = 0;
2730         ino_t journal_stream_ino = 0;
2731         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2732                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2733                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2734                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2735 #if HAVE_SELINUX
2736         _cleanup_free_ char *mac_selinux_context_net = NULL;
2737         bool use_selinux = false;
2738 #endif
2739 #if ENABLE_SMACK
2740         bool use_smack = false;
2741 #endif
2742 #if HAVE_APPARMOR
2743         bool use_apparmor = false;
2744 #endif
2745         uid_t uid = UID_INVALID;
2746         gid_t gid = GID_INVALID;
2747         int i, r, ngids = 0;
2748         unsigned n_fds;
2749         ExecDirectoryType dt;
2750         int secure_bits;
2751
2752         assert(unit);
2753         assert(command);
2754         assert(context);
2755         assert(params);
2756         assert(exit_status);
2757
2758         rename_process_from_path(command->path);
2759
2760         /* We reset exactly these signals, since they are the
2761          * only ones we set to SIG_IGN in the main daemon. All
2762          * others we leave untouched because we set them to
2763          * SIG_DFL or a valid handler initially, both of which
2764          * will be demoted to SIG_DFL. */
2765         (void) default_signals(SIGNALS_CRASH_HANDLER,
2766                                SIGNALS_IGNORE, -1);
2767
2768         if (context->ignore_sigpipe)
2769                 (void) ignore_signals(SIGPIPE, -1);
2770
2771         r = reset_signal_mask();
2772         if (r < 0) {
2773                 *exit_status = EXIT_SIGNAL_MASK;
2774                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2775         }
2776
2777         if (params->idle_pipe)
2778                 do_idle_pipe_dance(params->idle_pipe);
2779
2780         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2781          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2782          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2783          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2784
2785         log_forget_fds();
2786         log_set_open_when_needed(true);
2787
2788         /* In case anything used libc syslog(), close this here, too */
2789         closelog();
2790
2791         n_fds = n_storage_fds + n_socket_fds;
2792         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2793         if (r < 0) {
2794                 *exit_status = EXIT_FDS;
2795                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2796         }
2797
2798         if (!context->same_pgrp)
2799                 if (setsid() < 0) {
2800                         *exit_status = EXIT_SETSID;
2801                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2802                 }
2803
2804         exec_context_tty_reset(context, params);
2805
2806         if (unit_shall_confirm_spawn(unit)) {
2807                 const char *vc = params->confirm_spawn;
2808                 _cleanup_free_ char *cmdline = NULL;
2809
2810                 cmdline = exec_command_line(argv);
2811                 if (!cmdline) {
2812                         *exit_status = EXIT_MEMORY;
2813                         return log_oom();
2814                 }
2815
2816                 r = ask_for_confirmation(vc, unit, cmdline);
2817                 if (r != CONFIRM_EXECUTE) {
2818                         if (r == CONFIRM_PRETEND_SUCCESS) {
2819                                 *exit_status = EXIT_SUCCESS;
2820                                 return 0;
2821                         }
2822                         *exit_status = EXIT_CONFIRM;
2823                         log_unit_error(unit, "Execution cancelled by the user");
2824                         return -ECANCELED;
2825                 }
2826         }
2827
2828         if (context->dynamic_user && dcreds) {
2829                 _cleanup_strv_free_ char **suggested_paths = NULL;
2830
2831                 /* Make sure we bypass our own NSS module for any NSS checks */
2832                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2833                         *exit_status = EXIT_USER;
2834                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2835                 }
2836
2837                 r = compile_suggested_paths(context, params, &suggested_paths);
2838                 if (r < 0) {
2839                         *exit_status = EXIT_MEMORY;
2840                         return log_oom();
2841                 }
2842
2843                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2844                 if (r < 0) {
2845                         *exit_status = EXIT_USER;
2846                         if (r == -EILSEQ) {
2847                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2848                                 return -EOPNOTSUPP;
2849                         }
2850                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2851                 }
2852
2853                 if (!uid_is_valid(uid)) {
2854                         *exit_status = EXIT_USER;
2855                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2856                         return -ESRCH;
2857                 }
2858
2859                 if (!gid_is_valid(gid)) {
2860                         *exit_status = EXIT_USER;
2861                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2862                         return -ESRCH;
2863                 }
2864
2865                 if (dcreds->user)
2866                         username = dcreds->user->name;
2867
2868         } else {
2869                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2870                 if (r < 0) {
2871                         *exit_status = EXIT_USER;
2872                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2873                 }
2874
2875                 r = get_fixed_group(context, &groupname, &gid);
2876                 if (r < 0) {
2877                         *exit_status = EXIT_GROUP;
2878                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2879                 }
2880         }
2881
2882         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2883         r = get_supplementary_groups(context, username, groupname, gid,
2884                                      &supplementary_gids, &ngids);
2885         if (r < 0) {
2886                 *exit_status = EXIT_GROUP;
2887                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2888         }
2889
2890         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2891         if (r < 0) {
2892                 *exit_status = EXIT_USER;
2893                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2894         }
2895
2896         user_lookup_fd = safe_close(user_lookup_fd);
2897
2898         r = acquire_home(context, uid, &home, &home_buffer);
2899         if (r < 0) {
2900                 *exit_status = EXIT_CHDIR;
2901                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2902         }
2903
2904         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2905          * must sure to drop O_NONBLOCK */
2906         if (socket_fd >= 0)
2907                 (void) fd_nonblock(socket_fd, false);
2908
2909         r = setup_input(context, params, socket_fd, named_iofds);
2910         if (r < 0) {
2911                 *exit_status = EXIT_STDIN;
2912                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2913         }
2914
2915         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2916         if (r < 0) {
2917                 *exit_status = EXIT_STDOUT;
2918                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2919         }
2920
2921         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2922         if (r < 0) {
2923                 *exit_status = EXIT_STDERR;
2924                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2925         }
2926
2927         if (params->cgroup_path) {
2928                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2929                 if (r < 0) {
2930                         *exit_status = EXIT_CGROUP;
2931                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2932                 }
2933         }
2934
2935         if (context->oom_score_adjust_set) {
2936                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2937
2938                 /* When we can't make this change due to EPERM, then
2939                  * let's silently skip over it. User namespaces
2940                  * prohibit write access to this file, and we
2941                  * shouldn't trip up over that. */
2942
2943                 sprintf(t, "%i", context->oom_score_adjust);
2944                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2945                 if (IN_SET(r, -EPERM, -EACCES))
2946                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2947                 else if (r < 0) {
2948                         *exit_status = EXIT_OOM_ADJUST;
2949                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2950                 }
2951         }
2952
2953         if (context->nice_set)
2954                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2955                         *exit_status = EXIT_NICE;
2956                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2957                 }
2958
2959         if (context->cpu_sched_set) {
2960                 struct sched_param param = {
2961                         .sched_priority = context->cpu_sched_priority,
2962                 };
2963
2964                 r = sched_setscheduler(0,
2965                                        context->cpu_sched_policy |
2966                                        (context->cpu_sched_reset_on_fork ?
2967                                         SCHED_RESET_ON_FORK : 0),
2968                                        &param);
2969                 if (r < 0) {
2970                         *exit_status = EXIT_SETSCHEDULER;
2971                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2972                 }
2973         }
2974
2975         if (context->cpuset)
2976                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2977                         *exit_status = EXIT_CPUAFFINITY;
2978                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2979                 }
2980
2981         if (context->ioprio_set)
2982                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2983                         *exit_status = EXIT_IOPRIO;
2984                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2985                 }
2986
2987         if (context->timer_slack_nsec != NSEC_INFINITY)
2988                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2989                         *exit_status = EXIT_TIMERSLACK;
2990                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2991                 }
2992
2993         if (context->personality != PERSONALITY_INVALID) {
2994                 r = safe_personality(context->personality);
2995                 if (r < 0) {
2996                         *exit_status = EXIT_PERSONALITY;
2997                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2998                 }
2999         }
3000
3001         if (context->utmp_id)
3002                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3003                                       context->tty_path,
3004                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3005                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3006                                       USER_PROCESS,
3007                                       username);
3008
3009         if (context->user) {
3010                 r = chown_terminal(STDIN_FILENO, uid);
3011                 if (r < 0) {
3012                         *exit_status = EXIT_STDIN;
3013                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3014                 }
3015         }
3016
3017         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3018          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3019          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3020          * touch a single hierarchy too. */
3021         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3022                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3023                 if (r < 0) {
3024                         *exit_status = EXIT_CGROUP;
3025                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3026                 }
3027         }
3028
3029         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3030                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3031                 if (r < 0)
3032                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3033         }
3034
3035         r = build_environment(
3036                         unit,
3037                         context,
3038                         params,
3039                         n_fds,
3040                         home,
3041                         username,
3042                         shell,
3043                         journal_stream_dev,
3044                         journal_stream_ino,
3045                         &our_env);
3046         if (r < 0) {
3047                 *exit_status = EXIT_MEMORY;
3048                 return log_oom();
3049         }
3050
3051         r = build_pass_environment(context, &pass_env);
3052         if (r < 0) {
3053                 *exit_status = EXIT_MEMORY;
3054                 return log_oom();
3055         }
3056
3057         accum_env = strv_env_merge(5,
3058                                    params->environment,
3059                                    our_env,
3060                                    pass_env,
3061                                    context->environment,
3062                                    files_env,
3063                                    NULL);
3064         if (!accum_env) {
3065                 *exit_status = EXIT_MEMORY;
3066                 return log_oom();
3067         }
3068         accum_env = strv_env_clean(accum_env);
3069
3070         (void) umask(context->umask);
3071
3072         r = setup_keyring(unit, context, params, uid, gid);
3073         if (r < 0) {
3074                 *exit_status = EXIT_KEYRING;
3075                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3076         }
3077
3078         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3079         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3080
3081         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3082         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3083
3084         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3085         if (needs_ambient_hack)
3086                 needs_setuid = false;
3087         else
3088                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3089
3090         if (needs_sandboxing) {
3091                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3092                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3093                  * impacting our own code paths. */
3094
3095 #if HAVE_SELINUX
3096                 use_selinux = mac_selinux_use();
3097 #endif
3098 #if ENABLE_SMACK
3099                 use_smack = mac_smack_use();
3100 #endif
3101 #if HAVE_APPARMOR
3102                 use_apparmor = mac_apparmor_use();
3103 #endif
3104         }
3105
3106         if (needs_setuid) {
3107                 if (context->pam_name && username) {
3108                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3109                         if (r < 0) {
3110                                 *exit_status = EXIT_PAM;
3111                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3112                         }
3113                 }
3114         }
3115
3116         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3117                 if (ns_type_supported(NAMESPACE_NET)) {
3118                         r = setup_netns(runtime->netns_storage_socket);
3119                         if (r < 0) {
3120                                 *exit_status = EXIT_NETWORK;
3121                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3122                         }
3123                 } else
3124                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3125         }
3126
3127         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3128         if (needs_mount_namespace) {
3129                 r = apply_mount_namespace(unit, command, context, params, runtime);
3130                 if (r < 0) {
3131                         *exit_status = EXIT_NAMESPACE;
3132                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3133                 }
3134         }
3135
3136         /* Apply just after mount namespace setup */
3137         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3138         if (r < 0)
3139                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3140
3141         /* Drop groups as early as possbile */
3142         if (needs_setuid) {
3143                 r = enforce_groups(gid, supplementary_gids, ngids);
3144                 if (r < 0) {
3145                         *exit_status = EXIT_GROUP;
3146                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3147                 }
3148         }
3149
3150         if (needs_sandboxing) {
3151 #if HAVE_SELINUX
3152                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3153                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3154                         if (r < 0) {
3155                                 *exit_status = EXIT_SELINUX_CONTEXT;
3156                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3157                         }
3158                 }
3159 #endif
3160
3161                 if (context->private_users) {
3162                         r = setup_private_users(uid, gid);
3163                         if (r < 0) {
3164                                 *exit_status = EXIT_USER;
3165                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3166                         }
3167                 }
3168         }
3169
3170         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3171          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3172          * was needed to upload the policy and can now be closed as well. */
3173         r = close_all_fds(fds, n_fds);
3174         if (r >= 0)
3175                 r = shift_fds(fds, n_fds);
3176         if (r >= 0)
3177                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3178         if (r < 0) {
3179                 *exit_status = EXIT_FDS;
3180                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3181         }
3182
3183         secure_bits = context->secure_bits;
3184
3185         if (needs_sandboxing) {
3186                 uint64_t bset;
3187
3188                 for (i = 0; i < _RLIMIT_MAX; i++) {
3189
3190                         if (!context->rlimit[i])
3191                                 continue;
3192
3193                         r = setrlimit_closest(i, context->rlimit[i]);
3194                         if (r < 0) {
3195                                 *exit_status = EXIT_LIMITS;
3196                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3197                         }
3198                 }
3199
3200                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3201                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3202                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3203                                 *exit_status = EXIT_LIMITS;
3204                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3205                         }
3206                 }
3207
3208 #if ENABLE_SMACK
3209                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3210                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3211                 if (use_smack) {
3212                         r = setup_smack(context, command);
3213                         if (r < 0) {
3214                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3215                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3216                         }
3217                 }
3218 #endif
3219
3220                 bset = context->capability_bounding_set;
3221                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3222                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3223                  * instead of us doing that */
3224                 if (needs_ambient_hack)
3225                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3226                                 (UINT64_C(1) << CAP_SETUID) |
3227                                 (UINT64_C(1) << CAP_SETGID);
3228
3229                 if (!cap_test_all(bset)) {
3230                         r = capability_bounding_set_drop(bset, false);
3231                         if (r < 0) {
3232                                 *exit_status = EXIT_CAPABILITIES;
3233                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3234                         }
3235                 }
3236
3237                 /* This is done before enforce_user, but ambient set
3238                  * does not survive over setresuid() if keep_caps is not set. */
3239                 if (!needs_ambient_hack &&
3240                     context->capability_ambient_set != 0) {
3241                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3242                         if (r < 0) {
3243                                 *exit_status = EXIT_CAPABILITIES;
3244                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3245                         }
3246                 }
3247         }
3248
3249         if (needs_setuid) {
3250                 if (context->user) {
3251                         r = enforce_user(context, uid);
3252                         if (r < 0) {
3253                                 *exit_status = EXIT_USER;
3254                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3255                         }
3256
3257                         if (!needs_ambient_hack &&
3258                             context->capability_ambient_set != 0) {
3259
3260                                 /* Fix the ambient capabilities after user change. */
3261                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3262                                 if (r < 0) {
3263                                         *exit_status = EXIT_CAPABILITIES;
3264                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3265                                 }
3266
3267                                 /* If we were asked to change user and ambient capabilities
3268                                  * were requested, we had to add keep-caps to the securebits
3269                                  * so that we would maintain the inherited capability set
3270                                  * through the setresuid(). Make sure that the bit is added
3271                                  * also to the context secure_bits so that we don't try to
3272                                  * drop the bit away next. */
3273
3274                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3275                         }
3276                 }
3277         }
3278
3279         if (needs_sandboxing) {
3280                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3281                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3282                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3283                  * are restricted. */
3284
3285 #if HAVE_SELINUX
3286                 if (use_selinux) {
3287                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3288
3289                         if (exec_context) {
3290                                 r = setexeccon(exec_context);
3291                                 if (r < 0) {
3292                                         *exit_status = EXIT_SELINUX_CONTEXT;
3293                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3294                                 }
3295                         }
3296                 }
3297 #endif
3298
3299 #if HAVE_APPARMOR
3300                 if (use_apparmor && context->apparmor_profile) {
3301                         r = aa_change_onexec(context->apparmor_profile);
3302                         if (r < 0 && !context->apparmor_profile_ignore) {
3303                                 *exit_status = EXIT_APPARMOR_PROFILE;
3304                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3305                         }
3306                 }
3307 #endif
3308
3309                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3310                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3311                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3312                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3313                                 *exit_status = EXIT_SECUREBITS;
3314                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3315                         }
3316
3317                 if (context_has_no_new_privileges(context))
3318                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3319                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3320                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3321                         }
3322
3323 #if HAVE_SECCOMP
3324                 r = apply_address_families(unit, context);
3325                 if (r < 0) {
3326                         *exit_status = EXIT_ADDRESS_FAMILIES;
3327                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3328                 }
3329
3330                 r = apply_memory_deny_write_execute(unit, context);
3331                 if (r < 0) {
3332                         *exit_status = EXIT_SECCOMP;
3333                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3334                 }
3335
3336                 r = apply_restrict_realtime(unit, context);
3337                 if (r < 0) {
3338                         *exit_status = EXIT_SECCOMP;
3339                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3340                 }
3341
3342                 r = apply_restrict_namespaces(unit, context);
3343                 if (r < 0) {
3344                         *exit_status = EXIT_SECCOMP;
3345                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3346                 }
3347
3348                 r = apply_protect_sysctl(unit, context);
3349                 if (r < 0) {
3350                         *exit_status = EXIT_SECCOMP;
3351                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3352                 }
3353
3354                 r = apply_protect_kernel_modules(unit, context);
3355                 if (r < 0) {
3356                         *exit_status = EXIT_SECCOMP;
3357                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3358                 }
3359
3360                 r = apply_private_devices(unit, context);
3361                 if (r < 0) {
3362                         *exit_status = EXIT_SECCOMP;
3363                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3364                 }
3365
3366                 r = apply_syscall_archs(unit, context);
3367                 if (r < 0) {
3368                         *exit_status = EXIT_SECCOMP;
3369                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3370                 }
3371
3372                 r = apply_lock_personality(unit, context);
3373                 if (r < 0) {
3374                         *exit_status = EXIT_SECCOMP;
3375                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3376                 }
3377
3378                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3379                  * by the filter as little as possible. */
3380                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3381                 if (r < 0) {
3382                         *exit_status = EXIT_SECCOMP;
3383                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3384                 }
3385 #endif
3386         }
3387
3388         if (!strv_isempty(context->unset_environment)) {
3389                 char **ee = NULL;
3390
3391                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3392                 if (!ee) {
3393                         *exit_status = EXIT_MEMORY;
3394                         return log_oom();
3395                 }
3396
3397                 strv_free(accum_env);
3398                 accum_env = ee;
3399         }
3400
3401         final_argv = replace_env_argv(argv, accum_env);
3402         if (!final_argv) {
3403                 *exit_status = EXIT_MEMORY;
3404                 return log_oom();
3405         }
3406
3407         if (DEBUG_LOGGING) {
3408                 _cleanup_free_ char *line;
3409
3410                 line = exec_command_line(final_argv);
3411                 if (line) {
3412                         log_struct(LOG_DEBUG,
3413                                    "EXECUTABLE=%s", command->path,
3414                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3415                                    LOG_UNIT_ID(unit),
3416                                    LOG_UNIT_INVOCATION_ID(unit),
3417                                    NULL);
3418                 }
3419         }
3420
3421         execve(command->path, final_argv, accum_env);
3422
3423         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3424
3425                 log_struct_errno(LOG_INFO, errno,
3426                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3427                                  LOG_UNIT_ID(unit),
3428                                  LOG_UNIT_INVOCATION_ID(unit),
3429                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3430                                                   command->path),
3431                                  "EXECUTABLE=%s", command->path,
3432                                  NULL);
3433
3434                 return 0;
3435         }
3436
3437         *exit_status = EXIT_EXEC;
3438         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3439 }
3440
3441 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3442 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3443
3444 int exec_spawn(Unit *unit,
3445                ExecCommand *command,
3446                const ExecContext *context,
3447                const ExecParameters *params,
3448                ExecRuntime *runtime,
3449                DynamicCreds *dcreds,
3450                pid_t *ret) {
3451
3452         _cleanup_strv_free_ char **files_env = NULL;
3453         int *fds = NULL;
3454         unsigned n_storage_fds = 0, n_socket_fds = 0;
3455         _cleanup_free_ char *line = NULL;
3456         int socket_fd, r;
3457         int named_iofds[3] = { -1, -1, -1 };
3458         char **argv;
3459         pid_t pid;
3460
3461         assert(unit);
3462         assert(command);
3463         assert(context);
3464         assert(ret);
3465         assert(params);
3466         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3467
3468         if (context->std_input == EXEC_INPUT_SOCKET ||
3469             context->std_output == EXEC_OUTPUT_SOCKET ||
3470             context->std_error == EXEC_OUTPUT_SOCKET) {
3471
3472                 if (params->n_socket_fds > 1) {
3473                         log_unit_error(unit, "Got more than one socket.");
3474                         return -EINVAL;
3475                 }
3476
3477                 if (params->n_socket_fds == 0) {
3478                         log_unit_error(unit, "Got no socket.");
3479                         return -EINVAL;
3480                 }
3481
3482                 socket_fd = params->fds[0];
3483         } else {
3484                 socket_fd = -1;
3485                 fds = params->fds;
3486                 n_storage_fds = params->n_storage_fds;
3487                 n_socket_fds = params->n_socket_fds;
3488         }
3489
3490         r = exec_context_named_iofds(context, params, named_iofds);
3491         if (r < 0)
3492                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3493
3494         r = exec_context_load_environment(unit, context, &files_env);
3495         if (r < 0)
3496                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3497
3498         argv = params->argv ?: command->argv;
3499         line = exec_command_line(argv);
3500         if (!line)
3501                 return log_oom();
3502
3503         log_struct(LOG_DEBUG,
3504                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3505                    "EXECUTABLE=%s", command->path,
3506                    LOG_UNIT_ID(unit),
3507                    LOG_UNIT_INVOCATION_ID(unit),
3508                    NULL);
3509
3510         pid = fork();
3511         if (pid < 0)
3512                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3513
3514         if (pid == 0) {
3515                 int exit_status = EXIT_SUCCESS;
3516
3517                 r = exec_child(unit,
3518                                command,
3519                                context,
3520                                params,
3521                                runtime,
3522                                dcreds,
3523                                argv,
3524                                socket_fd,
3525                                named_iofds,
3526                                fds,
3527                                n_storage_fds,
3528                                n_socket_fds,
3529                                files_env,
3530                                unit->manager->user_lookup_fds[1],
3531                                &exit_status);
3532
3533                 if (r < 0) {
3534                         log_struct_errno(LOG_ERR, r,
3535                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3536                                          LOG_UNIT_ID(unit),
3537                                          LOG_UNIT_INVOCATION_ID(unit),
3538                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3539                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3540                                                           command->path),
3541                                          "EXECUTABLE=%s", command->path,
3542                                          NULL);
3543                 }
3544
3545                 _exit(exit_status);
3546         }
3547
3548         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3549
3550         /* We add the new process to the cgroup both in the child (so
3551          * that we can be sure that no user code is ever executed
3552          * outside of the cgroup) and in the parent (so that we can be
3553          * sure that when we kill the cgroup the process will be
3554          * killed too). */
3555         if (params->cgroup_path)
3556                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3557
3558         exec_status_start(&command->exec_status, pid);
3559
3560         *ret = pid;
3561         return 0;
3562 }
3563
3564 void exec_context_init(ExecContext *c) {
3565         ExecDirectoryType i;
3566
3567         assert(c);
3568
3569         c->umask = 0022;
3570         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3571         c->cpu_sched_policy = SCHED_OTHER;
3572         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3573         c->syslog_level_prefix = true;
3574         c->ignore_sigpipe = true;
3575         c->timer_slack_nsec = NSEC_INFINITY;
3576         c->personality = PERSONALITY_INVALID;
3577         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3578                 c->directories[i].mode = 0755;
3579         c->capability_bounding_set = CAP_ALL;
3580         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3581         c->log_level_max = -1;
3582 }
3583
3584 void exec_context_done(ExecContext *c) {
3585         ExecDirectoryType i;
3586         size_t l;
3587
3588         assert(c);
3589
3590         c->environment = strv_free(c->environment);
3591         c->environment_files = strv_free(c->environment_files);
3592         c->pass_environment = strv_free(c->pass_environment);
3593         c->unset_environment = strv_free(c->unset_environment);
3594
3595         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3596                 c->rlimit[l] = mfree(c->rlimit[l]);
3597
3598         for (l = 0; l < 3; l++) {
3599                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3600                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3601         }
3602
3603         c->working_directory = mfree(c->working_directory);
3604         c->root_directory = mfree(c->root_directory);
3605         c->root_image = mfree(c->root_image);
3606         c->tty_path = mfree(c->tty_path);
3607         c->syslog_identifier = mfree(c->syslog_identifier);
3608         c->user = mfree(c->user);
3609         c->group = mfree(c->group);
3610
3611         c->supplementary_groups = strv_free(c->supplementary_groups);
3612
3613         c->pam_name = mfree(c->pam_name);
3614
3615         c->read_only_paths = strv_free(c->read_only_paths);
3616         c->read_write_paths = strv_free(c->read_write_paths);
3617         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3618
3619         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3620         c->bind_mounts = NULL;
3621         c->n_bind_mounts = 0;
3622         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3623         c->temporary_filesystems = NULL;
3624         c->n_temporary_filesystems = 0;
3625
3626         c->cpuset = cpu_set_mfree(c->cpuset);
3627
3628         c->utmp_id = mfree(c->utmp_id);
3629         c->selinux_context = mfree(c->selinux_context);
3630         c->apparmor_profile = mfree(c->apparmor_profile);
3631         c->smack_process_label = mfree(c->smack_process_label);
3632
3633         c->syscall_filter = hashmap_free(c->syscall_filter);
3634         c->syscall_archs = set_free(c->syscall_archs);
3635         c->address_families = set_free(c->address_families);
3636
3637         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3638                 c->directories[i].paths = strv_free(c->directories[i].paths);
3639
3640         c->log_level_max = -1;
3641
3642         exec_context_free_log_extra_fields(c);
3643
3644         c->stdin_data = mfree(c->stdin_data);
3645         c->stdin_data_size = 0;
3646 }
3647
3648 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3649         char **i;
3650
3651         assert(c);
3652
3653         if (!runtime_prefix)
3654                 return 0;
3655
3656         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3657                 _cleanup_free_ char *p;
3658
3659                 p = strjoin(runtime_prefix, "/", *i);
3660                 if (!p)
3661                         return -ENOMEM;
3662
3663                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3664                  * next. */
3665                 (void) rm_rf(p, REMOVE_ROOT);
3666         }
3667
3668         return 0;
3669 }
3670
3671 static void exec_command_done(ExecCommand *c) {
3672         assert(c);
3673
3674         c->path = mfree(c->path);
3675
3676         c->argv = strv_free(c->argv);
3677 }
3678
3679 void exec_command_done_array(ExecCommand *c, unsigned n) {
3680         unsigned i;
3681
3682         for (i = 0; i < n; i++)
3683                 exec_command_done(c+i);
3684 }
3685
3686 ExecCommand* exec_command_free_list(ExecCommand *c) {
3687         ExecCommand *i;
3688
3689         while ((i = c)) {
3690                 LIST_REMOVE(command, c, i);
3691                 exec_command_done(i);
3692                 free(i);
3693         }
3694
3695         return NULL;
3696 }
3697
3698 void exec_command_free_array(ExecCommand **c, unsigned n) {
3699         unsigned i;
3700
3701         for (i = 0; i < n; i++)
3702                 c[i] = exec_command_free_list(c[i]);
3703 }
3704
3705 typedef struct InvalidEnvInfo {
3706         const Unit *unit;
3707         const char *path;
3708 } InvalidEnvInfo;
3709
3710 static void invalid_env(const char *p, void *userdata) {
3711         InvalidEnvInfo *info = userdata;
3712
3713         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3714 }
3715
3716 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3717         assert(c);
3718
3719         switch (fd_index) {
3720
3721         case STDIN_FILENO:
3722                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3723                         return NULL;
3724
3725                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3726
3727         case STDOUT_FILENO:
3728                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3729                         return NULL;
3730
3731                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3732
3733         case STDERR_FILENO:
3734                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3735                         return NULL;
3736
3737                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3738
3739         default:
3740                 return NULL;
3741         }
3742 }
3743
3744 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3745         unsigned i, targets;
3746         const char* stdio_fdname[3];
3747         unsigned n_fds;
3748
3749         assert(c);
3750         assert(p);
3751
3752         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3753                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3754                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3755
3756         for (i = 0; i < 3; i++)
3757                 stdio_fdname[i] = exec_context_fdname(c, i);
3758
3759         n_fds = p->n_storage_fds + p->n_socket_fds;
3760
3761         for (i = 0; i < n_fds  && targets > 0; i++)
3762                 if (named_iofds[STDIN_FILENO] < 0 &&
3763                     c->std_input == EXEC_INPUT_NAMED_FD &&
3764                     stdio_fdname[STDIN_FILENO] &&
3765                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3766
3767                         named_iofds[STDIN_FILENO] = p->fds[i];
3768                         targets--;
3769
3770                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3771                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3772                            stdio_fdname[STDOUT_FILENO] &&
3773                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3774
3775                         named_iofds[STDOUT_FILENO] = p->fds[i];
3776                         targets--;
3777
3778                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3779                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3780                            stdio_fdname[STDERR_FILENO] &&
3781                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3782
3783                         named_iofds[STDERR_FILENO] = p->fds[i];
3784                         targets--;
3785                 }
3786
3787         return targets == 0 ? 0 : -ENOENT;
3788 }
3789
3790 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3791         char **i, **r = NULL;
3792
3793         assert(c);
3794         assert(l);
3795
3796         STRV_FOREACH(i, c->environment_files) {
3797                 char *fn;
3798                 int k;
3799                 unsigned n;
3800                 bool ignore = false;
3801                 char **p;
3802                 _cleanup_globfree_ glob_t pglob = {};
3803
3804                 fn = *i;
3805
3806                 if (fn[0] == '-') {
3807                         ignore = true;
3808                         fn++;
3809                 }
3810
3811                 if (!path_is_absolute(fn)) {
3812                         if (ignore)
3813                                 continue;
3814
3815                         strv_free(r);
3816                         return -EINVAL;
3817                 }
3818
3819                 /* Filename supports globbing, take all matching files */
3820                 k = safe_glob(fn, 0, &pglob);
3821                 if (k < 0) {
3822                         if (ignore)
3823                                 continue;
3824
3825                         strv_free(r);
3826                         return k;
3827                 }
3828
3829                 /* When we don't match anything, -ENOENT should be returned */
3830                 assert(pglob.gl_pathc > 0);
3831
3832                 for (n = 0; n < pglob.gl_pathc; n++) {
3833                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3834                         if (k < 0) {
3835                                 if (ignore)
3836                                         continue;
3837
3838                                 strv_free(r);
3839                                 return k;
3840                         }
3841                         /* Log invalid environment variables with filename */
3842                         if (p) {
3843                                 InvalidEnvInfo info = {
3844                                         .unit = unit,
3845                                         .path = pglob.gl_pathv[n]
3846                                 };
3847
3848                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3849                         }
3850
3851                         if (!r)
3852                                 r = p;
3853                         else {
3854                                 char **m;
3855
3856                                 m = strv_env_merge(2, r, p);
3857                                 strv_free(r);
3858                                 strv_free(p);
3859                                 if (!m)
3860                                         return -ENOMEM;
3861
3862                                 r = m;
3863                         }
3864                 }
3865         }
3866
3867         *l = r;
3868
3869         return 0;
3870 }
3871
3872 static bool tty_may_match_dev_console(const char *tty) {
3873         _cleanup_free_ char *resolved = NULL;
3874
3875         if (!tty)
3876                 return true;
3877
3878         tty = skip_dev_prefix(tty);
3879
3880         /* trivial identity? */
3881         if (streq(tty, "console"))
3882                 return true;
3883
3884         if (resolve_dev_console(&resolved) < 0)
3885                 return true; /* if we could not resolve, assume it may */
3886
3887         /* "tty0" means the active VC, so it may be the same sometimes */
3888         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3889 }
3890
3891 bool exec_context_may_touch_console(const ExecContext *ec) {
3892
3893         return (ec->tty_reset ||
3894                 ec->tty_vhangup ||
3895                 ec->tty_vt_disallocate ||
3896                 is_terminal_input(ec->std_input) ||
3897                 is_terminal_output(ec->std_output) ||
3898                 is_terminal_output(ec->std_error)) &&
3899                tty_may_match_dev_console(exec_context_tty_path(ec));
3900 }
3901
/* Writes every string of the NULL-terminated list l to f, each one preceded by a single space. Nothing is
 * written if l is NULL or empty. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        if (!l)
                return;

        for (item = l; *item; item++) {
                fprintf(f, " %s", *item);
        }
}
3910
3911 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3912         ExecDirectoryType dt;
3913         char **e, **d;
3914         unsigned i;
3915         int r;
3916
3917         assert(c);
3918         assert(f);
3919
3920         prefix = strempty(prefix);
3921
3922         fprintf(f,
3923                 "%sUMask: %04o\n"
3924                 "%sWorkingDirectory: %s\n"
3925                 "%sRootDirectory: %s\n"
3926                 "%sNonBlocking: %s\n"
3927                 "%sPrivateTmp: %s\n"
3928                 "%sPrivateDevices: %s\n"
3929                 "%sProtectKernelTunables: %s\n"
3930                 "%sProtectKernelModules: %s\n"
3931                 "%sProtectControlGroups: %s\n"
3932                 "%sPrivateNetwork: %s\n"
3933                 "%sPrivateUsers: %s\n"
3934                 "%sProtectHome: %s\n"
3935                 "%sProtectSystem: %s\n"
3936                 "%sMountAPIVFS: %s\n"
3937                 "%sIgnoreSIGPIPE: %s\n"
3938                 "%sMemoryDenyWriteExecute: %s\n"
3939                 "%sRestrictRealtime: %s\n"
3940                 "%sKeyringMode: %s\n",
3941                 prefix, c->umask,
3942                 prefix, c->working_directory ? c->working_directory : "/",
3943                 prefix, c->root_directory ? c->root_directory : "/",
3944                 prefix, yes_no(c->non_blocking),
3945                 prefix, yes_no(c->private_tmp),
3946                 prefix, yes_no(c->private_devices),
3947                 prefix, yes_no(c->protect_kernel_tunables),
3948                 prefix, yes_no(c->protect_kernel_modules),
3949                 prefix, yes_no(c->protect_control_groups),
3950                 prefix, yes_no(c->private_network),
3951                 prefix, yes_no(c->private_users),
3952                 prefix, protect_home_to_string(c->protect_home),
3953                 prefix, protect_system_to_string(c->protect_system),
3954                 prefix, yes_no(c->mount_apivfs),
3955                 prefix, yes_no(c->ignore_sigpipe),
3956                 prefix, yes_no(c->memory_deny_write_execute),
3957                 prefix, yes_no(c->restrict_realtime),
3958                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3959
3960         if (c->root_image)
3961                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3962
3963         STRV_FOREACH(e, c->environment)
3964                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3965
3966         STRV_FOREACH(e, c->environment_files)
3967                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3968
3969         STRV_FOREACH(e, c->pass_environment)
3970                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3971
3972         STRV_FOREACH(e, c->unset_environment)
3973                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3974
3975         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3976
3977         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3978                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3979
3980                 STRV_FOREACH(d, c->directories[dt].paths)
3981                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3982         }
3983
3984         if (c->nice_set)
3985                 fprintf(f,
3986                         "%sNice: %i\n",
3987                         prefix, c->nice);
3988
3989         if (c->oom_score_adjust_set)
3990                 fprintf(f,
3991                         "%sOOMScoreAdjust: %i\n",
3992                         prefix, c->oom_score_adjust);
3993
3994         for (i = 0; i < RLIM_NLIMITS; i++)
3995                 if (c->rlimit[i]) {
3996                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3997                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3998                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3999                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4000                 }
4001
4002         if (c->ioprio_set) {
4003                 _cleanup_free_ char *class_str = NULL;
4004
4005                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4006                 if (r >= 0)
4007                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4008
4009                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4010         }
4011
4012         if (c->cpu_sched_set) {
4013                 _cleanup_free_ char *policy_str = NULL;
4014
4015                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4016                 if (r >= 0)
4017                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4018
4019                 fprintf(f,
4020                         "%sCPUSchedulingPriority: %i\n"
4021                         "%sCPUSchedulingResetOnFork: %s\n",
4022                         prefix, c->cpu_sched_priority,
4023                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4024         }
4025
4026         if (c->cpuset) {
4027                 fprintf(f, "%sCPUAffinity:", prefix);
4028                 for (i = 0; i < c->cpuset_ncpus; i++)
4029                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4030                                 fprintf(f, " %u", i);
4031                 fputs("\n", f);
4032         }
4033
4034         if (c->timer_slack_nsec != NSEC_INFINITY)
4035                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4036
4037         fprintf(f,
4038                 "%sStandardInput: %s\n"
4039                 "%sStandardOutput: %s\n"
4040                 "%sStandardError: %s\n",
4041                 prefix, exec_input_to_string(c->std_input),
4042                 prefix, exec_output_to_string(c->std_output),
4043                 prefix, exec_output_to_string(c->std_error));
4044
4045         if (c->std_input == EXEC_INPUT_NAMED_FD)
4046                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4047         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4048                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4049         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4050                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4051
4052         if (c->std_input == EXEC_INPUT_FILE)
4053                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4054         if (c->std_output == EXEC_OUTPUT_FILE)
4055                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4056         if (c->std_error == EXEC_OUTPUT_FILE)
4057                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4058
4059         if (c->tty_path)
4060                 fprintf(f,
4061                         "%sTTYPath: %s\n"
4062                         "%sTTYReset: %s\n"
4063                         "%sTTYVHangup: %s\n"
4064                         "%sTTYVTDisallocate: %s\n",
4065                         prefix, c->tty_path,
4066                         prefix, yes_no(c->tty_reset),
4067                         prefix, yes_no(c->tty_vhangup),
4068                         prefix, yes_no(c->tty_vt_disallocate));
4069
4070         if (IN_SET(c->std_output,
4071                    EXEC_OUTPUT_SYSLOG,
4072                    EXEC_OUTPUT_KMSG,
4073                    EXEC_OUTPUT_JOURNAL,
4074                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4075                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4076                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4077             IN_SET(c->std_error,
4078                    EXEC_OUTPUT_SYSLOG,
4079                    EXEC_OUTPUT_KMSG,
4080                    EXEC_OUTPUT_JOURNAL,
4081                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4082                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4083                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4084
4085                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4086
4087                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4088                 if (r >= 0)
4089                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4090
4091                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4092                 if (r >= 0)
4093                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4094         }
4095
4096         if (c->log_level_max >= 0) {
4097                 _cleanup_free_ char *t = NULL;
4098
4099                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4100
4101                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4102         }
4103
4104         if (c->n_log_extra_fields > 0) {
4105                 size_t j;
4106
4107                 for (j = 0; j < c->n_log_extra_fields; j++) {
4108                         fprintf(f, "%sLogExtraFields: ", prefix);
4109                         fwrite(c->log_extra_fields[j].iov_base,
4110                                1, c->log_extra_fields[j].iov_len,
4111                                f);
4112                         fputc('\n', f);
4113                 }
4114         }
4115
4116         if (c->secure_bits) {
4117                 _cleanup_free_ char *str = NULL;
4118
4119                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4120                 if (r >= 0)
4121                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4122         }
4123
4124         if (c->capability_bounding_set != CAP_ALL) {
4125                 _cleanup_free_ char *str = NULL;
4126
4127                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4128                 if (r >= 0)
4129                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4130         }
4131
4132         if (c->capability_ambient_set != 0) {
4133                 _cleanup_free_ char *str = NULL;
4134
4135                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4136                 if (r >= 0)
4137                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4138         }
4139
4140         if (c->user)
4141                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4142         if (c->group)
4143                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4144
4145         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4146
4147         if (!strv_isempty(c->supplementary_groups)) {
4148                 fprintf(f, "%sSupplementaryGroups:", prefix);
4149                 strv_fprintf(f, c->supplementary_groups);
4150                 fputs("\n", f);
4151         }
4152
4153         if (c->pam_name)
4154                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4155
4156         if (!strv_isempty(c->read_write_paths)) {
4157                 fprintf(f, "%sReadWritePaths:", prefix);
4158                 strv_fprintf(f, c->read_write_paths);
4159                 fputs("\n", f);
4160         }
4161
4162         if (!strv_isempty(c->read_only_paths)) {
4163                 fprintf(f, "%sReadOnlyPaths:", prefix);
4164                 strv_fprintf(f, c->read_only_paths);
4165                 fputs("\n", f);
4166         }
4167
4168         if (!strv_isempty(c->inaccessible_paths)) {
4169                 fprintf(f, "%sInaccessiblePaths:", prefix);
4170                 strv_fprintf(f, c->inaccessible_paths);
4171                 fputs("\n", f);
4172         }
4173
4174         if (c->n_bind_mounts > 0)
4175                 for (i = 0; i < c->n_bind_mounts; i++)
4176                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4177                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4178                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4179                                 c->bind_mounts[i].source,
4180                                 c->bind_mounts[i].destination,
4181                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4182
4183         if (c->n_temporary_filesystems > 0)
4184                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4185                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4186
4187                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4188                                 t->path,
4189                                 isempty(t->options) ? "" : ":",
4190                                 strempty(t->options));
4191                 }
4192
4193         if (c->utmp_id)
4194                 fprintf(f,
4195                         "%sUtmpIdentifier: %s\n",
4196                         prefix, c->utmp_id);
4197
4198         if (c->selinux_context)
4199                 fprintf(f,
4200                         "%sSELinuxContext: %s%s\n",
4201                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4202
4203         if (c->apparmor_profile)
4204                 fprintf(f,
4205                         "%sAppArmorProfile: %s%s\n",
4206                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4207
4208         if (c->smack_process_label)
4209                 fprintf(f,
4210                         "%sSmackProcessLabel: %s%s\n",
4211                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4212
4213         if (c->personality != PERSONALITY_INVALID)
4214                 fprintf(f,
4215                         "%sPersonality: %s\n",
4216                         prefix, strna(personality_to_string(c->personality)));
4217
4218         fprintf(f,
4219                 "%sLockPersonality: %s\n",
4220                 prefix, yes_no(c->lock_personality));
4221
4222         if (c->syscall_filter) {
4223 #if HAVE_SECCOMP
4224                 Iterator j;
4225                 void *id, *val;
4226                 bool first = true;
4227 #endif
4228
4229                 fprintf(f,
4230                         "%sSystemCallFilter: ",
4231                         prefix);
4232
4233                 if (!c->syscall_whitelist)
4234                         fputc('~', f);
4235
4236 #if HAVE_SECCOMP
4237                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4238                         _cleanup_free_ char *name = NULL;
4239                         const char *errno_name = NULL;
4240                         int num = PTR_TO_INT(val);
4241
4242                         if (first)
4243                                 first = false;
4244                         else
4245                                 fputc(' ', f);
4246
4247                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4248                         fputs(strna(name), f);
4249
4250                         if (num >= 0) {
4251                                 errno_name = errno_to_name(num);
4252                                 if (errno_name)
4253                                         fprintf(f, ":%s", errno_name);
4254                                 else
4255                                         fprintf(f, ":%d", num);
4256                         }
4257                 }
4258 #endif
4259
4260                 fputc('\n', f);
4261         }
4262
4263         if (c->syscall_archs) {
4264 #if HAVE_SECCOMP
4265                 Iterator j;
4266                 void *id;
4267 #endif
4268
4269                 fprintf(f,
4270                         "%sSystemCallArchitectures:",
4271                         prefix);
4272
4273 #if HAVE_SECCOMP
4274                 SET_FOREACH(id, c->syscall_archs, j)
4275                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4276 #endif
4277                 fputc('\n', f);
4278         }
4279
4280         if (exec_context_restrict_namespaces_set(c)) {
4281                 _cleanup_free_ char *s = NULL;
4282
4283                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4284                 if (r >= 0)
4285                         fprintf(f, "%sRestrictNamespaces: %s\n",
4286                                 prefix, s);
4287         }
4288
4289         if (c->syscall_errno > 0) {
4290                 const char *errno_name;
4291
4292                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4293
4294                 errno_name = errno_to_name(c->syscall_errno);
4295                 if (errno_name)
4296                         fprintf(f, "%s\n", errno_name);
4297                 else
4298                         fprintf(f, "%d\n", c->syscall_errno);
4299         }
4300
4301         if (c->apparmor_profile)
4302                 fprintf(f,
4303                         "%sAppArmorProfile: %s%s\n",
4304                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4305 }
4306
4307 bool exec_context_maintains_privileges(const ExecContext *c) {
4308         assert(c);
4309
4310         /* Returns true if the process forked off would run under
4311          * an unchanged UID or as root. */
4312
4313         if (!c->user)
4314                 return true;
4315
4316         if (streq(c->user, "root") || streq(c->user, "0"))
4317                 return true;
4318
4319         return false;
4320 }
4321
4322 int exec_context_get_effective_ioprio(const ExecContext *c) {
4323         int p;
4324
4325         assert(c);
4326
4327         if (c->ioprio_set)
4328                 return c->ioprio;
4329
4330         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4331         if (p < 0)
4332                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4333
4334         return p;
4335 }
4336
4337 void exec_context_free_log_extra_fields(ExecContext *c) {
4338         size_t l;
4339
4340         assert(c);
4341
4342         for (l = 0; l < c->n_log_extra_fields; l++)
4343                 free(c->log_extra_fields[l].iov_base);
4344         c->log_extra_fields = mfree(c->log_extra_fields);
4345         c->n_log_extra_fields = 0;
4346 }
4347
4348 void exec_status_start(ExecStatus *s, pid_t pid) {
4349         assert(s);
4350
4351         zero(*s);
4352         s->pid = pid;
4353         dual_timestamp_get(&s->start_timestamp);
4354 }
4355
4356 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4357         assert(s);
4358
4359         if (s->pid && s->pid != pid)
4360                 zero(*s);
4361
4362         s->pid = pid;
4363         dual_timestamp_get(&s->exit_timestamp);
4364
4365         s->code = code;
4366         s->status = status;
4367
4368         if (context) {
4369                 if (context->utmp_id)
4370                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4371
4372                 exec_context_tty_reset(context, NULL);
4373         }
4374 }
4375
4376 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4377         char buf[FORMAT_TIMESTAMP_MAX];
4378
4379         assert(s);
4380         assert(f);
4381
4382         if (s->pid <= 0)
4383                 return;
4384
4385         prefix = strempty(prefix);
4386
4387         fprintf(f,
4388                 "%sPID: "PID_FMT"\n",
4389                 prefix, s->pid);
4390
4391         if (dual_timestamp_is_set(&s->start_timestamp))
4392                 fprintf(f,
4393                         "%sStart Timestamp: %s\n",
4394                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4395
4396         if (dual_timestamp_is_set(&s->exit_timestamp))
4397                 fprintf(f,
4398                         "%sExit Timestamp: %s\n"
4399                         "%sExit Code: %s\n"
4400                         "%sExit Status: %i\n",
4401                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4402                         prefix, sigchld_code_to_string(s->code),
4403                         prefix, s->status);
4404 }
4405
4406 static char *exec_command_line(char **argv) {
4407         size_t k;
4408         char *n, *p, **a;
4409         bool first = true;
4410
4411         assert(argv);
4412
4413         k = 1;
4414         STRV_FOREACH(a, argv)
4415                 k += strlen(*a)+3;
4416
4417         n = new(char, k);
4418         if (!n)
4419                 return NULL;
4420
4421         p = n;
4422         STRV_FOREACH(a, argv) {
4423
4424                 if (!first)
4425                         *(p++) = ' ';
4426                 else
4427                         first = false;
4428
4429                 if (strpbrk(*a, WHITESPACE)) {
4430                         *(p++) = '\'';
4431                         p = stpcpy(p, *a);
4432                         *(p++) = '\'';
4433                 } else
4434                         p = stpcpy(p, *a);
4435
4436         }
4437
4438         *p = 0;
4439
4440         /* FIXME: this doesn't really handle arguments that have
4441          * spaces and ticks in them */
4442
4443         return n;
4444 }
4445
4446 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4447         _cleanup_free_ char *cmd = NULL;
4448         const char *prefix2;
4449
4450         assert(c);
4451         assert(f);
4452
4453         prefix = strempty(prefix);
4454         prefix2 = strjoina(prefix, "\t");
4455
4456         cmd = exec_command_line(c->argv);
4457         fprintf(f,
4458                 "%sCommand Line: %s\n",
4459                 prefix, cmd ? cmd : strerror(ENOMEM));
4460
4461         exec_status_dump(&c->exec_status, f, prefix2);
4462 }
4463
4464 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4465         assert(f);
4466
4467         prefix = strempty(prefix);
4468
4469         LIST_FOREACH(command, c, c)
4470                 exec_command_dump(c, f, prefix);
4471 }
4472
4473 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4474         ExecCommand *end;
4475
4476         assert(l);
4477         assert(e);
4478
4479         if (*l) {
4480                 /* It's kind of important, that we keep the order here */
4481                 LIST_FIND_TAIL(command, *l, end);
4482                 LIST_INSERT_AFTER(command, *l, end, e);
4483         } else
4484               *l = e;
4485 }
4486
4487 int exec_command_set(ExecCommand *c, const char *path, ...) {
4488         va_list ap;
4489         char **l, *p;
4490
4491         assert(c);
4492         assert(path);
4493
4494         va_start(ap, path);
4495         l = strv_new_ap(path, ap);
4496         va_end(ap);
4497
4498         if (!l)
4499                 return -ENOMEM;
4500
4501         p = strdup(path);
4502         if (!p) {
4503                 strv_free(l);
4504                 return -ENOMEM;
4505         }
4506
4507         free(c->path);
4508         c->path = p;
4509
4510         strv_free(c->argv);
4511         c->argv = l;
4512
4513         return 0;
4514 }
4515
4516 int exec_command_append(ExecCommand *c, const char *path, ...) {
4517         _cleanup_strv_free_ char **l = NULL;
4518         va_list ap;
4519         int r;
4520
4521         assert(c);
4522         assert(path);
4523
4524         va_start(ap, path);
4525         l = strv_new_ap(path, ap);
4526         va_end(ap);
4527
4528         if (!l)
4529                 return -ENOMEM;
4530
4531         r = strv_extend_strv(&c->argv, l, false);
4532         if (r < 0)
4533                 return r;
4534
4535         return 0;
4536 }
4537
4538 static void *remove_tmpdir_thread(void *p) {
4539         _cleanup_free_ char *path = p;
4540
4541         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4542         return NULL;
4543 }
4544
4545 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4546         int r;
4547
4548         if (!rt)
4549                 return NULL;
4550
4551         if (rt->manager)
4552                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4553
4554         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4555         if (destroy && rt->tmp_dir) {
4556                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4557
4558                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4559                 if (r < 0) {
4560                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4561                         free(rt->tmp_dir);
4562                 }
4563
4564                 rt->tmp_dir = NULL;
4565         }
4566
4567         if (destroy && rt->var_tmp_dir) {
4568                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4569
4570                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4571                 if (r < 0) {
4572                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4573                         free(rt->var_tmp_dir);
4574                 }
4575
4576                 rt->var_tmp_dir = NULL;
4577         }
4578
4579         rt->id = mfree(rt->id);
4580         rt->tmp_dir = mfree(rt->tmp_dir);
4581         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4582         safe_close_pair(rt->netns_storage_socket);
4583         return mfree(rt);
4584 }
4585
4586 static void exec_runtime_freep(ExecRuntime **rt) {
4587         if (*rt)
4588                 (void) exec_runtime_free(*rt, false);
4589 }
4590
4591 static int exec_runtime_allocate(ExecRuntime **rt) {
4592         assert(rt);
4593
4594         *rt = new0(ExecRuntime, 1);
4595         if (!*rt)
4596                 return -ENOMEM;
4597
4598         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4599         return 0;
4600 }
4601
4602 static int exec_runtime_add(
4603                 Manager *m,
4604                 const char *id,
4605                 const char *tmp_dir,
4606                 const char *var_tmp_dir,
4607                 const int netns_storage_socket[2],
4608                 ExecRuntime **ret) {
4609
4610         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4611         int r;
4612
4613         assert(m);
4614         assert(id);
4615
4616         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4617         if (r < 0)
4618                 return r;
4619
4620         r = exec_runtime_allocate(&rt);
4621         if (r < 0)
4622                 return r;
4623
4624         rt->id = strdup(id);
4625         if (!rt->id)
4626                 return -ENOMEM;
4627
4628         if (tmp_dir) {
4629                 rt->tmp_dir = strdup(tmp_dir);
4630                 if (!rt->tmp_dir)
4631                         return -ENOMEM;
4632
4633                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4634                 assert(var_tmp_dir);
4635                 rt->var_tmp_dir = strdup(var_tmp_dir);
4636                 if (!rt->var_tmp_dir)
4637                         return -ENOMEM;
4638         }
4639
4640         if (netns_storage_socket) {
4641                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4642                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4643         }
4644
4645         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4646         if (r < 0)
4647                 return r;
4648
4649         rt->manager = m;
4650
4651         if (ret)
4652                 *ret = rt;
4653
4654         /* do not remove created ExecRuntime object when the operation succeeds. */
4655         rt = NULL;
4656         return 0;
4657 }
4658
4659 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4660         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4661         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4662         int r;
4663
4664         assert(m);
4665         assert(c);
4666         assert(id);
4667
4668         /* It is not necessary to create ExecRuntime object. */
4669         if (!c->private_network && !c->private_tmp)
4670                 return 0;
4671
4672         if (c->private_tmp) {
4673                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4674                 if (r < 0)
4675                         return r;
4676         }
4677
4678         if (c->private_network) {
4679                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4680                         return -errno;
4681         }
4682
4683         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4684         if (r < 0)
4685                 return r;
4686
4687         /* Avoid cleanup */
4688         netns_storage_socket[0] = -1;
4689         netns_storage_socket[1] = -1;
4690         return 1;
4691 }
4692
4693 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4694         ExecRuntime *rt;
4695         int r;
4696
4697         assert(m);
4698         assert(id);
4699         assert(ret);
4700
4701         rt = hashmap_get(m->exec_runtime_by_id, id);
4702         if (rt)
4703                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4704                 goto ref;
4705
4706         if (!create)
4707                 return 0;
4708
4709         /* If not found, then create a new object. */
4710         r = exec_runtime_make(m, c, id, &rt);
4711         if (r <= 0)
4712                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4713                 return r;
4714
4715 ref:
4716         /* increment reference counter. */
4717         rt->n_ref++;
4718         *ret = rt;
4719         return 1;
4720 }
4721
4722 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4723         if (!rt)
4724                 return NULL;
4725
4726         assert(rt->n_ref > 0);
4727
4728         rt->n_ref--;
4729         if (rt->n_ref > 0)
4730                 return NULL;
4731
4732         return exec_runtime_free(rt, destroy);
4733 }
4734
4735 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4736         ExecRuntime *rt;
4737         Iterator i;
4738
4739         assert(m);
4740         assert(f);
4741         assert(fds);
4742
4743         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4744                 fprintf(f, "exec-runtime=%s", rt->id);
4745
4746                 if (rt->tmp_dir)
4747                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4748
4749                 if (rt->var_tmp_dir)
4750                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4751
4752                 if (rt->netns_storage_socket[0] >= 0) {
4753                         int copy;
4754
4755                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4756                         if (copy < 0)
4757                                 return copy;
4758
4759                         fprintf(f, " netns-socket-0=%i", copy);
4760                 }
4761
4762                 if (rt->netns_storage_socket[1] >= 0) {
4763                         int copy;
4764
4765                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4766                         if (copy < 0)
4767                                 return copy;
4768
4769                         fprintf(f, " netns-socket-1=%i", copy);
4770                 }
4771
4772                 fputc('\n', f);
4773         }
4774
4775         return 0;
4776 }
4777
4778 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4779         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4780         ExecRuntime *rt;
4781         int r;
4782
4783         /* This is for the migration from old (v237 or earlier) deserialization text.
4784          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4785          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4786          * so or not from the serialized text, then we always creates a new object owned by this. */
4787
4788         assert(u);
4789         assert(key);
4790         assert(value);
4791
4792         /* Manager manages ExecRuntime objects by the unit id.
4793          * So, we omit the serialized text when the unit does not have id (yet?)... */
4794         if (isempty(u->id)) {
4795                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4796                 return 0;
4797         }
4798
4799         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4800         if (r < 0) {
4801                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4802                 return 0;
4803         }
4804
4805         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4806         if (!rt) {
4807                 r = exec_runtime_allocate(&rt_create);
4808                 if (r < 0)
4809                         return log_oom();
4810
4811                 rt_create->id = strdup(u->id);
4812                 if (!rt_create->id)
4813                         return log_oom();
4814
4815                 rt = rt_create;
4816         }
4817
4818         if (streq(key, "tmp-dir")) {
4819                 char *copy;
4820
4821                 copy = strdup(value);
4822                 if (!copy)
4823                         return log_oom();
4824
4825                 free_and_replace(rt->tmp_dir, copy);
4826
4827         } else if (streq(key, "var-tmp-dir")) {
4828                 char *copy;
4829
4830                 copy = strdup(value);
4831                 if (!copy)
4832                         return log_oom();
4833
4834                 free_and_replace(rt->var_tmp_dir, copy);
4835
4836         } else if (streq(key, "netns-socket-0")) {
4837                 int fd;
4838
4839                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4840                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4841                         return 0;
4842                 }
4843
4844                 safe_close(rt->netns_storage_socket[0]);
4845                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4846
4847         } else if (streq(key, "netns-socket-1")) {
4848                 int fd;
4849
4850                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4851                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4852                         return 0;
4853                 }
4854
4855                 safe_close(rt->netns_storage_socket[1]);
4856                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4857         } else
4858                 return 0;
4859
4860
4861         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4862         if (rt_create) {
4863                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4864                 if (r < 0) {
4865                         log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4866                         return 0;
4867                 }
4868
4869                 rt_create->manager = u->manager;
4870
4871                 /* Avoid cleanup */
4872                 rt_create = NULL;
4873         }
4874
4875         return 1;
4876 }
4877
4878 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4879         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4880         int r, fd0 = -1, fd1 = -1;
4881         const char *p, *v = value;
4882         size_t n;
4883
4884         assert(m);
4885         assert(value);
4886         assert(fds);
4887
4888         n = strcspn(v, " ");
4889         id = strndupa(v, n);
4890         if (v[n] != ' ')
4891                 goto finalize;
4892         p = v + n + 1;
4893
4894         v = startswith(p, "tmp-dir=");
4895         if (v) {
4896                 n = strcspn(v, " ");
4897                 tmp_dir = strndupa(v, n);
4898                 if (v[n] != ' ')
4899                         goto finalize;
4900                 p = v + n + 1;
4901         }
4902
4903         v = startswith(p, "var-tmp-dir=");
4904         if (v) {
4905                 n = strcspn(v, " ");
4906                 var_tmp_dir = strndupa(v, n);
4907                 if (v[n] != ' ')
4908                         goto finalize;
4909                 p = v + n + 1;
4910         }
4911
4912         v = startswith(p, "netns-socket-0=");
4913         if (v) {
4914                 char *buf;
4915
4916                 n = strcspn(v, " ");
4917                 buf = strndupa(v, n);
4918                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4919                         log_debug("Unable to process exec-runtime netns fd specification.");
4920                         return;
4921                 }
4922                 fd0 = fdset_remove(fds, fd0);
4923                 if (v[n] != ' ')
4924                         goto finalize;
4925                 p = v + n + 1;
4926         }
4927
4928         v = startswith(p, "netns-socket-1=");
4929         if (v) {
4930                 char *buf;
4931
4932                 n = strcspn(v, " ");
4933                 buf = strndupa(v, n);
4934                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4935                         log_debug("Unable to process exec-runtime netns fd specification.");
4936                         return;
4937                 }
4938                 fd1 = fdset_remove(fds, fd1);
4939         }
4940
4941 finalize:
4942
4943         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4944         if (r < 0) {
4945                 log_debug_errno(r, "Failed to add exec-runtime: %m");
4946                 return;
4947         }
4948 }
4949
4950 void exec_runtime_vacuum(Manager *m) {
4951         ExecRuntime *rt;
4952         Iterator i;
4953
4954         assert(m);
4955
4956         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4957
4958         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4959                 if (rt->n_ref > 0)
4960                         continue;
4961
4962                 (void) exec_runtime_free(rt, false);
4963         }
4964 }
4965
4966 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4967         [EXEC_INPUT_NULL] = "null",
4968         [EXEC_INPUT_TTY] = "tty",
4969         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4970         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4971         [EXEC_INPUT_SOCKET] = "socket",
4972         [EXEC_INPUT_NAMED_FD] = "fd",
4973         [EXEC_INPUT_DATA] = "data",
4974         [EXEC_INPUT_FILE] = "file",
4975 };
4976
4977 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4978
4979 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4980         [EXEC_OUTPUT_INHERIT] = "inherit",
4981         [EXEC_OUTPUT_NULL] = "null",
4982         [EXEC_OUTPUT_TTY] = "tty",
4983         [EXEC_OUTPUT_SYSLOG] = "syslog",
4984         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4985         [EXEC_OUTPUT_KMSG] = "kmsg",
4986         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4987         [EXEC_OUTPUT_JOURNAL] = "journal",
4988         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4989         [EXEC_OUTPUT_SOCKET] = "socket",
4990         [EXEC_OUTPUT_NAMED_FD] = "fd",
4991         [EXEC_OUTPUT_FILE] = "file",
4992 };
4993
4994 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4995
4996 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4997         [EXEC_UTMP_INIT] = "init",
4998         [EXEC_UTMP_LOGIN] = "login",
4999         [EXEC_UTMP_USER] = "user",
5000 };
5001
5002 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5003
5004 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5005         [EXEC_PRESERVE_NO] = "no",
5006         [EXEC_PRESERVE_YES] = "yes",
5007         [EXEC_PRESERVE_RESTART] = "restart",
5008 };
5009
5010 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5011
5012 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5013         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5014         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5015         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5016         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5017         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5018 };
5019
5020 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5021
5022 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5023         [EXEC_KEYRING_INHERIT] = "inherit",
5024         [EXEC_KEYRING_PRIVATE] = "private",
5025         [EXEC_KEYRING_SHARED] = "shared",
5026 };
5027
5028 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);