src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <poll.h>
   6 #include <sys/eventfd.h>
   7 #include <sys/ioctl.h>
   8 #include <sys/mman.h>
   9 #include <sys/personality.h>
  10 #include <sys/prctl.h>
  11 #include <sys/shm.h>
  12 #include <sys/types.h>
  13 #include <sys/un.h>
  14 #include <unistd.h>
  15 #include <utmpx.h>
  16
  17 #if HAVE_PAM
  18 #include <security/pam_appl.h>
  19 #endif
  20
  21 #if HAVE_SELINUX
  22 #include <selinux/selinux.h>
  23 #endif
  24
  25 #if HAVE_SECCOMP
  26 #include <seccomp.h>
  27 #endif
  28
  29 #if HAVE_APPARMOR
  30 #include <sys/apparmor.h>
  31 #endif
  32
  33 #include "sd-messages.h"
  34
  35 #include "af-list.h"
  36 #include "alloc-util.h"
  37 #if HAVE_APPARMOR
  38 #include "apparmor-util.h"
  39 #endif
  40 #include "async.h"
  41 #include "barrier.h"
  42 #include "cap-list.h"
  43 #include "capability-util.h"
  44 #include "chown-recursive.h"
  45 #include "cgroup-setup.h"
  46 #include "cpu-set-util.h"
  47 #include "def.h"
  48 #include "env-file.h"
  49 #include "env-util.h"
  50 #include "errno-list.h"
  51 #include "execute.h"
  52 #include "exit-status.h"
  53 #include "fd-util.h"
  54 #include "format-util.h"
  55 #include "fs-util.h"
  56 #include "glob-util.h"
  57 #include "io-util.h"
  58 #include "ioprio.h"
  59 #include "label.h"
  60 #include "log.h"
  61 #include "macro.h"
  62 #include "manager.h"
  63 #include "memory-util.h"
  64 #include "missing_fs.h"
  65 #include "mkdir.h"
  66 #include "namespace.h"
  67 #include "parse-util.h"
  68 #include "path-util.h"
  69 #include "process-util.h"
  70 #include "rlimit-util.h"
  71 #include "rm-rf.h"
  72 #if HAVE_SECCOMP
  73 #include "seccomp-util.h"
  74 #endif
  75 #include "securebits-util.h"
  76 #include "selinux-util.h"
  77 #include "signal-util.h"
  78 #include "smack-util.h"
  79 #include "socket-util.h"
  80 #include "special.h"
  81 #include "stat-util.h"
  82 #include "string-table.h"
  83 #include "string-util.h"
  84 #include "strv.h"
  85 #include "syslog-util.h"
  86 #include "terminal-util.h"
  87 #include "umask-util.h"
  88 #include "unit.h"
  89 #include "user-util.h"
  90 #include "utmp-wtmp.h"
  91
  92 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  93 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
  94
  95 #define SNDBUF_SIZE (8*1024*1024)
  96
  97 static int shift_fds(int fds[], size_t n_fds) {
  98         int start, restart_from;
  99
 100         if (n_fds <= 0)
 101                 return 0;
 102
 103         /* Modifies the fds array! (sorts it) */
 104
 105         assert(fds);
 106
 107         start = 0;
 108         for (;;) {
 109                 int i;
 110
 111                 restart_from = -1;
 112
 113                 for (i = start; i < (int) n_fds; i++) {
 114                         int nfd;
 115
 116                         /* Already at right index? */
 117                         if (fds[i] == i+3)
 118                                 continue;
 119
 120                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 121                         if (nfd < 0)
 122                                 return -errno;
 123
 124                         safe_close(fds[i]);
 125                         fds[i] = nfd;
 126
 127                         /* Hmm, the fd we wanted isn't free? Then
 128                          * let's remember that and try again from here */
 129                         if (nfd != i+3 && restart_from < 0)
 130                                 restart_from = i;
 131                 }
 132
 133                 if (restart_from < 0)
 134                         break;
 135
 136                 start = restart_from;
 137         }
 138
 139         return 0;
 140 }
 141
 142 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 143         size_t i, n_fds;
 144         int r;
 145
 146         n_fds = n_socket_fds + n_storage_fds;
 147         if (n_fds <= 0)
 148                 return 0;
 149
 150         assert(fds);
 151
 152         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 153          * O_NONBLOCK only applies to socket activation though. */
 154
 155         for (i = 0; i < n_fds; i++) {
 156
 157                 if (i < n_socket_fds) {
 158                         r = fd_nonblock(fds[i], nonblock);
 159                         if (r < 0)
 160                                 return r;
 161                 }
 162
 163                 /* We unconditionally drop FD_CLOEXEC from the fds,
 164                  * since after all we want to pass these fds to our
 165                  * children */
 166
 167                 r = fd_cloexec(fds[i], false);
 168                 if (r < 0)
 169                         return r;
 170         }
 171
 172         return 0;
 173 }
 174
 175 static const char *exec_context_tty_path(const ExecContext *context) {
 176         assert(context);
 177
 178         if (context->stdio_as_fds)
 179                 return NULL;
 180
 181         if (context->tty_path)
 182                 return context->tty_path;
 183
 184         return "/dev/console";
 185 }
 186
 187 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 188         const char *path;
 189
 190         assert(context);
 191
 192         path = exec_context_tty_path(context);
 193
 194         if (context->tty_vhangup) {
 195                 if (p && p->stdin_fd >= 0)
 196                         (void) terminal_vhangup_fd(p->stdin_fd);
 197                 else if (path)
 198                         (void) terminal_vhangup(path);
 199         }
 200
 201         if (context->tty_reset) {
 202                 if (p && p->stdin_fd >= 0)
 203                         (void) reset_terminal_fd(p->stdin_fd, true);
 204                 else if (path)
 205                         (void) reset_terminal(path);
 206         }
 207
 208         if (context->tty_vt_disallocate && path)
 209                 (void) vt_disallocate(path);
 210 }
 211
 212 static bool is_terminal_input(ExecInput i) {
 213         return IN_SET(i,
 214                       EXEC_INPUT_TTY,
 215                       EXEC_INPUT_TTY_FORCE,
 216                       EXEC_INPUT_TTY_FAIL);
 217 }
 218
 219 static bool is_terminal_output(ExecOutput o) {
 220         return IN_SET(o,
 221                       EXEC_OUTPUT_TTY,
 222                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 223                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 224                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 225 }
 226
 227 static bool is_syslog_output(ExecOutput o) {
 228         return IN_SET(o,
 229                       EXEC_OUTPUT_SYSLOG,
 230                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 231 }
 232
 233 static bool is_kmsg_output(ExecOutput o) {
 234         return IN_SET(o,
 235                       EXEC_OUTPUT_KMSG,
 236                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 237 }
 238
 239 static bool exec_context_needs_term(const ExecContext *c) {
 240         assert(c);
 241
 242         /* Return true if the execution context suggests we should set $TERM to something useful. */
 243
 244         if (is_terminal_input(c->std_input))
 245                 return true;
 246
 247         if (is_terminal_output(c->std_output))
 248                 return true;
 249
 250         if (is_terminal_output(c->std_error))
 251                 return true;
 252
 253         return !!c->tty_path;
 254 }
 255
 256 static int open_null_as(int flags, int nfd) {
 257         int fd;
 258
 259         assert(nfd >= 0);
 260
 261         fd = open("/dev/null", flags|O_NOCTTY);
 262         if (fd < 0)
 263                 return -errno;
 264
 265         return move_fd(fd, nfd, false);
 266 }
 267
 268 static int connect_journal_socket(
 269                 int fd,
 270                 const char *log_namespace,
 271                 uid_t uid,
 272                 gid_t gid) {
 273
 274         union sockaddr_union sa;
 275         socklen_t sa_len;
 276         uid_t olduid = UID_INVALID;
 277         gid_t oldgid = GID_INVALID;
 278         const char *j;
 279         int r;
 280
 281         j = log_namespace ?
 282                 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
 283                 "/run/systemd/journal/stdout";
 284         r = sockaddr_un_set_path(&sa.un, j);
 285         if (r < 0)
 286                 return r;
 287         sa_len = r;
 288
 289         if (gid_is_valid(gid)) {
 290                 oldgid = getgid();
 291
 292                 if (setegid(gid) < 0)
 293                         return -errno;
 294         }
 295
 296         if (uid_is_valid(uid)) {
 297                 olduid = getuid();
 298
 299                 if (seteuid(uid) < 0) {
 300                         r = -errno;
 301                         goto restore_gid;
 302                 }
 303         }
 304
 305         r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
 306
 307         /* If we fail to restore the uid or gid, things will likely
 308            fail later on. This should only happen if an LSM interferes. */
 309
 310         if (uid_is_valid(uid))
 311                 (void) seteuid(olduid);
 312
 313  restore_gid:
 314         if (gid_is_valid(gid))
 315                 (void) setegid(oldgid);
 316
 317         return r;
 318 }
 319
 320 static int connect_logger_as(
 321                 const Unit *unit,
 322                 const ExecContext *context,
 323                 const ExecParameters *params,
 324                 ExecOutput output,
 325                 const char *ident,
 326                 int nfd,
 327                 uid_t uid,
 328                 gid_t gid) {
 329
 330         _cleanup_close_ int fd = -1;
 331         int r;
 332
 333         assert(context);
 334         assert(params);
 335         assert(output < _EXEC_OUTPUT_MAX);
 336         assert(ident);
 337         assert(nfd >= 0);
 338
 339         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         r = connect_journal_socket(fd, context->log_namespace, uid, gid);
 344         if (r < 0)
 345                 return r;
 346
 347         if (shutdown(fd, SHUT_RD) < 0)
 348                 return -errno;
 349
 350         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 351
 352         if (dprintf(fd,
 353                 "%s\n"
 354                 "%s\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n",
 360                 context->syslog_identifier ?: ident,
 361                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 362                 context->syslog_priority,
 363                 !!context->syslog_level_prefix,
 364                 is_syslog_output(output),
 365                 is_kmsg_output(output),
 366                 is_terminal_output(output)) < 0)
 367                 return -errno;
 368
 369         return move_fd(TAKE_FD(fd), nfd, false);
 370 }
 371
 372 static int open_terminal_as(const char *path, int flags, int nfd) {
 373         int fd;
 374
 375         assert(path);
 376         assert(nfd >= 0);
 377
 378         fd = open_terminal(path, flags | O_NOCTTY);
 379         if (fd < 0)
 380                 return fd;
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384
 385 static int acquire_path(const char *path, int flags, mode_t mode) {
 386         union sockaddr_union sa;
 387         socklen_t sa_len;
 388         _cleanup_close_ int fd = -1;
 389         int r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return TAKE_FD(fd);
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402
 403         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 404
 405         r = sockaddr_un_set_path(&sa.un, path);
 406         if (r < 0)
 407                 return r == -EINVAL ? -ENXIO : r;
 408         sa_len = r;
 409
 410         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 411         if (fd < 0)
 412                 return -errno;
 413
 414         if (connect(fd, &sa.sa, sa_len) < 0)
 415                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 416                                                            * indication that his wasn't an AF_UNIX socket after all */
 417
 418         if ((flags & O_ACCMODE) == O_RDONLY)
 419                 r = shutdown(fd, SHUT_WR);
 420         else if ((flags & O_ACCMODE) == O_WRONLY)
 421                 r = shutdown(fd, SHUT_RD);
 422         else
 423                 r = 0;
 424         if (r < 0)
 425                 return -errno;
 426
 427         return TAKE_FD(fd);
 428 }
 429
 430 static int fixup_input(
 431                 const ExecContext *context,
 432                 int socket_fd,
 433                 bool apply_tty_stdin) {
 434
 435         ExecInput std_input;
 436
 437         assert(context);
 438
 439         std_input = context->std_input;
 440
 441         if (is_terminal_input(std_input) && !apply_tty_stdin)
 442                 return EXEC_INPUT_NULL;
 443
 444         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 445                 return EXEC_INPUT_NULL;
 446
 447         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 448                 return EXEC_INPUT_NULL;
 449
 450         return std_input;
 451 }
 452
 453 static int fixup_output(ExecOutput std_output, int socket_fd) {
 454
 455         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 456                 return EXEC_OUTPUT_INHERIT;
 457
 458         return std_output;
 459 }
 460
 461 static int setup_input(
 462                 const ExecContext *context,
 463                 const ExecParameters *params,
 464                 int socket_fd,
 465                 const int named_iofds[static 3]) {
 466
 467         ExecInput i;
 468
 469         assert(context);
 470         assert(params);
 471         assert(named_iofds);
 472
 473         if (params->stdin_fd >= 0) {
 474                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 475                         return -errno;
 476
 477                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 478                 if (isatty(STDIN_FILENO)) {
 479                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 480                         (void) reset_terminal_fd(STDIN_FILENO, true);
 481                 }
 482
 483                 return STDIN_FILENO;
 484         }
 485
 486         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 487
 488         switch (i) {
 489
 490         case EXEC_INPUT_NULL:
 491                 return open_null_as(O_RDONLY, STDIN_FILENO);
 492
 493         case EXEC_INPUT_TTY:
 494         case EXEC_INPUT_TTY_FORCE:
 495         case EXEC_INPUT_TTY_FAIL: {
 496                 int fd;
 497
 498                 fd = acquire_terminal(exec_context_tty_path(context),
 499                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 500                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 501                                                                   ACQUIRE_TERMINAL_WAIT,
 502                                       USEC_INFINITY);
 503                 if (fd < 0)
 504                         return fd;
 505
 506                 return move_fd(fd, STDIN_FILENO, false);
 507         }
 508
 509         case EXEC_INPUT_SOCKET:
 510                 assert(socket_fd >= 0);
 511
 512                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 513
 514         case EXEC_INPUT_NAMED_FD:
 515                 assert(named_iofds[STDIN_FILENO] >= 0);
 516
 517                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 518                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 519
 520         case EXEC_INPUT_DATA: {
 521                 int fd;
 522
 523                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 524                 if (fd < 0)
 525                         return fd;
 526
 527                 return move_fd(fd, STDIN_FILENO, false);
 528         }
 529
 530         case EXEC_INPUT_FILE: {
 531                 bool rw;
 532                 int fd;
 533
 534                 assert(context->stdio_file[STDIN_FILENO]);
 535
 536                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 537                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 538
 539                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 540                 if (fd < 0)
 541                         return fd;
 542
 543                 return move_fd(fd, STDIN_FILENO, false);
 544         }
 545
 546         default:
 547                 assert_not_reached("Unknown input type");
 548         }
 549 }
 550
 551 static bool can_inherit_stderr_from_stdout(
 552                 const ExecContext *context,
 553                 ExecOutput o,
 554                 ExecOutput e) {
 555
 556         assert(context);
 557
 558         /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
 559          * stderr fd */
 560
 561         if (e == EXEC_OUTPUT_INHERIT)
 562                 return true;
 563         if (e != o)
 564                 return false;
 565
 566         if (e == EXEC_OUTPUT_NAMED_FD)
 567                 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
 568
 569         if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
 570                 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
 571
 572         return true;
 573 }
 574
 575 static int setup_output(
 576                 const Unit *unit,
 577                 const ExecContext *context,
 578                 const ExecParameters *params,
 579                 int fileno,
 580                 int socket_fd,
 581                 const int named_iofds[static 3],
 582                 const char *ident,
 583                 uid_t uid,
 584                 gid_t gid,
 585                 dev_t *journal_stream_dev,
 586                 ino_t *journal_stream_ino) {
 587
 588         ExecOutput o;
 589         ExecInput i;
 590         int r;
 591
 592         assert(unit);
 593         assert(context);
 594         assert(params);
 595         assert(ident);
 596         assert(journal_stream_dev);
 597         assert(journal_stream_ino);
 598
 599         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 600
 601                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 602                         return -errno;
 603
 604                 return STDOUT_FILENO;
 605         }
 606
 607         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 608                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 609                         return -errno;
 610
 611                 return STDERR_FILENO;
 612         }
 613
 614         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 615         o = fixup_output(context->std_output, socket_fd);
 616
 617         if (fileno == STDERR_FILENO) {
 618                 ExecOutput e;
 619                 e = fixup_output(context->std_error, socket_fd);
 620
 621                 /* This expects the input and output are already set up */
 622
 623                 /* Don't change the stderr file descriptor if we inherit all
 624                  * the way and are not on a tty */
 625                 if (e == EXEC_OUTPUT_INHERIT &&
 626                     o == EXEC_OUTPUT_INHERIT &&
 627                     i == EXEC_INPUT_NULL &&
 628                     !is_terminal_input(context->std_input) &&
 629                     getppid () != 1)
 630                         return fileno;
 631
 632                 /* Duplicate from stdout if possible */
 633                 if (can_inherit_stderr_from_stdout(context, o, e))
 634                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 635
 636                 o = e;
 637
 638         } else if (o == EXEC_OUTPUT_INHERIT) {
 639                 /* If input got downgraded, inherit the original value */
 640                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 641                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 642
 643                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 644                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 645                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 646
 647                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 648                 if (getppid() != 1)
 649                         return fileno;
 650
 651                 /* We need to open /dev/null here anew, to get the right access mode. */
 652                 return open_null_as(O_WRONLY, fileno);
 653         }
 654
 655         switch (o) {
 656
 657         case EXEC_OUTPUT_NULL:
 658                 return open_null_as(O_WRONLY, fileno);
 659
 660         case EXEC_OUTPUT_TTY:
 661                 if (is_terminal_input(i))
 662                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 663
 664                 /* We don't reset the terminal if this is just about output */
 665                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 666
 667         case EXEC_OUTPUT_SYSLOG:
 668         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 669         case EXEC_OUTPUT_KMSG:
 670         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 671         case EXEC_OUTPUT_JOURNAL:
 672         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 673                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 674                 if (r < 0) {
 675                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 676                         r = open_null_as(O_WRONLY, fileno);
 677                 } else {
 678                         struct stat st;
 679
 680                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 681                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 682                          * services to detect whether they are connected to the journal or not.
 683                          *
 684                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 685                          * about STDERR as that's usually the best way to do logging. */
 686
 687                         if (fstat(fileno, &st) >= 0 &&
 688                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 689                                 *journal_stream_dev = st.st_dev;
 690                                 *journal_stream_ino = st.st_ino;
 691                         }
 692                 }
 693                 return r;
 694
 695         case EXEC_OUTPUT_SOCKET:
 696                 assert(socket_fd >= 0);
 697
 698                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 699
 700         case EXEC_OUTPUT_NAMED_FD:
 701                 assert(named_iofds[fileno] >= 0);
 702
 703                 (void) fd_nonblock(named_iofds[fileno], false);
 704                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 705
 706         case EXEC_OUTPUT_FILE:
 707         case EXEC_OUTPUT_FILE_APPEND: {
 708                 bool rw;
 709                 int fd, flags;
 710
 711                 assert(context->stdio_file[fileno]);
 712
 713                 rw = context->std_input == EXEC_INPUT_FILE &&
 714                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 715
 716                 if (rw)
 717                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 718
 719                 flags = O_WRONLY;
 720                 if (o == EXEC_OUTPUT_FILE_APPEND)
 721                         flags |= O_APPEND;
 722
 723                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 724                 if (fd < 0)
 725                         return fd;
 726
 727                 return move_fd(fd, fileno, 0);
 728         }
 729
 730         default:
 731                 assert_not_reached("Unknown error type");
 732         }
 733 }
 734
 735 static int chown_terminal(int fd, uid_t uid) {
 736         int r;
 737
 738         assert(fd >= 0);
 739
 740         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 741         if (isatty(fd) < 1) {
 742                 if (IN_SET(errno, EINVAL, ENOTTY))
 743                         return 0; /* not a tty */
 744
 745                 return -errno;
 746         }
 747
 748         /* This might fail. What matters are the results. */
 749         r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
 750         if (r < 0)
 751                 return r;
 752
 753         return 1;
 754 }
 755
 756 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 757         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 758         int r;
 759
 760         assert(_saved_stdin);
 761         assert(_saved_stdout);
 762
 763         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 764         if (saved_stdin < 0)
 765                 return -errno;
 766
 767         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 768         if (saved_stdout < 0)
 769                 return -errno;
 770
 771         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 772         if (fd < 0)
 773                 return fd;
 774
 775         r = chown_terminal(fd, getuid());
 776         if (r < 0)
 777                 return r;
 778
 779         r = reset_terminal_fd(fd, true);
 780         if (r < 0)
 781                 return r;
 782
 783         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 784         fd = -1;
 785         if (r < 0)
 786                 return r;
 787
 788         *_saved_stdin = saved_stdin;
 789         *_saved_stdout = saved_stdout;
 790
 791         saved_stdin = saved_stdout = -1;
 792
 793         return 0;
 794 }
 795
 796 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 797         assert(err < 0);
 798
 799         if (err == -ETIMEDOUT)
 800                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 801         else {
 802                 errno = -err;
 803                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 804         }
 805 }
 806
 807 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 808         _cleanup_close_ int fd = -1;
 809
 810         assert(vc);
 811
 812         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 813         if (fd < 0)
 814                 return;
 815
 816         write_confirm_error_fd(err, fd, u);
 817 }
 818
 819 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 820         int r = 0;
 821
 822         assert(saved_stdin);
 823         assert(saved_stdout);
 824
 825         release_terminal();
 826
 827         if (*saved_stdin >= 0)
 828                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 829                         r = -errno;
 830
 831         if (*saved_stdout >= 0)
 832                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 833                         r = -errno;
 834
 835         *saved_stdin = safe_close(*saved_stdin);
 836         *saved_stdout = safe_close(*saved_stdout);
 837
 838         return r;
 839 }
 840
 841 enum {
 842         CONFIRM_PRETEND_FAILURE = -1,
 843         CONFIRM_PRETEND_SUCCESS =  0,
 844         CONFIRM_EXECUTE = 1,
 845 };
 846
 847 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 848         int saved_stdout = -1, saved_stdin = -1, r;
 849         _cleanup_free_ char *e = NULL;
 850         char c;
 851
 852         /* For any internal errors, assume a positive response. */
 853         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 854         if (r < 0) {
 855                 write_confirm_error(r, vc, u);
 856                 return CONFIRM_EXECUTE;
 857         }
 858
 859         /* confirm_spawn might have been disabled while we were sleeping. */
 860         if (manager_is_confirm_spawn_disabled(u->manager)) {
 861                 r = 1;
 862                 goto restore_stdio;
 863         }
 864
 865         e = ellipsize(cmdline, 60, 100);
 866         if (!e) {
 867                 log_oom();
 868                 r = CONFIRM_EXECUTE;
 869                 goto restore_stdio;
 870         }
 871
 872         for (;;) {
 873                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 874                 if (r < 0) {
 875                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 876                         r = CONFIRM_EXECUTE;
 877                         goto restore_stdio;
 878                 }
 879
 880                 switch (c) {
 881                 case 'c':
 882                         printf("Resuming normal execution.\n");
 883                         manager_disable_confirm_spawn();
 884                         r = 1;
 885                         break;
 886                 case 'D':
 887                         unit_dump(u, stdout, "  ");
 888                         continue; /* ask again */
 889                 case 'f':
 890                         printf("Failing execution.\n");
 891                         r = CONFIRM_PRETEND_FAILURE;
 892                         break;
 893                 case 'h':
 894                         printf("  c - continue, proceed without asking anymore\n"
 895                                "  D - dump, show the state of the unit\n"
 896                                "  f - fail, don't execute the command and pretend it failed\n"
 897                                "  h - help\n"
 898                                "  i - info, show a short summary of the unit\n"
 899                                "  j - jobs, show jobs that are in progress\n"
 900                                "  s - skip, don't execute the command and pretend it succeeded\n"
 901                                "  y - yes, execute the command\n");
 902                         continue; /* ask again */
 903                 case 'i':
 904                         printf("  Description: %s\n"
 905                                "  Unit:        %s\n"
 906                                "  Command:     %s\n",
 907                                u->id, u->description, cmdline);
 908                         continue; /* ask again */
 909                 case 'j':
 910                         manager_dump_jobs(u->manager, stdout, "  ");
 911                         continue; /* ask again */
 912                 case 'n':
 913                         /* 'n' was removed in favor of 'f'. */
 914                         printf("Didn't understand 'n', did you mean 'f'?\n");
 915                         continue; /* ask again */
 916                 case 's':
 917                         printf("Skipping execution.\n");
 918                         r = CONFIRM_PRETEND_SUCCESS;
 919                         break;
 920                 case 'y':
 921                         r = CONFIRM_EXECUTE;
 922                         break;
 923                 default:
 924                         assert_not_reached("Unhandled choice");
 925                 }
 926                 break;
 927         }
 928
 929 restore_stdio:
 930         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 931         return r;
 932 }
 933
 934 static int get_fixed_user(const ExecContext *c, const char **user,
 935                           uid_t *uid, gid_t *gid,
 936                           const char **home, const char **shell) {
 937         int r;
 938         const char *name;
 939
 940         assert(c);
 941
 942         if (!c->user)
 943                 return 0;
 944
 945         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 946          * (i.e. are "/" or "/bin/nologin"). */
 947
 948         name = c->user;
 949         r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
 950         if (r < 0)
 951                 return r;
 952
 953         *user = name;
 954         return 0;
 955 }
 956
 957 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 958         int r;
 959         const char *name;
 960
 961         assert(c);
 962
 963         if (!c->group)
 964                 return 0;
 965
 966         name = c->group;
 967         r = get_group_creds(&name, gid, 0);
 968         if (r < 0)
 969                 return r;
 970
 971         *group = name;
 972         return 0;
 973 }
 974
 975 static int get_supplementary_groups(const ExecContext *c, const char *user,
 976                                     const char *group, gid_t gid,
 977                                     gid_t **supplementary_gids, int *ngids) {
 978         char **i;
 979         int r, k = 0;
 980         int ngroups_max;
 981         bool keep_groups = false;
 982         gid_t *groups = NULL;
 983         _cleanup_free_ gid_t *l_gids = NULL;
 984
 985         assert(c);
 986
 987         /*
 988          * If user is given, then lookup GID and supplementary groups list.
 989          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 990          * here and as early as possible so we keep the list of supplementary
 991          * groups of the caller.
 992          */
 993         if (user && gid_is_valid(gid) && gid != 0) {
 994                 /* First step, initialize groups from /etc/groups */
 995                 if (initgroups(user, gid) < 0)
 996                         return -errno;
 997
 998                 keep_groups = true;
 999         }
1000
1001         if (strv_isempty(c->supplementary_groups))
1002                 return 0;
1003
1004         /*
1005          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006          * be positive, otherwise fail.
1007          */
1008         errno = 0;
1009         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1010         if (ngroups_max <= 0)
1011                 return errno_or_else(EOPNOTSUPP);
1012
1013         l_gids = new(gid_t, ngroups_max);
1014         if (!l_gids)
1015                 return -ENOMEM;
1016
1017         if (keep_groups) {
1018                 /*
1019                  * Lookup the list of groups that the user belongs to, we
1020                  * avoid NSS lookups here too for gid=0.
1021                  */
1022                 k = ngroups_max;
1023                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024                         return -EINVAL;
1025         } else
1026                 k = 0;
1027
1028         STRV_FOREACH(i, c->supplementary_groups) {
1029                 const char *g;
1030
1031                 if (k >= ngroups_max)
1032                         return -E2BIG;
1033
1034                 g = *i;
1035                 r = get_group_creds(&g, l_gids+k, 0);
1036                 if (r < 0)
1037                         return r;
1038
1039                 k++;
1040         }
1041
1042         /*
1043          * Sets ngids to zero to drop all supplementary groups, happens
1044          * when we are under root and SupplementaryGroups= is empty.
1045          */
1046         if (k == 0) {
1047                 *ngids = 0;
1048                 return 0;
1049         }
1050
1051         /* Otherwise get the final list of supplementary groups */
1052         groups = memdup(l_gids, sizeof(gid_t) * k);
1053         if (!groups)
1054                 return -ENOMEM;
1055
1056         *supplementary_gids = groups;
1057         *ngids = k;
1058
1059         groups = NULL;
1060
1061         return 0;
1062 }
1063
/* Applies the group identity to the calling process.
 *
 * If ngids is positive, the supplementary group list is installed through maybe_setgroups(). After
 * that, if gid is valid, the real, effective and saved GID are all set to it with setresgid().
 *
 * Returns 0 on success, the negative error from maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1082
1083 static int enforce_user(const ExecContext *context, uid_t uid) {
1084         assert(context);
1085
1086         if (!uid_is_valid(uid))
1087                 return 0;
1088
1089         /* Sets (but doesn't look up) the uid and make sure we keep the
1090          * capabilities while doing so. */
1091
1092         if (context->capability_ambient_set != 0) {
1093
1094                 /* First step: If we need to keep capabilities but
1095                  * drop privileges we need to make sure we keep our
1096                  * caps, while we drop privileges. */
1097                 if (uid != 0) {
1098                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1099
1100                         if (prctl(PR_GET_SECUREBITS) != sb)
1101                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1102                                         return -errno;
1103                 }
1104         }
1105
1106         /* Second step: actually set the uids */
1107         if (setresuid(uid, uid, uid) < 0)
1108                 return -errno;
1109
1110         /* At this point we should have all necessary capabilities but
1111            are otherwise a normal user. However, the caps might got
1112            corrupted due to the setresuid() so we need clean them up
1113            later. This is done outside of this call. */
1114
1115         return 0;
1116 }
1117
1118 #if HAVE_PAM
1119
1120 static int null_conv(
1121                 int num_msg,
1122                 const struct pam_message **msg,
1123                 struct pam_response **resp,
1124                 void *appdata_ptr) {
1125
1126         /* We don't support conversations */
1127
1128         return PAM_CONV_ERR;
1129 }
1130
1131 #endif
1132
/* Opens a PAM session for the process and forks off a helper ("(sd-pam)") that closes it again later.
 *
 * name     PAM service name, passed to pam_start().
 * user     User name, passed to pam_start().
 * uid/gid  Identity the helper child switches to (via setresuid()/setresgid()) before it starts waiting.
 * tty      TTY to report to PAM as PAM_TTY. May be NULL, in which case the TTY name of stdin is used if
 *          it can be determined.
 * env      In: environment entries that are exported into the PAM handle with pam_putenv().
 *          Out: replaced by the list returned from pam_getenvlist() on success.
 * fds      File descriptors (n_fds of them) that are closed in the helper child.
 *
 * On success this returns whatever strv_free_and_replace() returns. On failure a negative errno-style
 * error is returned; PAM-level failures are reported as -EPERM since PAM error codes do not map to
 * errno values. When built without PAM support this is a no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we were given into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Unlike most calls, sigwait() reports failure by returning a positive
                                 * error number; it never returns -1 and does not use errno. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment produced by PAM back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1356
/* Renames the current process to "(<name>)", where <name> is derived from the last component of path.
 *
 * At most the final 8 characters of the base name are used, so that the result, including the
 * parentheses, fits into 10 characters. If the base name is empty, the process is renamed to "(...)"
 * instead. */
static void rename_process_from_path(const char *path) {
        char process_name[11];
        const char *base;
        size_t n;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look
         * pretty in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > 8) {
                /* The end of the process name is usually more interesting, since the first bit
                 * might just be "systemd-" */
                base += n - 8;
                n = 8;
        }

        /* Assemble "(" + name + ")" + NUL: with n <= 8 that is at most 11 bytes */
        process_name[0] = '(';
        memcpy(process_name + 1, base, n);
        process_name[n + 1] = ')';
        process_name[n + 2] = 0;

        rename_process(process_name);
}
1387
1388 static bool context_has_address_families(const ExecContext *c) {
1389         assert(c);
1390
1391         return c->address_families_whitelist ||
1392                 !set_isempty(c->address_families);
1393 }
1394
1395 static bool context_has_syscall_filters(const ExecContext *c) {
1396         assert(c);
1397
1398         return c->syscall_whitelist ||
1399                 !hashmap_isempty(c->syscall_filter);
1400 }
1401
1402 static bool context_has_no_new_privileges(const ExecContext *c) {
1403         assert(c);
1404
1405         if (c->no_new_privileges)
1406                 return true;
1407
1408         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1409                 return false;
1410
1411         /* We need NNP if we have any form of seccomp and are unprivileged */
1412         return context_has_address_families(c) ||
1413                 c->memory_deny_write_execute ||
1414                 c->restrict_realtime ||
1415                 c->restrict_suid_sgid ||
1416                 exec_context_restrict_namespaces_set(c) ||
1417                 c->protect_clock ||
1418                 c->protect_kernel_tunables ||
1419                 c->protect_kernel_modules ||
1420                 c->protect_kernel_logs ||
1421                 c->private_devices ||
1422                 context_has_syscall_filters(c) ||
1423                 !set_isempty(c->syscall_archs) ||
1424                 c->lock_personality ||
1425                 c->protect_hostname;
1426 }
1427
1428 #if HAVE_SECCOMP
1429
1430 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1431
1432         if (is_seccomp_available())
1433                 return false;
1434
1435         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1436         return true;
1437 }
1438
1439 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1440         uint32_t negative_action, default_action, action;
1441         int r;
1442
1443         assert(u);
1444         assert(c);
1445
1446         if (!context_has_syscall_filters(c))
1447                 return 0;
1448
1449         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1450                 return 0;
1451
1452         negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1453
1454         if (c->syscall_whitelist) {
1455                 default_action = negative_action;
1456                 action = SCMP_ACT_ALLOW;
1457         } else {
1458                 default_action = SCMP_ACT_ALLOW;
1459                 action = negative_action;
1460         }
1461
1462         if (needs_ambient_hack) {
1463                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1464                 if (r < 0)
1465                         return r;
1466         }
1467
1468         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1469 }
1470
1471 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1472         assert(u);
1473         assert(c);
1474
1475         if (set_isempty(c->syscall_archs))
1476                 return 0;
1477
1478         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1479                 return 0;
1480
1481         return seccomp_restrict_archs(c->syscall_archs);
1482 }
1483
1484 static int apply_address_families(const Unit* u, const ExecContext *c) {
1485         assert(u);
1486         assert(c);
1487
1488         if (!context_has_address_families(c))
1489                 return 0;
1490
1491         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1492                 return 0;
1493
1494         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1495 }
1496
1497 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1498         assert(u);
1499         assert(c);
1500
1501         if (!c->memory_deny_write_execute)
1502                 return 0;
1503
1504         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1505                 return 0;
1506
1507         return seccomp_memory_deny_write_execute();
1508 }
1509
1510 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1511         assert(u);
1512         assert(c);
1513
1514         if (!c->restrict_realtime)
1515                 return 0;
1516
1517         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1518                 return 0;
1519
1520         return seccomp_restrict_realtime();
1521 }
1522
1523 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1524         assert(u);
1525         assert(c);
1526
1527         if (!c->restrict_suid_sgid)
1528                 return 0;
1529
1530         if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1531                 return 0;
1532
1533         return seccomp_restrict_suid_sgid();
1534 }
1535
1536 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1537         assert(u);
1538         assert(c);
1539
1540         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1541          * let's protect even those systems where this is left on in the kernel. */
1542
1543         if (!c->protect_kernel_tunables)
1544                 return 0;
1545
1546         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1547                 return 0;
1548
1549         return seccomp_protect_sysctl();
1550 }
1551
1552 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1553         assert(u);
1554         assert(c);
1555
1556         /* Turn off module syscalls on ProtectKernelModules=yes */
1557
1558         if (!c->protect_kernel_modules)
1559                 return 0;
1560
1561         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1562                 return 0;
1563
1564         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1565 }
1566
1567 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1568         assert(u);
1569         assert(c);
1570
1571         if (!c->protect_kernel_logs)
1572                 return 0;
1573
1574         if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1575                 return 0;
1576
1577         return seccomp_protect_syslog();
1578 }
1579
1580 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1581         assert(u);
1582         assert(c);
1583
1584         if (!c->protect_clock)
1585                 return 0;
1586
1587         if (skip_seccomp_unavailable(u, "ProtectClock="))
1588                 return 0;
1589
1590         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1591 }
1592
1593 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1594         assert(u);
1595         assert(c);
1596
1597         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1598
1599         if (!c->private_devices)
1600                 return 0;
1601
1602         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1603                 return 0;
1604
1605         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1606 }
1607
1608 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1609         assert(u);
1610         assert(c);
1611
1612         if (!exec_context_restrict_namespaces_set(c))
1613                 return 0;
1614
1615         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1616                 return 0;
1617
1618         return seccomp_restrict_namespaces(c->restrict_namespaces);
1619 }
1620
1621 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1622         unsigned long personality;
1623         int r;
1624
1625         assert(u);
1626         assert(c);
1627
1628         if (!c->lock_personality)
1629                 return 0;
1630
1631         if (skip_seccomp_unavailable(u, "LockPersonality="))
1632                 return 0;
1633
1634         personality = c->personality;
1635
1636         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1637         if (personality == PERSONALITY_INVALID) {
1638
1639                 r = opinionated_personality(&personality);
1640                 if (r < 0)
1641                         return r;
1642         }
1643
1644         return seccomp_lock_personality(personality);
1645 }
1646
1647 #endif
1648
1649 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1650         int r;
1651
1652         assert(u);
1653         assert(c);
1654
1655         if (!c->protect_hostname)
1656                 return 0;
1657
1658         if (ns_type_supported(NAMESPACE_UTS)) {
1659                 if (unshare(CLONE_NEWUTS) < 0) {
1660                         if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1661                                 *ret_exit_status = EXIT_NAMESPACE;
1662                                 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1663                         }
1664
1665                         log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1666                 }
1667         } else
1668                 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1669
1670 #if HAVE_SECCOMP
1671         if (skip_seccomp_unavailable(u, "ProtectHostname="))
1672                 return 0;
1673
1674         r = seccomp_protect_hostname();
1675         if (r < 0) {
1676                 *ret_exit_status = EXIT_SECCOMP;
1677                 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1678         }
1679 #endif
1680
1681         return 0;
1682 }
1683
1684 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1685         assert(idle_pipe);
1686
1687         idle_pipe[1] = safe_close(idle_pipe[1]);
1688         idle_pipe[2] = safe_close(idle_pipe[2]);
1689
1690         if (idle_pipe[0] >= 0) {
1691                 int r;
1692
1693                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1694
1695                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1696                         ssize_t n;
1697
1698                         /* Signal systemd that we are bored and want to continue. */
1699                         n = write(idle_pipe[3], "x", 1);
1700                         if (n > 0)
1701                                 /* Wait for systemd to react to the signal above. */
1702                                 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1703                 }
1704
1705                 idle_pipe[0] = safe_close(idle_pipe[0]);
1706
1707         }
1708
1709         idle_pipe[3] = safe_close(idle_pipe[3]);
1710 }
1711
1712 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1713
1714 static int build_environment(
1715                 const Unit *u,
1716                 const ExecContext *c,
1717                 const ExecParameters *p,
1718                 size_t n_fds,
1719                 const char *home,
1720                 const char *username,
1721                 const char *shell,
1722                 dev_t journal_stream_dev,
1723                 ino_t journal_stream_ino,
1724                 char ***ret) {
1725
1726         _cleanup_strv_free_ char **our_env = NULL;
1727         ExecDirectoryType t;
1728         size_t n_env = 0;
1729         char *x;
1730
1731         assert(u);
1732         assert(c);
1733         assert(p);
1734         assert(ret);
1735
1736         our_env = new0(char*, 15 + _EXEC_DIRECTORY_TYPE_MAX);
1737         if (!our_env)
1738                 return -ENOMEM;
1739
1740         if (n_fds > 0) {
1741                 _cleanup_free_ char *joined = NULL;
1742
1743                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1744                         return -ENOMEM;
1745                 our_env[n_env++] = x;
1746
1747                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1748                         return -ENOMEM;
1749                 our_env[n_env++] = x;
1750
1751                 joined = strv_join(p->fd_names, ":");
1752                 if (!joined)
1753                         return -ENOMEM;
1754
1755                 x = strjoin("LISTEN_FDNAMES=", joined);
1756                 if (!x)
1757                         return -ENOMEM;
1758                 our_env[n_env++] = x;
1759         }
1760
1761         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1762                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1763                         return -ENOMEM;
1764                 our_env[n_env++] = x;
1765
1766                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1767                         return -ENOMEM;
1768                 our_env[n_env++] = x;
1769         }
1770
1771         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1772          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1773          * check the database directly. */
1774         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1775                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1776                 if (!x)
1777                         return -ENOMEM;
1778                 our_env[n_env++] = x;
1779         }
1780
1781         if (home) {
1782                 x = strjoin("HOME=", home);
1783                 if (!x)
1784                         return -ENOMEM;
1785
1786                 path_simplify(x + 5, true);
1787                 our_env[n_env++] = x;
1788         }
1789
1790         if (username) {
1791                 x = strjoin("LOGNAME=", username);
1792                 if (!x)
1793                         return -ENOMEM;
1794                 our_env[n_env++] = x;
1795
1796                 x = strjoin("USER=", username);
1797                 if (!x)
1798                         return -ENOMEM;
1799                 our_env[n_env++] = x;
1800         }
1801
1802         if (shell) {
1803                 x = strjoin("SHELL=", shell);
1804                 if (!x)
1805                         return -ENOMEM;
1806
1807                 path_simplify(x + 6, true);
1808                 our_env[n_env++] = x;
1809         }
1810
1811         if (!sd_id128_is_null(u->invocation_id)) {
1812                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1813                         return -ENOMEM;
1814
1815                 our_env[n_env++] = x;
1816         }
1817
1818         if (exec_context_needs_term(c)) {
1819                 const char *tty_path, *term = NULL;
1820
1821                 tty_path = exec_context_tty_path(c);
1822
1823                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1824                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1825                  * passes to PID 1 ends up all the way in the console login shown. */
1826
1827                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1828                         term = getenv("TERM");
1829                 if (!term)
1830                         term = default_term_for_tty(tty_path);
1831
1832                 x = strjoin("TERM=", term);
1833                 if (!x)
1834                         return -ENOMEM;
1835                 our_env[n_env++] = x;
1836         }
1837
1838         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1839                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1840                         return -ENOMEM;
1841
1842                 our_env[n_env++] = x;
1843         }
1844
1845         if (c->log_namespace) {
1846                 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1847                 if (!x)
1848                         return -ENOMEM;
1849
1850                 our_env[n_env++] = x;
1851         }
1852
1853         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1854                 _cleanup_free_ char *pre = NULL, *joined = NULL;
1855                 const char *n;
1856
1857                 if (!p->prefix[t])
1858                         continue;
1859
1860                 if (strv_isempty(c->directories[t].paths))
1861                         continue;
1862
1863                 n = exec_directory_env_name_to_string(t);
1864                 if (!n)
1865                         continue;
1866
1867                 pre = strjoin(p->prefix[t], "/");
1868                 if (!pre)
1869                         return -ENOMEM;
1870
1871                 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1872                 if (!joined)
1873                         return -ENOMEM;
1874
1875                 x = strjoin(n, "=", joined);
1876                 if (!x)
1877                         return -ENOMEM;
1878
1879                 our_env[n_env++] = x;
1880         }
1881
1882         our_env[n_env++] = NULL;
1883         assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
1884
1885         *ret = TAKE_PTR(our_env);
1886
1887         return 0;
1888 }
1889
1890 static int build_pass_environment(const ExecContext *c, char ***ret) {
1891         _cleanup_strv_free_ char **pass_env = NULL;
1892         size_t n_env = 0, n_bufsize = 0;
1893         char **i;
1894
1895         STRV_FOREACH(i, c->pass_environment) {
1896                 _cleanup_free_ char *x = NULL;
1897                 char *v;
1898
1899                 v = getenv(*i);
1900                 if (!v)
1901                         continue;
1902                 x = strjoin(*i, "=", v);
1903                 if (!x)
1904                         return -ENOMEM;
1905
1906                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1907                         return -ENOMEM;
1908
1909                 pass_env[n_env++] = TAKE_PTR(x);
1910                 pass_env[n_env] = NULL;
1911         }
1912
1913         *ret = TAKE_PTR(pass_env);
1914
1915         return 0;
1916 }
1917
1918 static bool exec_needs_mount_namespace(
1919                 const ExecContext *context,
1920                 const ExecParameters *params,
1921                 const ExecRuntime *runtime) {
1922
1923         assert(context);
1924         assert(params);
1925
1926         if (context->root_image)
1927                 return true;
1928
1929         if (!strv_isempty(context->read_write_paths) ||
1930             !strv_isempty(context->read_only_paths) ||
1931             !strv_isempty(context->inaccessible_paths))
1932                 return true;
1933
1934         if (context->n_bind_mounts > 0)
1935                 return true;
1936
1937         if (context->n_temporary_filesystems > 0)
1938                 return true;
1939
1940         if (!IN_SET(context->mount_flags, 0, MS_SHARED))
1941                 return true;
1942
1943         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1944                 return true;
1945
1946         if (context->private_devices ||
1947             context->private_mounts ||
1948             context->protect_system != PROTECT_SYSTEM_NO ||
1949             context->protect_home != PROTECT_HOME_NO ||
1950             context->protect_kernel_tunables ||
1951             context->protect_kernel_modules ||
1952             context->protect_kernel_logs ||
1953             context->protect_control_groups)
1954                 return true;
1955
1956         if (context->root_directory) {
1957                 ExecDirectoryType t;
1958
1959                 if (context->mount_apivfs)
1960                         return true;
1961
1962                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1963                         if (!params->prefix[t])
1964                                 continue;
1965
1966                         if (!strv_isempty(context->directories[t].paths))
1967                                 return true;
1968                 }
1969         }
1970
1971         if (context->dynamic_user &&
1972             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1973              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1974              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1975                 return true;
1976
1977         if (context->log_namespace)
1978                 return true;
1979
1980         return false;
1981 }
1982
1983 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
1984         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1985         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1986         _cleanup_close_ int unshare_ready_fd = -1;
1987         _cleanup_(sigkill_waitp) pid_t pid = 0;
1988         uint64_t c = 1;
1989         ssize_t n;
1990         int r;
1991
1992         /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
1993          * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
1994          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1995          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1996          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1997          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1998          * continues execution normally.
1999          * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2000          * does not need CAP_SETUID to write the single line mapping to itself. */
2001
2002         /* Can only set up multiple mappings with CAP_SETUID. */
2003         if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2004                 r = asprintf(&uid_map,
2005                              UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
2006                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
2007                              ouid, ouid, uid, uid);
2008         else
2009                 r = asprintf(&uid_map,
2010                              UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
2011                              ouid, ouid);
2012
2013         if (r < 0)
2014                 return -ENOMEM;
2015
2016         /* Can only set up multiple mappings with CAP_SETGID. */
2017         if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2018                 r = asprintf(&gid_map,
2019                              GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
2020                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
2021                              ogid, ogid, gid, gid);
2022         else
2023                 r = asprintf(&gid_map,
2024                              GID_FMT " " GID_FMT " 1\n",    /* Map $OGID -> $OGID */
2025                              ogid, ogid);
2026
2027         if (r < 0)
2028                 return -ENOMEM;
2029
2030         /* Create a communication channel so that the parent can tell the child when it finished creating the user
2031          * namespace. */
2032         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2033         if (unshare_ready_fd < 0)
2034                 return -errno;
2035
2036         /* Create a communication channel so that the child can tell the parent a proper error code in case it
2037          * failed. */
2038         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2039                 return -errno;
2040
2041         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2042         if (r < 0)
2043                 return r;
2044         if (r == 0) {
2045                 _cleanup_close_ int fd = -1;
2046                 const char *a;
2047                 pid_t ppid;
2048
2049                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2050                  * here, after the parent opened its own user namespace. */
2051
2052                 ppid = getppid();
2053                 errno_pipe[0] = safe_close(errno_pipe[0]);
2054
2055                 /* Wait until the parent unshared the user namespace */
2056                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2057                         r = -errno;
2058                         goto child_fail;
2059                 }
2060
2061                 /* Disable the setgroups() system call in the child user namespace, for good. */
2062                 a = procfs_file_alloca(ppid, "setgroups");
2063                 fd = open(a, O_WRONLY|O_CLOEXEC);
2064                 if (fd < 0) {
2065                         if (errno != ENOENT) {
2066                                 r = -errno;
2067                                 goto child_fail;
2068                         }
2069
2070                         /* If the file is missing the kernel is too old, let's continue anyway. */
2071                 } else {
2072                         if (write(fd, "deny\n", 5) < 0) {
2073                                 r = -errno;
2074                                 goto child_fail;
2075                         }
2076
2077                         fd = safe_close(fd);
2078                 }
2079
2080                 /* First write the GID map */
2081                 a = procfs_file_alloca(ppid, "gid_map");
2082                 fd = open(a, O_WRONLY|O_CLOEXEC);
2083                 if (fd < 0) {
2084                         r = -errno;
2085                         goto child_fail;
2086                 }
2087                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2088                         r = -errno;
2089                         goto child_fail;
2090                 }
2091                 fd = safe_close(fd);
2092
2093                 /* The write the UID map */
2094                 a = procfs_file_alloca(ppid, "uid_map");
2095                 fd = open(a, O_WRONLY|O_CLOEXEC);
2096                 if (fd < 0) {
2097                         r = -errno;
2098                         goto child_fail;
2099                 }
2100                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2101                         r = -errno;
2102                         goto child_fail;
2103                 }
2104
2105                 _exit(EXIT_SUCCESS);
2106
2107         child_fail:
2108                 (void) write(errno_pipe[1], &r, sizeof(r));
2109                 _exit(EXIT_FAILURE);
2110         }
2111
2112         errno_pipe[1] = safe_close(errno_pipe[1]);
2113
2114         if (unshare(CLONE_NEWUSER) < 0)
2115                 return -errno;
2116
2117         /* Let the child know that the namespace is ready now */
2118         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2119                 return -errno;
2120
2121         /* Try to read an error code from the child */
2122         n = read(errno_pipe[0], &r, sizeof(r));
2123         if (n < 0)
2124                 return -errno;
2125         if (n == sizeof(r)) { /* an error code was sent to us */
2126                 if (r < 0)
2127                         return r;
2128                 return -EIO;
2129         }
2130         if (n != 0) /* on success we should have read 0 bytes */
2131                 return -EIO;
2132
2133         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2134         pid = 0;
2135         if (r < 0)
2136                 return r;
2137         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2138                 return -EIO;
2139
2140         return 0;
2141 }
2142
2143 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2144         if (!context->dynamic_user)
2145                 return false;
2146
2147         if (type == EXEC_DIRECTORY_CONFIGURATION)
2148                 return false;
2149
2150         if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2151                 return false;
2152
2153         return true;
2154 }
2155
2156 static int setup_exec_directory(
2157                 const ExecContext *context,
2158                 const ExecParameters *params,
2159                 uid_t uid,
2160                 gid_t gid,
2161                 ExecDirectoryType type,
2162                 int *exit_status) {
2163
2164         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2165                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2166                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2167                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2168                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2169                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2170         };
2171         char **rt;
2172         int r;
2173
2174         assert(context);
2175         assert(params);
2176         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2177         assert(exit_status);
2178
2179         if (!params->prefix[type])
2180                 return 0;
2181
2182         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2183                 if (!uid_is_valid(uid))
2184                         uid = 0;
2185                 if (!gid_is_valid(gid))
2186                         gid = 0;
2187         }
2188
2189         STRV_FOREACH(rt, context->directories[type].paths) {
2190                 _cleanup_free_ char *p = NULL, *pp = NULL;
2191
2192                 p = path_join(params->prefix[type], *rt);
2193                 if (!p) {
2194                         r = -ENOMEM;
2195                         goto fail;
2196                 }
2197
2198                 r = mkdir_parents_label(p, 0755);
2199                 if (r < 0)
2200                         goto fail;
2201
2202                 if (exec_directory_is_private(context, type)) {
2203                         _cleanup_free_ char *private_root = NULL;
2204
2205                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2206                          * case we want to avoid leaving a directory around fully accessible that is owned by
2207                          * a dynamic user whose UID is later on reused. To lock this down we use the same
2208                          * trick used by container managers to prohibit host users to get access to files of
2209                          * the same UID in containers: we place everything inside a directory that has an
2210                          * access mode of 0700 and is owned root:root, so that it acts as security boundary
2211                          * for unprivileged host code. We then use fs namespacing to make this directory
2212                          * permeable for the service itself.
2213                          *
2214                          * Specifically: for a service which wants a special directory "foo/" we first create
2215                          * a directory "private/" with access mode 0700 owned by root:root. Then we place
2216                          * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2217                          * "private/foo". This way, privileged host users can access "foo/" as usual, but
2218                          * unprivileged host users can't look into it. Inside of the namespace of the unit
2219                          * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2220                          * "private/foo/" is mounted under the same name, thus disabling the access boundary
2221                          * for the service and making sure it only gets access to the dirs it needs but no
2222                          * others. Tricky? Yes, absolutely, but it works!
2223                          *
2224                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2225                          * to be owned by the service itself.
2226                          *
2227                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2228                          * for sharing files or sockets with other services. */
2229
2230                         private_root = path_join(params->prefix[type], "private");
2231                         if (!private_root) {
2232                                 r = -ENOMEM;
2233                                 goto fail;
2234                         }
2235
2236                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2237                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2238                         if (r < 0)
2239                                 goto fail;
2240
2241                         pp = path_join(private_root, *rt);
2242                         if (!pp) {
2243                                 r = -ENOMEM;
2244                                 goto fail;
2245                         }
2246
2247                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2248                         r = mkdir_parents_label(pp, 0755);
2249                         if (r < 0)
2250                                 goto fail;
2251
2252                         if (is_dir(p, false) > 0 &&
2253                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2254
2255                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2256                                  * it over. Most likely the service has been upgraded from one that didn't use
2257                                  * DynamicUser=1, to one that does. */
2258
2259                                 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2260                                          "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2261                                          exec_directory_type_to_string(type), p, pp);
2262
2263                                 if (rename(p, pp) < 0) {
2264                                         r = -errno;
2265                                         goto fail;
2266                                 }
2267                         } else {
2268                                 /* Otherwise, create the actual directory for the service */
2269
2270                                 r = mkdir_label(pp, context->directories[type].mode);
2271                                 if (r < 0 && r != -EEXIST)
2272                                         goto fail;
2273                         }
2274
2275                         /* And link it up from the original place */
2276                         r = symlink_idempotent(pp, p, true);
2277                         if (r < 0)
2278                                 goto fail;
2279
2280                 } else {
2281                         _cleanup_free_ char *target = NULL;
2282
2283                         if (type != EXEC_DIRECTORY_CONFIGURATION &&
2284                             readlink_and_make_absolute(p, &target) >= 0) {
2285                                 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2286
2287                                 /* This already exists and is a symlink? Interesting. Maybe it's one created
2288                                  * by DynamicUser=1 (see above)?
2289                                  *
2290                                  * We do this for all directory types except for ConfigurationDirectory=,
2291                                  * since they all support the private/ symlink logic at least in some
2292                                  * configurations, see above. */
2293
2294                                 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2295                                 if (r < 0)
2296                                         goto fail;
2297
2298                                 q = path_join(params->prefix[type], "private", *rt);
2299                                 if (!q) {
2300                                         r = -ENOMEM;
2301                                         goto fail;
2302                                 }
2303
2304                                 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2305                                 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2306                                 if (r < 0)
2307                                         goto fail;
2308
2309                                 if (path_equal(q_resolved, target_resolved)) {
2310
2311                                         /* Hmm, apparently DynamicUser= was once turned on for this service,
2312                                          * but is no longer. Let's move the directory back up. */
2313
2314                                         log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2315                                                  "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2316                                                  exec_directory_type_to_string(type), q, p);
2317
2318                                         if (unlink(p) < 0) {
2319                                                 r = -errno;
2320                                                 goto fail;
2321                                         }
2322
2323                                         if (rename(q, p) < 0) {
2324                                                 r = -errno;
2325                                                 goto fail;
2326                                         }
2327                                 }
2328                         }
2329
2330                         r = mkdir_label(p, context->directories[type].mode);
2331                         if (r < 0) {
2332                                 if (r != -EEXIST)
2333                                         goto fail;
2334
2335                                 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2336                                         struct stat st;
2337
2338                                         /* Don't change the owner/access mode of the configuration directory,
2339                                          * as in the common case it is not written to by a service, and shall
2340                                          * not be writable. */
2341
2342                                         if (stat(p, &st) < 0) {
2343                                                 r = -errno;
2344                                                 goto fail;
2345                                         }
2346
2347                                         /* Still complain if the access mode doesn't match */
2348                                         if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2349                                                 log_warning("%s \'%s\' already exists but the mode is different. "
2350                                                             "(File system: %o %sMode: %o)",
2351                                                             exec_directory_type_to_string(type), *rt,
2352                                                             st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2353
2354                                         continue;
2355                                 }
2356                         }
2357                 }
2358
2359                 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2360                  * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2361                  * current UID/GID ownership.) */
2362                 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2363                 if (r < 0)
2364                         goto fail;
2365
2366                 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2367                  * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2368                  * assignments to exist.*/
2369                 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2370                 if (r < 0)
2371                         goto fail;
2372         }
2373
2374         return 0;
2375
2376 fail:
2377         *exit_status = exit_status_table[type];
2378         return r;
2379 }
2380
2381 #if ENABLE_SMACK
2382 static int setup_smack(
2383                 const ExecContext *context,
2384                 const ExecCommand *command) {
2385
2386         int r;
2387
2388         assert(context);
2389         assert(command);
2390
2391         if (context->smack_process_label) {
2392                 r = mac_smack_apply_pid(0, context->smack_process_label);
2393                 if (r < 0)
2394                         return r;
2395         }
2396 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2397         else {
2398                 _cleanup_free_ char *exec_label = NULL;
2399
2400                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2401                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2402                         return r;
2403
2404                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2405                 if (r < 0)
2406                         return r;
2407         }
2408 #endif
2409
2410         return 0;
2411 }
2412 #endif
2413
2414 static int compile_bind_mounts(
2415                 const ExecContext *context,
2416                 const ExecParameters *params,
2417                 BindMount **ret_bind_mounts,
2418                 size_t *ret_n_bind_mounts,
2419                 char ***ret_empty_directories) {
2420
2421         _cleanup_strv_free_ char **empty_directories = NULL;
2422         BindMount *bind_mounts;
2423         size_t n, h = 0, i;
2424         ExecDirectoryType t;
2425         int r;
2426
2427         assert(context);
2428         assert(params);
2429         assert(ret_bind_mounts);
2430         assert(ret_n_bind_mounts);
2431         assert(ret_empty_directories);
2432
2433         n = context->n_bind_mounts;
2434         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2435                 if (!params->prefix[t])
2436                         continue;
2437
2438                 n += strv_length(context->directories[t].paths);
2439         }
2440
2441         if (n <= 0) {
2442                 *ret_bind_mounts = NULL;
2443                 *ret_n_bind_mounts = 0;
2444                 *ret_empty_directories = NULL;
2445                 return 0;
2446         }
2447
2448         bind_mounts = new(BindMount, n);
2449         if (!bind_mounts)
2450                 return -ENOMEM;
2451
2452         for (i = 0; i < context->n_bind_mounts; i++) {
2453                 BindMount *item = context->bind_mounts + i;
2454                 char *s, *d;
2455
2456                 s = strdup(item->source);
2457                 if (!s) {
2458                         r = -ENOMEM;
2459                         goto finish;
2460                 }
2461
2462                 d = strdup(item->destination);
2463                 if (!d) {
2464                         free(s);
2465                         r = -ENOMEM;
2466                         goto finish;
2467                 }
2468
2469                 bind_mounts[h++] = (BindMount) {
2470                         .source = s,
2471                         .destination = d,
2472                         .read_only = item->read_only,
2473                         .recursive = item->recursive,
2474                         .ignore_enoent = item->ignore_enoent,
2475                 };
2476         }
2477
2478         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2479                 char **suffix;
2480
2481                 if (!params->prefix[t])
2482                         continue;
2483
2484                 if (strv_isempty(context->directories[t].paths))
2485                         continue;
2486
2487                 if (exec_directory_is_private(context, t) &&
2488                     !(context->root_directory || context->root_image)) {
2489                         char *private_root;
2490
2491                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2492                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2493                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2494
2495                         private_root = path_join(params->prefix[t], "private");
2496                         if (!private_root) {
2497                                 r = -ENOMEM;
2498                                 goto finish;
2499                         }
2500
2501                         r = strv_consume(&empty_directories, private_root);
2502                         if (r < 0)
2503                                 goto finish;
2504                 }
2505
2506                 STRV_FOREACH(suffix, context->directories[t].paths) {
2507                         char *s, *d;
2508
2509                         if (exec_directory_is_private(context, t))
2510                                 s = path_join(params->prefix[t], "private", *suffix);
2511                         else
2512                                 s = path_join(params->prefix[t], *suffix);
2513                         if (!s) {
2514                                 r = -ENOMEM;
2515                                 goto finish;
2516                         }
2517
2518                         if (exec_directory_is_private(context, t) &&
2519                             (context->root_directory || context->root_image))
2520                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2521                                  * directory is not created on the root directory. So, let's bind-mount the directory
2522                                  * on the 'non-private' place. */
2523                                 d = path_join(params->prefix[t], *suffix);
2524                         else
2525                                 d = strdup(s);
2526                         if (!d) {
2527                                 free(s);
2528                                 r = -ENOMEM;
2529                                 goto finish;
2530                         }
2531
2532                         bind_mounts[h++] = (BindMount) {
2533                                 .source = s,
2534                                 .destination = d,
2535                                 .read_only = false,
2536                                 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2537                                 .recursive = true,
2538                                 .ignore_enoent = false,
2539                         };
2540                 }
2541         }
2542
2543         assert(h == n);
2544
2545         *ret_bind_mounts = bind_mounts;
2546         *ret_n_bind_mounts = n;
2547         *ret_empty_directories = TAKE_PTR(empty_directories);
2548
2549         return (int) n;
2550
2551 finish:
2552         bind_mount_free_many(bind_mounts, h);
2553         return r;
2554 }
2555
2556 static bool insist_on_sandboxing(
2557                 const ExecContext *context,
2558                 const char *root_dir,
2559                 const char *root_image,
2560                 const BindMount *bind_mounts,
2561                 size_t n_bind_mounts) {
2562
2563         size_t i;
2564
2565         assert(context);
2566         assert(n_bind_mounts == 0 || bind_mounts);
2567
2568         /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2569          * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2570          * rearrange stuff in a way we cannot ignore gracefully. */
2571
2572         if (context->n_temporary_filesystems > 0)
2573                 return true;
2574
2575         if (root_dir || root_image)
2576                 return true;
2577
2578         if (context->dynamic_user)
2579                 return true;
2580
2581         /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2582          * essential. */
2583         for (i = 0; i < n_bind_mounts; i++)
2584                 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2585                         return true;
2586
2587         if (context->log_namespace)
2588                 return true;
2589
2590         return false;
2591 }
2592
2593 static int apply_mount_namespace(
2594                 const Unit *u,
2595                 const ExecCommand *command,
2596                 const ExecContext *context,
2597                 const ExecParameters *params,
2598                 const ExecRuntime *runtime,
2599                 char **error_path) {
2600
2601         _cleanup_strv_free_ char **empty_directories = NULL;
2602         char *tmp = NULL, *var = NULL;
2603         const char *root_dir = NULL, *root_image = NULL;
2604         NamespaceInfo ns_info;
2605         bool needs_sandboxing;
2606         BindMount *bind_mounts = NULL;
2607         size_t n_bind_mounts = 0;
2608         int r;
2609
2610         assert(context);
2611
2612         if (params->flags & EXEC_APPLY_CHROOT) {
2613                 root_image = context->root_image;
2614
2615                 if (!root_image)
2616                         root_dir = context->root_directory;
2617         }
2618
2619         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2620         if (r < 0)
2621                 return r;
2622
2623         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2624         if (needs_sandboxing) {
2625                 /* The runtime struct only contains the parent of the private /tmp,
2626                  * which is non-accessible to world users. Inside of it there's a /tmp
2627                  * that is sticky, and that's the one we want to use here. */
2628
2629                 if (context->private_tmp && runtime) {
2630                         if (runtime->tmp_dir)
2631                                 tmp = strjoina(runtime->tmp_dir, "/tmp");
2632                         if (runtime->var_tmp_dir)
2633                                 var = strjoina(runtime->var_tmp_dir, "/tmp");
2634                 }
2635
2636                 ns_info = (NamespaceInfo) {
2637                         .ignore_protect_paths = false,
2638                         .private_dev = context->private_devices,
2639                         .protect_control_groups = context->protect_control_groups,
2640                         .protect_kernel_tunables = context->protect_kernel_tunables,
2641                         .protect_kernel_modules = context->protect_kernel_modules,
2642                         .protect_kernel_logs = context->protect_kernel_logs,
2643                         .protect_hostname = context->protect_hostname,
2644                         .mount_apivfs = context->mount_apivfs,
2645                         .private_mounts = context->private_mounts,
2646                 };
2647         } else if (!context->dynamic_user && root_dir)
2648                 /*
2649                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2650                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2651                  * fail if we are enable to apply the sandbox inside the mount namespace.
2652                  */
2653                 ns_info = (NamespaceInfo) {
2654                         .ignore_protect_paths = true,
2655                 };
2656         else
2657                 ns_info = (NamespaceInfo) {};
2658
2659         if (context->mount_flags == MS_SHARED)
2660                 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2661
2662         r = setup_namespace(root_dir, root_image,
2663                             &ns_info, context->read_write_paths,
2664                             needs_sandboxing ? context->read_only_paths : NULL,
2665                             needs_sandboxing ? context->inaccessible_paths : NULL,
2666                             empty_directories,
2667                             bind_mounts,
2668                             n_bind_mounts,
2669                             context->temporary_filesystems,
2670                             context->n_temporary_filesystems,
2671                             tmp,
2672                             var,
2673                             context->log_namespace,
2674                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2675                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2676                             context->mount_flags,
2677                             DISSECT_IMAGE_DISCARD_ON_LOOP|DISSECT_IMAGE_RELAX_VAR_CHECK|DISSECT_IMAGE_FSCK,
2678                             error_path);
2679
2680         /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
2681          * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
2682          * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2683          * completely different execution environment. */
2684         if (r == -ENOANO) {
2685                 if (insist_on_sandboxing(
2686                                     context,
2687                                     root_dir, root_image,
2688                                     bind_mounts,
2689                                     n_bind_mounts)) {
2690                         log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2691                                        "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2692                                        n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2693
2694                         r = -EOPNOTSUPP;
2695                 } else {
2696                         log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2697                         r = 0;
2698                 }
2699         }
2700
2701         bind_mount_free_many(bind_mounts, n_bind_mounts);
2702         return r;
2703 }
2704
2705 static int apply_working_directory(
2706                 const ExecContext *context,
2707                 const ExecParameters *params,
2708                 const char *home,
2709                 int *exit_status) {
2710
2711         const char *d, *wd;
2712
2713         assert(context);
2714         assert(exit_status);
2715
2716         if (context->working_directory_home) {
2717
2718                 if (!home) {
2719                         *exit_status = EXIT_CHDIR;
2720                         return -ENXIO;
2721                 }
2722
2723                 wd = home;
2724
2725         } else if (context->working_directory)
2726                 wd = context->working_directory;
2727         else
2728                 wd = "/";
2729
2730         if (params->flags & EXEC_APPLY_CHROOT)
2731                 d = wd;
2732         else
2733                 d = prefix_roota(context->root_directory, wd);
2734
2735         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2736                 *exit_status = EXIT_CHDIR;
2737                 return -errno;
2738         }
2739
2740         return 0;
2741 }
2742
2743 static int apply_root_directory(
2744                 const ExecContext *context,
2745                 const ExecParameters *params,
2746                 const bool needs_mount_ns,
2747                 int *exit_status) {
2748
2749         assert(context);
2750         assert(exit_status);
2751
2752         if (params->flags & EXEC_APPLY_CHROOT) {
2753                 if (!needs_mount_ns && context->root_directory)
2754                         if (chroot(context->root_directory) < 0) {
2755                                 *exit_status = EXIT_CHROOT;
2756                                 return -errno;
2757                         }
2758         }
2759
2760         return 0;
2761 }
2762
/* Joins a new session keyring for the calling process and populates it.
 *
 * Does nothing and returns 0 if context->keyring_mode is EXEC_KEYRING_INHERIT. Otherwise the real UID/GID
 * are temporarily switched to uid/gid (where valid and different from the current ones), a new session
 * keyring is joined, the user keyring is linked into it if the mode is EXEC_KEYRING_SHARED, the original
 * identity is restored, and finally the unit's invocation ID (if non-null) is stored in the session keyring
 * as a "user" key named "invocation_id".
 *
 * Failing to join the keyring with ENOSYS, EACCES, EPERM or EDQUOT is logged at debug level and ignored
 * (returns 0). Other failures return whatever log_unit_error_errno() returns (presumably the negative errno
 * — confirm against its definition). The parameter p is only asserted, it is not otherwise used here. */
static int setup_keyring(
                const Unit *u,
                const ExecContext *context,
                const ExecParameters *p,
                uid_t uid, gid_t gid) {

        key_serial_t keyring;
        int r = 0;
        uid_t saved_uid;
        gid_t saved_gid;

        assert(u);
        assert(context);
        assert(p);

        /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
         * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
         * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
         * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
         * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
         * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */

        if (context->keyring_mode == EXEC_KEYRING_INHERIT)
                return 0;

        /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
         * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
         * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
         * & group is just as nasty as acquiring a reference to the user keyring. */

        saved_uid = getuid();
        saved_gid = getgid();

        /* Only the real GID is changed; passing -1 leaves the effective GID as it is. Nothing has been
         * modified yet if this fails, hence the direct return. */
        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
        }

        /* Same for the real UID. From here on every failure has to go through "out", so that the identity
         * gets reverted. */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
                        goto out;
                }
        }

        keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
        if (keyring == -1) {
                /* The first three cases are tolerated: r stays 0 and we merely skip the rest of the setup. */
                if (errno == ENOSYS)
                        log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
                else if (IN_SET(errno, EACCES, EPERM))
                        log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
                else if (errno == EDQUOT)
                        log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
                else
                        r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");

                goto out;
        }

        /* When requested link the user keyring into the session keyring. */
        if (context->keyring_mode == EXEC_KEYRING_SHARED) {

                if (keyctl(KEYCTL_LINK,
                           KEY_SPEC_USER_KEYRING,
                           KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
                        goto out;
                }
        }

        /* Restore uid/gid back */
        if (uid_is_valid(uid) && uid != saved_uid) {
                if (setreuid(saved_uid, -1) < 0) {
                        r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
                        goto out;
                }
        }

        if (gid_is_valid(gid) && gid != saved_gid) {
                if (setregid(saved_gid, -1) < 0)
                        return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
        }

        /* Populate the keyring with the invocation ID by default, as original saved_uid. */
        if (!sd_id128_is_null(u->invocation_id)) {
                key_serial_t key;

                key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
                if (key == -1)
                        log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
                else {
                        /* Limit the key to view/read/search for possessor and user. */
                        if (keyctl(KEYCTL_SETPERM, key,
                                   KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
                                   KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
                                r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
                }
        }

out:
        /* Revert back uid & gid for the last time, and exit */
        /* no extra logging, as only the first already reported error matters */
        if (getuid() != saved_uid)
                (void) setreuid(saved_uid, -1);

        if (getgid() != saved_gid)
                (void) setregid(saved_gid, -1);

        return r;
}
2872
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to 'array', first pair[0], then
 * pair[1], advancing the element counter *n for each one stored. The caller has to make sure the array has
 * room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        size_t side;

        assert(array);
        assert(n);
        assert(pair);

        for (side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
2883
2884 static int close_remaining_fds(
2885                 const ExecParameters *params,
2886                 const ExecRuntime *runtime,
2887                 const DynamicCreds *dcreds,
2888                 int user_lookup_fd,
2889                 int socket_fd,
2890                 int exec_fd,
2891                 const int *fds, size_t n_fds) {
2892
2893         size_t n_dont_close = 0;
2894         int dont_close[n_fds + 12];
2895
2896         assert(params);
2897
2898         if (params->stdin_fd >= 0)
2899                 dont_close[n_dont_close++] = params->stdin_fd;
2900         if (params->stdout_fd >= 0)
2901                 dont_close[n_dont_close++] = params->stdout_fd;
2902         if (params->stderr_fd >= 0)
2903                 dont_close[n_dont_close++] = params->stderr_fd;
2904
2905         if (socket_fd >= 0)
2906                 dont_close[n_dont_close++] = socket_fd;
2907         if (exec_fd >= 0)
2908                 dont_close[n_dont_close++] = exec_fd;
2909         if (n_fds > 0) {
2910                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2911                 n_dont_close += n_fds;
2912         }
2913
2914         if (runtime)
2915                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2916
2917         if (dcreds) {
2918                 if (dcreds->user)
2919                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2920                 if (dcreds->group)
2921                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2922         }
2923
2924         if (user_lookup_fd >= 0)
2925                 dont_close[n_dont_close++] = user_lookup_fd;
2926
2927         return close_all_fds(dont_close, n_dont_close);
2928 }
2929
2930 static int send_user_lookup(
2931                 Unit *unit,
2932                 int user_lookup_fd,
2933                 uid_t uid,
2934                 gid_t gid) {
2935
2936         assert(unit);
2937
2938         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2939          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2940          * specified. */
2941
2942         if (user_lookup_fd < 0)
2943                 return 0;
2944
2945         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2946                 return 0;
2947
2948         if (writev(user_lookup_fd,
2949                (struct iovec[]) {
2950                            IOVEC_INIT(&uid, sizeof(uid)),
2951                            IOVEC_INIT(&gid, sizeof(gid)),
2952                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2953                 return -errno;
2954
2955         return 0;
2956 }
2957
2958 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2959         int r;
2960
2961         assert(c);
2962         assert(home);
2963         assert(buf);
2964
2965         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2966
2967         if (*home)
2968                 return 0;
2969
2970         if (!c->working_directory_home)
2971                 return 0;
2972
2973         r = get_home_dir(buf);
2974         if (r < 0)
2975                 return r;
2976
2977         *home = *buf;
2978         return 1;
2979 }
2980
2981 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2982         _cleanup_strv_free_ char ** list = NULL;
2983         ExecDirectoryType t;
2984         int r;
2985
2986         assert(c);
2987         assert(p);
2988         assert(ret);
2989
2990         assert(c->dynamic_user);
2991
2992         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2993          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2994          * directories. */
2995
2996         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2997                 char **i;
2998
2999                 if (t == EXEC_DIRECTORY_CONFIGURATION)
3000                         continue;
3001
3002                 if (!p->prefix[t])
3003                         continue;
3004
3005                 STRV_FOREACH(i, c->directories[t].paths) {
3006                         char *e;
3007
3008                         if (exec_directory_is_private(c, t))
3009                                 e = path_join(p->prefix[t], "private", *i);
3010                         else
3011                                 e = path_join(p->prefix[t], *i);
3012                         if (!e)
3013                                 return -ENOMEM;
3014
3015                         r = strv_consume(&list, e);
3016                         if (r < 0)
3017                                 return r;
3018                 }
3019         }
3020
3021         *ret = TAKE_PTR(list);
3022
3023         return 0;
3024 }
3025
/* Forward declaration: exec_child() below calls this function. Since it is static, its definition lives
 * elsewhere in this file. */
static char *exec_command_line(char **argv);
3027
3028 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3029         bool using_subcgroup;
3030         char *p;
3031
3032         assert(params);
3033         assert(ret);
3034
3035         if (!params->cgroup_path)
3036                 return -EINVAL;
3037
3038         /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3039          * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3040          * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3041          * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3042          * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3043          * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3044          * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3045          * flag, which is only passed for the former statements, not for the latter. */
3046
3047         using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3048         if (using_subcgroup)
3049                 p = path_join(params->cgroup_path, ".control");
3050         else
3051                 p = strdup(params->cgroup_path);
3052         if (!p)
3053                 return -ENOMEM;
3054
3055         *ret = p;
3056         return using_subcgroup;
3057 }
3058
3059 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3060         _cleanup_(cpu_set_reset) CPUSet s = {};
3061         int r;
3062
3063         assert(c);
3064         assert(ret);
3065
3066         if (!c->numa_policy.nodes.set) {
3067                 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3068                 return 0;
3069         }
3070
3071         r = numa_to_cpu_set(&c->numa_policy, &s);
3072         if (r < 0)
3073                 return r;
3074
3075         cpu_set_reset(ret);
3076
3077         return cpu_set_add_all(ret, &s);
3078 }
3079
/* Accessor: returns the cpu_affinity_from_numa field of the given execution context. 'c' must not be NULL. */
bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
        assert(c);

        return c->cpu_affinity_from_numa;
}
3085
3086 static int exec_child(
3087                 Unit *unit,
3088                 const ExecCommand *command,
3089                 const ExecContext *context,
3090                 const ExecParameters *params,
3091                 ExecRuntime *runtime,
3092                 DynamicCreds *dcreds,
3093                 int socket_fd,
3094                 const int named_iofds[static 3],
3095                 int *fds,
3096                 size_t n_socket_fds,
3097                 size_t n_storage_fds,
3098                 char **files_env,
3099                 int user_lookup_fd,
3100                 int *exit_status) {
3101
3102         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3103         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
3104         _cleanup_free_ gid_t *supplementary_gids = NULL;
3105         const char *username = NULL, *groupname = NULL;
3106         _cleanup_free_ char *home_buffer = NULL;
3107         const char *home = NULL, *shell = NULL;
3108         char **final_argv = NULL;
3109         dev_t journal_stream_dev = 0;
3110         ino_t journal_stream_ino = 0;
3111         bool userns_set_up = false;
3112         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3113                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
3114                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
3115                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
3116 #if HAVE_SELINUX
3117         _cleanup_free_ char *mac_selinux_context_net = NULL;
3118         bool use_selinux = false;
3119 #endif
3120 #if ENABLE_SMACK
3121         bool use_smack = false;
3122 #endif
3123 #if HAVE_APPARMOR
3124         bool use_apparmor = false;
3125 #endif
3126         uid_t saved_uid = getuid();
3127         gid_t saved_gid = getgid();
3128         uid_t uid = UID_INVALID;
3129         gid_t gid = GID_INVALID;
3130         size_t n_fds;
3131         ExecDirectoryType dt;
3132         int secure_bits;
3133         _cleanup_free_ gid_t *gids_after_pam = NULL;
3134         int ngids_after_pam = 0;
3135
3136         assert(unit);
3137         assert(command);
3138         assert(context);
3139         assert(params);
3140         assert(exit_status);
3141
3142         rename_process_from_path(command->path);
3143
3144         /* We reset exactly these signals, since they are the
3145          * only ones we set to SIG_IGN in the main daemon. All
3146          * others we leave untouched because we set them to
3147          * SIG_DFL or a valid handler initially, both of which
3148          * will be demoted to SIG_DFL. */
3149         (void) default_signals(SIGNALS_CRASH_HANDLER,
3150                                SIGNALS_IGNORE, -1);
3151
3152         if (context->ignore_sigpipe)
3153                 (void) ignore_signals(SIGPIPE, -1);
3154
3155         r = reset_signal_mask();
3156         if (r < 0) {
3157                 *exit_status = EXIT_SIGNAL_MASK;
3158                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3159         }
3160
3161         if (params->idle_pipe)
3162                 do_idle_pipe_dance(params->idle_pipe);
3163
3164         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3165          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3166          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3167          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3168
3169         log_forget_fds();
3170         log_set_open_when_needed(true);
3171
3172         /* In case anything used libc syslog(), close this here, too */
3173         closelog();
3174
3175         n_fds = n_socket_fds + n_storage_fds;
3176         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
3177         if (r < 0) {
3178                 *exit_status = EXIT_FDS;
3179                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3180         }
3181
3182         if (!context->same_pgrp)
3183                 if (setsid() < 0) {
3184                         *exit_status = EXIT_SETSID;
3185                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3186                 }
3187
3188         exec_context_tty_reset(context, params);
3189
3190         if (unit_shall_confirm_spawn(unit)) {
3191                 const char *vc = params->confirm_spawn;
3192                 _cleanup_free_ char *cmdline = NULL;
3193
3194                 cmdline = exec_command_line(command->argv);
3195                 if (!cmdline) {
3196                         *exit_status = EXIT_MEMORY;
3197                         return log_oom();
3198                 }
3199
3200                 r = ask_for_confirmation(vc, unit, cmdline);
3201                 if (r != CONFIRM_EXECUTE) {
3202                         if (r == CONFIRM_PRETEND_SUCCESS) {
3203                                 *exit_status = EXIT_SUCCESS;
3204                                 return 0;
3205                         }
3206                         *exit_status = EXIT_CONFIRM;
3207                         log_unit_error(unit, "Execution cancelled by the user");
3208                         return -ECANCELED;
3209                 }
3210         }
3211
3212         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3213          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3214          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3215          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3216          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3217         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3218             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3219                 *exit_status = EXIT_MEMORY;
3220                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3221         }
3222
3223         if (context->dynamic_user && dcreds) {
3224                 _cleanup_strv_free_ char **suggested_paths = NULL;
3225
3226                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3227                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
3228                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3229                         *exit_status = EXIT_USER;
3230                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3231                 }
3232
3233                 r = compile_suggested_paths(context, params, &suggested_paths);
3234                 if (r < 0) {
3235                         *exit_status = EXIT_MEMORY;
3236                         return log_oom();
3237                 }
3238
3239                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3240                 if (r < 0) {
3241                         *exit_status = EXIT_USER;
3242                         if (r == -EILSEQ) {
3243                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3244                                 return -EOPNOTSUPP;
3245                         }
3246                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3247                 }
3248
3249                 if (!uid_is_valid(uid)) {
3250                         *exit_status = EXIT_USER;
3251                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
3252                         return -ESRCH;
3253                 }
3254
3255                 if (!gid_is_valid(gid)) {
3256                         *exit_status = EXIT_USER;
3257                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
3258                         return -ESRCH;
3259                 }
3260
3261                 if (dcreds->user)
3262                         username = dcreds->user->name;
3263
3264         } else {
3265                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3266                 if (r < 0) {
3267                         *exit_status = EXIT_USER;
3268                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3269                 }
3270
3271                 r = get_fixed_group(context, &groupname, &gid);
3272                 if (r < 0) {
3273                         *exit_status = EXIT_GROUP;
3274                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3275                 }
3276         }
3277
3278         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3279         r = get_supplementary_groups(context, username, groupname, gid,
3280                                      &supplementary_gids, &ngids);
3281         if (r < 0) {
3282                 *exit_status = EXIT_GROUP;
3283                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3284         }
3285
3286         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3287         if (r < 0) {
3288                 *exit_status = EXIT_USER;
3289                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3290         }
3291
3292         user_lookup_fd = safe_close(user_lookup_fd);
3293
3294         r = acquire_home(context, uid, &home, &home_buffer);
3295         if (r < 0) {
3296                 *exit_status = EXIT_CHDIR;
3297                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3298         }
3299
3300         /* If a socket is connected to STDIN/STDOUT/STDERR, we
3301          * must be sure to drop O_NONBLOCK */
3302         if (socket_fd >= 0)
3303                 (void) fd_nonblock(socket_fd, false);
3304
3305         /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3306          * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3307         if (params->cgroup_path) {
3308                 _cleanup_free_ char *p = NULL;
3309
3310                 r = exec_parameters_get_cgroup_path(params, &p);
3311                 if (r < 0) {
3312                         *exit_status = EXIT_CGROUP;
3313                         return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3314                 }
3315
3316                 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3317                 if (r < 0) {
3318                         *exit_status = EXIT_CGROUP;
3319                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3320                 }
3321         }
3322
3323         if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3324                 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3325                 if (r < 0) {
3326                         *exit_status = EXIT_NETWORK;
3327                         return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3328                 }
3329         }
3330
3331         r = setup_input(context, params, socket_fd, named_iofds);
3332         if (r < 0) {
3333                 *exit_status = EXIT_STDIN;
3334                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3335         }
3336
3337         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3338         if (r < 0) {
3339                 *exit_status = EXIT_STDOUT;
3340                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3341         }
3342
3343         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3344         if (r < 0) {
3345                 *exit_status = EXIT_STDERR;
3346                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3347         }
3348
3349         if (context->oom_score_adjust_set) {
3350                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3351                  * prohibit write access to this file, and we shouldn't trip up over that. */
3352                 r = set_oom_score_adjust(context->oom_score_adjust);
3353                 if (IN_SET(r, -EPERM, -EACCES))
3354                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3355                 else if (r < 0) {
3356                         *exit_status = EXIT_OOM_ADJUST;
3357                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3358                 }
3359         }
3360
3361         if (context->coredump_filter_set) {
3362                 r = set_coredump_filter(context->coredump_filter);
3363                 if (ERRNO_IS_PRIVILEGE(r))
3364                         log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3365                 else if (r < 0)
3366                         return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3367         }
3368
3369         if (context->nice_set) {
3370                 r = setpriority_closest(context->nice);
3371                 if (r < 0)
3372                         return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3373         }
3374
3375         if (context->cpu_sched_set) {
3376                 struct sched_param param = {
3377                         .sched_priority = context->cpu_sched_priority,
3378                 };
3379
3380                 r = sched_setscheduler(0,
3381                                        context->cpu_sched_policy |
3382                                        (context->cpu_sched_reset_on_fork ?
3383                                         SCHED_RESET_ON_FORK : 0),
3384                                        &param);
3385                 if (r < 0) {
3386                         *exit_status = EXIT_SETSCHEDULER;
3387                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
3388                 }
3389         }
3390
3391         if (context->cpu_affinity_from_numa || context->cpu_set.set) {
3392                 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
3393                 const CPUSet *cpu_set;
3394
3395                 if (context->cpu_affinity_from_numa) {
3396                         r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
3397                         if (r < 0) {
3398                                 *exit_status = EXIT_CPUAFFINITY;
3399                                 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
3400                         }
3401
3402                         cpu_set = &converted_cpu_set;
3403                 } else
3404                         cpu_set = &context->cpu_set;
3405
3406                 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
3407                         *exit_status = EXIT_CPUAFFINITY;
3408                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
3409                 }
3410         }
3411
3412         if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3413                 r = apply_numa_policy(&context->numa_policy);
3414                 if (r == -EOPNOTSUPP)
3415                         log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
3416                 else if (r < 0) {
3417                         *exit_status = EXIT_NUMA_POLICY;
3418                         return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3419                 }
3420         }
3421
3422         if (context->ioprio_set)
3423                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3424                         *exit_status = EXIT_IOPRIO;
3425                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3426                 }
3427
3428         if (context->timer_slack_nsec != NSEC_INFINITY)
3429                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3430                         *exit_status = EXIT_TIMERSLACK;
3431                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3432                 }
3433
3434         if (context->personality != PERSONALITY_INVALID) {
3435                 r = safe_personality(context->personality);
3436                 if (r < 0) {
3437                         *exit_status = EXIT_PERSONALITY;
3438                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3439                 }
3440         }
3441
3442         if (context->utmp_id)
3443                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3444                                       context->tty_path,
3445                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3446                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3447                                       USER_PROCESS,
3448                                       username);
3449
3450         if (uid_is_valid(uid)) {
3451                 r = chown_terminal(STDIN_FILENO, uid);
3452                 if (r < 0) {
3453                         *exit_status = EXIT_STDIN;
3454                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3455                 }
3456         }
3457
3458         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
3459          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3460          * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3461          * touch a single hierarchy too. */
3462         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3463                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3464                 if (r < 0) {
3465                         *exit_status = EXIT_CGROUP;
3466                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3467                 }
3468         }
3469
3470         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3471                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3472                 if (r < 0)
3473                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3474         }
3475
3476         r = build_environment(
3477                         unit,
3478                         context,
3479                         params,
3480                         n_fds,
3481                         home,
3482                         username,
3483                         shell,
3484                         journal_stream_dev,
3485                         journal_stream_ino,
3486                         &our_env);
3487         if (r < 0) {
3488                 *exit_status = EXIT_MEMORY;
3489                 return log_oom();
3490         }
3491
3492         r = build_pass_environment(context, &pass_env);
3493         if (r < 0) {
3494                 *exit_status = EXIT_MEMORY;
3495                 return log_oom();
3496         }
3497
3498         accum_env = strv_env_merge(5,
3499                                    params->environment,
3500                                    our_env,
3501                                    pass_env,
3502                                    context->environment,
3503                                    files_env);
3504         if (!accum_env) {
3505                 *exit_status = EXIT_MEMORY;
3506                 return log_oom();
3507         }
3508         accum_env = strv_env_clean(accum_env);
3509
3510         (void) umask(context->umask);
3511
3512         r = setup_keyring(unit, context, params, uid, gid);
3513         if (r < 0) {
3514                 *exit_status = EXIT_KEYRING;
3515                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3516         }
3517
3518         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3519         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3520
3521         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3522         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3523
3524         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3525         if (needs_ambient_hack)
3526                 needs_setuid = false;
3527         else
3528                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3529
3530         if (needs_sandboxing) {
3531                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3532                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3533                  * impacting our own code paths. */
3534
3535 #if HAVE_SELINUX
3536                 use_selinux = mac_selinux_use();
3537 #endif
3538 #if ENABLE_SMACK
3539                 use_smack = mac_smack_use();
3540 #endif
3541 #if HAVE_APPARMOR
3542                 use_apparmor = mac_apparmor_use();
3543 #endif
3544         }
3545
3546         if (needs_sandboxing) {
3547                 int which_failed;
3548
3549                 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3550                  * is set here. (See below.) */
3551
3552                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3553                 if (r < 0) {
3554                         *exit_status = EXIT_LIMITS;
3555                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3556                 }
3557         }
3558
3559         if (needs_setuid) {
3560
3561                 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3562                  * wins here. (See above.) */
3563
3564                 if (context->pam_name && username) {
3565                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3566                         if (r < 0) {
3567                                 *exit_status = EXIT_PAM;
3568                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3569                         }
3570
3571                         ngids_after_pam = getgroups_alloc(&gids_after_pam);
3572                         if (ngids_after_pam < 0) {
3573                                 *exit_status = EXIT_MEMORY;
3574                                 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
3575                         }
3576                 }
3577         }
3578
3579         if (needs_sandboxing) {
3580 #if HAVE_SELINUX
3581                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3582                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3583                         if (r < 0) {
3584                                 *exit_status = EXIT_SELINUX_CONTEXT;
3585                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3586                         }
3587                 }
3588 #endif
3589
3590                 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
3591                  * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
3592                  * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
3593                 if (context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
3594                         userns_set_up = true;
3595                         r = setup_private_users(saved_uid, saved_gid, uid, gid);
3596                         if (r < 0) {
3597                                 *exit_status = EXIT_USER;
3598                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
3599                         }
3600                 }
3601         }
3602
3603         if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3604
3605                 if (ns_type_supported(NAMESPACE_NET)) {
3606                         r = setup_netns(runtime->netns_storage_socket);
3607                         if (r == -EPERM)
3608                                 log_unit_warning_errno(unit, r,
3609                                                        "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
3610                         else if (r < 0) {
3611                                 *exit_status = EXIT_NETWORK;
3612                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3613                         }
3614                 } else if (context->network_namespace_path) {
3615                         *exit_status = EXIT_NETWORK;
3616                         return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3617                                                     "NetworkNamespacePath= is not supported, refusing.");
3618                 } else
3619                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3620         }
3621
3622         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3623         if (needs_mount_namespace) {
3624                 _cleanup_free_ char *error_path = NULL;
3625
3626                 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3627                 if (r < 0) {
3628                         *exit_status = EXIT_NAMESPACE;
3629                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3630                                                     error_path ? ": " : "", strempty(error_path));
3631                 }
3632         }
3633
3634         if (needs_sandboxing) {
3635                 r = apply_protect_hostname(unit, context, exit_status);
3636                 if (r < 0)
3637                         return r;
3638         }
3639
3640         /* Drop groups as early as possible.
3641          * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
3642          * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
3643         if (needs_setuid) {
3644                 _cleanup_free_ gid_t *gids_to_enforce = NULL;
3645                 int ngids_to_enforce = 0;
3646
3647                 ngids_to_enforce = merge_gid_lists(supplementary_gids,
3648                                                    ngids,
3649                                                    gids_after_pam,
3650                                                    ngids_after_pam,
3651                                                    &gids_to_enforce);
3652                 if (ngids_to_enforce < 0) {
3653                         *exit_status = EXIT_MEMORY;
3654                         return log_unit_error_errno(unit,
3655                                                     ngids_to_enforce,
3656                                                     "Failed to merge group lists. Group membership might be incorrect: %m");
3657                 }
3658
3659                 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
3660                 if (r < 0) {
3661                         *exit_status = EXIT_GROUP;
3662                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3663                 }
3664         }
3665
3666         /* If the user namespace was not set up above, try to do it now.
3667          * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
3668          * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
3669          * case of mount namespaces being less privileged when the mount point list is copied from a
3670          * different user namespace). */
3671
3672         if (needs_sandboxing && context->private_users && !userns_set_up) {
3673                 r = setup_private_users(saved_uid, saved_gid, uid, gid);
3674                 if (r < 0) {
3675                         *exit_status = EXIT_USER;
3676                         return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3677                 }
3678         }
3679
3680         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3681          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3682          * however if we have it as we want to keep it open until the final execve(). */
3683
3684         if (params->exec_fd >= 0) {
3685                 exec_fd = params->exec_fd;
3686
3687                 if (exec_fd < 3 + (int) n_fds) {
3688                         int moved_fd;
3689
3690                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3691                          * process we are about to execute. */
3692
3693                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3694                         if (moved_fd < 0) {
3695                                 *exit_status = EXIT_FDS;
3696                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3697                         }
3698
3699                         safe_close(exec_fd);
3700                         exec_fd = moved_fd;
3701                 } else {
3702                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3703                         r = fd_cloexec(exec_fd, true);
3704                         if (r < 0) {
3705                                 *exit_status = EXIT_FDS;
3706                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3707                         }
3708                 }
3709
3710                 fds_with_exec_fd = newa(int, n_fds + 1);
3711                 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
3712                 fds_with_exec_fd[n_fds] = exec_fd;
3713                 n_fds_with_exec_fd = n_fds + 1;
3714         } else {
3715                 fds_with_exec_fd = fds;
3716                 n_fds_with_exec_fd = n_fds;
3717         }
3718
3719         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3720         if (r >= 0)
3721                 r = shift_fds(fds, n_fds);
3722         if (r >= 0)
3723                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3724         if (r < 0) {
3725                 *exit_status = EXIT_FDS;
3726                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3727         }
3728
3729         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3730          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3731          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3732          * came this far. */
3733
3734         secure_bits = context->secure_bits;
3735
3736         if (needs_sandboxing) {
3737                 uint64_t bset;
3738
3739                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3740                  * requested. (Note this is placed after the general resource limit initialization, see
3741                  * above, in order to take precedence.) */
3742                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3743                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3744                                 *exit_status = EXIT_LIMITS;
3745                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3746                         }
3747                 }
3748
3749 #if ENABLE_SMACK
3750                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3751                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3752                 if (use_smack) {
3753                         r = setup_smack(context, command);
3754                         if (r < 0) {
3755                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3756                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3757                         }
3758                 }
3759 #endif
3760
3761                 bset = context->capability_bounding_set;
3762                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3763                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3764                  * instead of us doing that */
3765                 if (needs_ambient_hack)
3766                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3767                                 (UINT64_C(1) << CAP_SETUID) |
3768                                 (UINT64_C(1) << CAP_SETGID);
3769
3770                 if (!cap_test_all(bset)) {
3771                         r = capability_bounding_set_drop(bset, false);
3772                         if (r < 0) {
3773                                 *exit_status = EXIT_CAPABILITIES;
3774                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3775                         }
3776                 }
3777
3778                 /* This is done before enforce_user, but ambient set
3779                  * does not survive over setresuid() if keep_caps is not set. */
3780                 if (!needs_ambient_hack) {
3781                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3782                         if (r < 0) {
3783                                 *exit_status = EXIT_CAPABILITIES;
3784                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3785                         }
3786                 }
3787         }
3788
3789         /* chroot to root directory first, before we lose the ability to chroot */
3790         r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3791         if (r < 0)
3792                 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3793
3794         if (needs_setuid) {
3795                 if (uid_is_valid(uid)) {
3796                         r = enforce_user(context, uid);
3797                         if (r < 0) {
3798                                 *exit_status = EXIT_USER;
3799                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3800                         }
3801
3802                         if (!needs_ambient_hack &&
3803                             context->capability_ambient_set != 0) {
3804
3805                                 /* Fix the ambient capabilities after user change. */
3806                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3807                                 if (r < 0) {
3808                                         *exit_status = EXIT_CAPABILITIES;
3809                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3810                                 }
3811
3812                                 /* If we were asked to change user and ambient capabilities
3813                                  * were requested, we had to add keep-caps to the securebits
3814                                  * so that we would maintain the inherited capability set
3815                                  * through the setresuid(). Make sure that the bit is added
3816                                  * also to the context secure_bits so that we don't try to
3817                                  * drop the bit away next. */
3818
3819                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3820                         }
3821                 }
3822         }
3823
3824         /* Apply working directory here, because the working directory might be on NFS and only the user running
3825          * this service might have the correct privilege to change to the working directory */
3826         r = apply_working_directory(context, params, home, exit_status);
3827         if (r < 0)
3828                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3829
3830         if (needs_sandboxing) {
3831                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3832                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3833                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3834                  * are restricted. */
3835
3836 #if HAVE_SELINUX
3837                 if (use_selinux) {
3838                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3839
3840                         if (exec_context) {
3841                                 r = setexeccon(exec_context);
3842                                 if (r < 0) {
3843                                         *exit_status = EXIT_SELINUX_CONTEXT;
3844                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3845                                 }
3846                         }
3847                 }
3848 #endif
3849
3850 #if HAVE_APPARMOR
3851                 if (use_apparmor && context->apparmor_profile) {
3852                         r = aa_change_onexec(context->apparmor_profile);
3853                         if (r < 0 && !context->apparmor_profile_ignore) {
3854                                 *exit_status = EXIT_APPARMOR_PROFILE;
3855                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3856                         }
3857                 }
3858 #endif
3859
3860                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3861                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3862                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3863                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3864                                 *exit_status = EXIT_SECUREBITS;
3865                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3866                         }
3867
3868                 if (context_has_no_new_privileges(context))
3869                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3870                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3871                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3872                         }
3873
3874 #if HAVE_SECCOMP
3875                 r = apply_address_families(unit, context);
3876                 if (r < 0) {
3877                         *exit_status = EXIT_ADDRESS_FAMILIES;
3878                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3879                 }
3880
3881                 r = apply_memory_deny_write_execute(unit, context);
3882                 if (r < 0) {
3883                         *exit_status = EXIT_SECCOMP;
3884                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3885                 }
3886
3887                 r = apply_restrict_realtime(unit, context);
3888                 if (r < 0) {
3889                         *exit_status = EXIT_SECCOMP;
3890                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3891                 }
3892
3893                 r = apply_restrict_suid_sgid(unit, context);
3894                 if (r < 0) {
3895                         *exit_status = EXIT_SECCOMP;
3896                         return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3897                 }
3898
3899                 r = apply_restrict_namespaces(unit, context);
3900                 if (r < 0) {
3901                         *exit_status = EXIT_SECCOMP;
3902                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3903                 }
3904
3905                 r = apply_protect_sysctl(unit, context);
3906                 if (r < 0) {
3907                         *exit_status = EXIT_SECCOMP;
3908                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3909                 }
3910
3911                 r = apply_protect_kernel_modules(unit, context);
3912                 if (r < 0) {
3913                         *exit_status = EXIT_SECCOMP;
3914                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3915                 }
3916
3917                 r = apply_protect_kernel_logs(unit, context);
3918                 if (r < 0) {
3919                         *exit_status = EXIT_SECCOMP;
3920                         return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
3921                 }
3922
3923                 r = apply_protect_clock(unit, context);
3924                 if (r < 0) {
3925                         *exit_status = EXIT_SECCOMP;
3926                         return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
3927                 }
3928
3929                 r = apply_private_devices(unit, context);
3930                 if (r < 0) {
3931                         *exit_status = EXIT_SECCOMP;
3932                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3933                 }
3934
3935                 r = apply_syscall_archs(unit, context);
3936                 if (r < 0) {
3937                         *exit_status = EXIT_SECCOMP;
3938                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3939                 }
3940
3941                 r = apply_lock_personality(unit, context);
3942                 if (r < 0) {
3943                         *exit_status = EXIT_SECCOMP;
3944                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3945                 }
3946
3947                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3948                  * by the filter as little as possible. */
3949                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3950                 if (r < 0) {
3951                         *exit_status = EXIT_SECCOMP;
3952                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3953                 }
3954 #endif
3955         }
3956
3957         if (!strv_isempty(context->unset_environment)) {
3958                 char **ee = NULL;
3959
3960                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3961                 if (!ee) {
3962                         *exit_status = EXIT_MEMORY;
3963                         return log_oom();
3964                 }
3965
3966                 strv_free_and_replace(accum_env, ee);
3967         }
3968
3969         if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3970                 replaced_argv = replace_env_argv(command->argv, accum_env);
3971                 if (!replaced_argv) {
3972                         *exit_status = EXIT_MEMORY;
3973                         return log_oom();
3974                 }
3975                 final_argv = replaced_argv;
3976         } else
3977                 final_argv = command->argv;
3978
3979         if (DEBUG_LOGGING) {
3980                 _cleanup_free_ char *line;
3981
3982                 line = exec_command_line(final_argv);
3983                 if (line)
3984                         log_struct(LOG_DEBUG,
3985                                    "EXECUTABLE=%s", command->path,
3986                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3987                                    LOG_UNIT_ID(unit),
3988                                    LOG_UNIT_INVOCATION_ID(unit));
3989         }
3990
3991         if (exec_fd >= 0) {
3992                 uint8_t hot = 1;
3993
3994                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3995                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3996
3997                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3998                         *exit_status = EXIT_EXEC;
3999                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4000                 }
4001         }
4002
4003         execve(command->path, final_argv, accum_env);
4004         r = -errno;
4005
4006         if (exec_fd >= 0) {
4007                 uint8_t hot = 0;
4008
4009                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4010                  * that POLLHUP on it no longer means execve() succeeded. */
4011
4012                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4013                         *exit_status = EXIT_EXEC;
4014                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4015                 }
4016         }
4017
4018         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4019                 log_struct_errno(LOG_INFO, r,
4020                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4021                                  LOG_UNIT_ID(unit),
4022                                  LOG_UNIT_INVOCATION_ID(unit),
4023                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4024                                                   command->path),
4025                                  "EXECUTABLE=%s", command->path);
4026                 return 0;
4027         }
4028
4029         *exit_status = EXIT_EXEC;
4030         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
4031 }
4032
4033 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4034 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4035
4036 int exec_spawn(Unit *unit,
4037                ExecCommand *command,
4038                const ExecContext *context,
4039                const ExecParameters *params,
4040                ExecRuntime *runtime,
4041                DynamicCreds *dcreds,
4042                pid_t *ret) {
4043
4044         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4045         _cleanup_free_ char *subcgroup_path = NULL;
4046         _cleanup_strv_free_ char **files_env = NULL;
4047         size_t n_storage_fds = 0, n_socket_fds = 0;
4048         _cleanup_free_ char *line = NULL;
4049         pid_t pid;
4050
4051         assert(unit);
4052         assert(command);
4053         assert(context);
4054         assert(ret);
4055         assert(params);
4056         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4057
4058         if (context->std_input == EXEC_INPUT_SOCKET ||
4059             context->std_output == EXEC_OUTPUT_SOCKET ||
4060             context->std_error == EXEC_OUTPUT_SOCKET) {
4061
4062                 if (params->n_socket_fds > 1) {
4063                         log_unit_error(unit, "Got more than one socket.");
4064                         return -EINVAL;
4065                 }
4066
4067                 if (params->n_socket_fds == 0) {
4068                         log_unit_error(unit, "Got no socket.");
4069                         return -EINVAL;
4070                 }
4071
4072                 socket_fd = params->fds[0];
4073         } else {
4074                 socket_fd = -1;
4075                 fds = params->fds;
4076                 n_socket_fds = params->n_socket_fds;
4077                 n_storage_fds = params->n_storage_fds;
4078         }
4079
4080         r = exec_context_named_iofds(context, params, named_iofds);
4081         if (r < 0)
4082                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4083
4084         r = exec_context_load_environment(unit, context, &files_env);
4085         if (r < 0)
4086                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4087
4088         line = exec_command_line(command->argv);
4089         if (!line)
4090                 return log_oom();
4091
4092         log_struct(LOG_DEBUG,
4093                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
4094                    "EXECUTABLE=%s", command->path,
4095                    LOG_UNIT_ID(unit),
4096                    LOG_UNIT_INVOCATION_ID(unit));
4097
4098         if (params->cgroup_path) {
4099                 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4100                 if (r < 0)
4101                         return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4102                 if (r > 0) { /* We are using a child cgroup */
4103                         r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4104                         if (r < 0)
4105                                 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4106                 }
4107         }
4108
4109         pid = fork();
4110         if (pid < 0)
4111                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4112
4113         if (pid == 0) {
4114                 int exit_status = EXIT_SUCCESS;
4115
4116                 r = exec_child(unit,
4117                                command,
4118                                context,
4119                                params,
4120                                runtime,
4121                                dcreds,
4122                                socket_fd,
4123                                named_iofds,
4124                                fds,
4125                                n_socket_fds,
4126                                n_storage_fds,
4127                                files_env,
4128                                unit->manager->user_lookup_fds[1],
4129                                &exit_status);
4130
4131                 if (r < 0) {
4132                         const char *status =
4133                                 exit_status_to_string(exit_status,
4134                                                       EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4135
4136                         log_struct_errno(LOG_ERR, r,
4137                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4138                                          LOG_UNIT_ID(unit),
4139                                          LOG_UNIT_INVOCATION_ID(unit),
4140                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4141                                                           status, command->path),
4142                                          "EXECUTABLE=%s", command->path);
4143                 }
4144
4145                 _exit(exit_status);
4146         }
4147
4148         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4149
4150         /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4151          * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4152          * process will be killed too). */
4153         if (subcgroup_path)
4154                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4155
4156         exec_status_start(&command->exec_status, pid);
4157
4158         *ret = pid;
4159         return 0;
4160 }
4161
4162 void exec_context_init(ExecContext *c) {
4163         ExecDirectoryType i;
4164
4165         assert(c);
4166
4167         c->umask = 0022;
4168         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4169         c->cpu_sched_policy = SCHED_OTHER;
4170         c->syslog_priority = LOG_DAEMON|LOG_INFO;
4171         c->syslog_level_prefix = true;
4172         c->ignore_sigpipe = true;
4173         c->timer_slack_nsec = NSEC_INFINITY;
4174         c->personality = PERSONALITY_INVALID;
4175         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4176                 c->directories[i].mode = 0755;
4177         c->timeout_clean_usec = USEC_INFINITY;
4178         c->capability_bounding_set = CAP_ALL;
4179         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4180         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4181         c->log_level_max = -1;
4182         numa_policy_reset(&c->numa_policy);
4183 }
4184
4185 void exec_context_done(ExecContext *c) {
4186         ExecDirectoryType i;
4187         size_t l;
4188
4189         assert(c);
4190
4191         c->environment = strv_free(c->environment);
4192         c->environment_files = strv_free(c->environment_files);
4193         c->pass_environment = strv_free(c->pass_environment);
4194         c->unset_environment = strv_free(c->unset_environment);
4195
4196         rlimit_free_all(c->rlimit);
4197
4198         for (l = 0; l < 3; l++) {
4199                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4200                 c->stdio_file[l] = mfree(c->stdio_file[l]);
4201         }
4202
4203         c->working_directory = mfree(c->working_directory);
4204         c->root_directory = mfree(c->root_directory);
4205         c->root_image = mfree(c->root_image);
4206         c->tty_path = mfree(c->tty_path);
4207         c->syslog_identifier = mfree(c->syslog_identifier);
4208         c->user = mfree(c->user);
4209         c->group = mfree(c->group);
4210
4211         c->supplementary_groups = strv_free(c->supplementary_groups);
4212
4213         c->pam_name = mfree(c->pam_name);
4214
4215         c->read_only_paths = strv_free(c->read_only_paths);
4216         c->read_write_paths = strv_free(c->read_write_paths);
4217         c->inaccessible_paths = strv_free(c->inaccessible_paths);
4218
4219         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4220         c->bind_mounts = NULL;
4221         c->n_bind_mounts = 0;
4222         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4223         c->temporary_filesystems = NULL;
4224         c->n_temporary_filesystems = 0;
4225
4226         cpu_set_reset(&c->cpu_set);
4227         numa_policy_reset(&c->numa_policy);
4228
4229         c->utmp_id = mfree(c->utmp_id);
4230         c->selinux_context = mfree(c->selinux_context);
4231         c->apparmor_profile = mfree(c->apparmor_profile);
4232         c->smack_process_label = mfree(c->smack_process_label);
4233
4234         c->syscall_filter = hashmap_free(c->syscall_filter);
4235         c->syscall_archs = set_free(c->syscall_archs);
4236         c->address_families = set_free(c->address_families);
4237
4238         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
4239                 c->directories[i].paths = strv_free(c->directories[i].paths);
4240
4241         c->log_level_max = -1;
4242
4243         exec_context_free_log_extra_fields(c);
4244
4245         c->log_ratelimit_interval_usec = 0;
4246         c->log_ratelimit_burst = 0;
4247
4248         c->stdin_data = mfree(c->stdin_data);
4249         c->stdin_data_size = 0;
4250
4251         c->network_namespace_path = mfree(c->network_namespace_path);
4252
4253         c->log_namespace = mfree(c->log_namespace);
4254 }
4255
4256 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4257         char **i;
4258
4259         assert(c);
4260
4261         if (!runtime_prefix)
4262                 return 0;
4263
4264         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4265                 _cleanup_free_ char *p;
4266
4267                 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4268                         p = path_join(runtime_prefix, "private", *i);
4269                 else
4270                         p = path_join(runtime_prefix, *i);
4271                 if (!p)
4272                         return -ENOMEM;
4273
4274                 /* We execute this synchronously, since we need to be sure this is gone when we start the
4275                  * service next. */
4276                 (void) rm_rf(p, REMOVE_ROOT);
4277         }
4278
4279         return 0;
4280 }
4281
4282 static void exec_command_done(ExecCommand *c) {
4283         assert(c);
4284
4285         c->path = mfree(c->path);
4286         c->argv = strv_free(c->argv);
4287 }
4288
4289 void exec_command_done_array(ExecCommand *c, size_t n) {
4290         size_t i;
4291
4292         for (i = 0; i < n; i++)
4293                 exec_command_done(c+i);
4294 }
4295
4296 ExecCommand* exec_command_free_list(ExecCommand *c) {
4297         ExecCommand *i;
4298
4299         while ((i = c)) {
4300                 LIST_REMOVE(command, c, i);
4301                 exec_command_done(i);
4302                 free(i);
4303         }
4304
4305         return NULL;
4306 }
4307
4308 void exec_command_free_array(ExecCommand **c, size_t n) {
4309         size_t i;
4310
4311         for (i = 0; i < n; i++)
4312                 c[i] = exec_command_free_list(c[i]);
4313 }
4314
4315 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4316         size_t i;
4317
4318         for (i = 0; i < n; i++)
4319                 exec_status_reset(&c[i].exec_status);
4320 }
4321
4322 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4323         size_t i;
4324
4325         for (i = 0; i < n; i++) {
4326                 ExecCommand *z;
4327
4328                 LIST_FOREACH(command, z, c[i])
4329                         exec_status_reset(&z->exec_status);
4330         }
4331 }
4332
4333 typedef struct InvalidEnvInfo {
4334         const Unit *unit;
4335         const char *path;
4336 } InvalidEnvInfo;
4337
4338 static void invalid_env(const char *p, void *userdata) {
4339         InvalidEnvInfo *info = userdata;
4340
4341         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
4342 }
4343
4344 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4345         assert(c);
4346
4347         switch (fd_index) {
4348
4349         case STDIN_FILENO:
4350                 if (c->std_input != EXEC_INPUT_NAMED_FD)
4351                         return NULL;
4352
4353                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
4354
4355         case STDOUT_FILENO:
4356                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4357                         return NULL;
4358
4359                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
4360
4361         case STDERR_FILENO:
4362                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4363                         return NULL;
4364
4365                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
4366
4367         default:
4368                 return NULL;
4369         }
4370 }
4371
4372 static int exec_context_named_iofds(
4373                 const ExecContext *c,
4374                 const ExecParameters *p,
4375                 int named_iofds[static 3]) {
4376
4377         size_t i, targets;
4378         const char* stdio_fdname[3];
4379         size_t n_fds;
4380
4381         assert(c);
4382         assert(p);
4383         assert(named_iofds);
4384
4385         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4386                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4387                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
4388
4389         for (i = 0; i < 3; i++)
4390                 stdio_fdname[i] = exec_context_fdname(c, i);
4391
4392         n_fds = p->n_storage_fds + p->n_socket_fds;
4393
4394         for (i = 0; i < n_fds  && targets > 0; i++)
4395                 if (named_iofds[STDIN_FILENO] < 0 &&
4396                     c->std_input == EXEC_INPUT_NAMED_FD &&
4397                     stdio_fdname[STDIN_FILENO] &&
4398                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4399
4400                         named_iofds[STDIN_FILENO] = p->fds[i];
4401                         targets--;
4402
4403                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4404                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
4405                            stdio_fdname[STDOUT_FILENO] &&
4406                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4407
4408                         named_iofds[STDOUT_FILENO] = p->fds[i];
4409                         targets--;
4410
4411                 } else if (named_iofds[STDERR_FILENO] < 0 &&
4412                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
4413                            stdio_fdname[STDERR_FILENO] &&
4414                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4415
4416                         named_iofds[STDERR_FILENO] = p->fds[i];
4417                         targets--;
4418                 }
4419
4420         return targets == 0 ? 0 : -ENOENT;
4421 }
4422
4423 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
4424         char **i, **r = NULL;
4425
4426         assert(c);
4427         assert(l);
4428
4429         STRV_FOREACH(i, c->environment_files) {
4430                 char *fn;
4431                 int k;
4432                 unsigned n;
4433                 bool ignore = false;
4434                 char **p;
4435                 _cleanup_globfree_ glob_t pglob = {};
4436
4437                 fn = *i;
4438
4439                 if (fn[0] == '-') {
4440                         ignore = true;
4441                         fn++;
4442                 }
4443
4444                 if (!path_is_absolute(fn)) {
4445                         if (ignore)
4446                                 continue;
4447
4448                         strv_free(r);
4449                         return -EINVAL;
4450                 }
4451
4452                 /* Filename supports globbing, take all matching files */
4453                 k = safe_glob(fn, 0, &pglob);
4454                 if (k < 0) {
4455                         if (ignore)
4456                                 continue;
4457
4458                         strv_free(r);
4459                         return k;
4460                 }
4461
4462                 /* When we don't match anything, -ENOENT should be returned */
4463                 assert(pglob.gl_pathc > 0);
4464
4465                 for (n = 0; n < pglob.gl_pathc; n++) {
4466                         k = load_env_file(NULL, pglob.gl_pathv[n], &p);
4467                         if (k < 0) {
4468                                 if (ignore)
4469                                         continue;
4470
4471                                 strv_free(r);
4472                                 return k;
4473                         }
4474                         /* Log invalid environment variables with filename */
4475                         if (p) {
4476                                 InvalidEnvInfo info = {
4477                                         .unit = unit,
4478                                         .path = pglob.gl_pathv[n]
4479                                 };
4480
4481                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
4482                         }
4483
4484                         if (!r)
4485                                 r = p;
4486                         else {
4487                                 char **m;
4488
4489                                 m = strv_env_merge(2, r, p);
4490                                 strv_free(r);
4491                                 strv_free(p);
4492                                 if (!m)
4493                                         return -ENOMEM;
4494
4495                                 r = m;
4496                         }
4497                 }
4498         }
4499
4500         *l = r;
4501
4502         return 0;
4503 }
4504
4505 static bool tty_may_match_dev_console(const char *tty) {
4506         _cleanup_free_ char *resolved = NULL;
4507
4508         if (!tty)
4509                 return true;
4510
4511         tty = skip_dev_prefix(tty);
4512
4513         /* trivial identity? */
4514         if (streq(tty, "console"))
4515                 return true;
4516
4517         if (resolve_dev_console(&resolved) < 0)
4518                 return true; /* if we could not resolve, assume it may */
4519
4520         /* "tty0" means the active VC, so it may be the same sometimes */
4521         return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
4522 }
4523
4524 static bool exec_context_may_touch_tty(const ExecContext *ec) {
4525         assert(ec);
4526
4527         return ec->tty_reset ||
4528                 ec->tty_vhangup ||
4529                 ec->tty_vt_disallocate ||
4530                 is_terminal_input(ec->std_input) ||
4531                 is_terminal_output(ec->std_output) ||
4532                 is_terminal_output(ec->std_error);
4533 }
4534
4535 bool exec_context_may_touch_console(const ExecContext *ec) {
4536
4537         return exec_context_may_touch_tty(ec) &&
4538                tty_may_match_dev_console(exec_context_tty_path(ec));
4539 }
4540
4541 static void strv_fprintf(FILE *f, char **l) {
4542         char **g;
4543
4544         assert(f);
4545
4546         STRV_FOREACH(g, l)
4547                 fprintf(f, " %s", *g);
4548 }
4549
4550 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
4551         char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
4552         ExecDirectoryType dt;
4553         unsigned i;
4554         int r;
4555
4556         assert(c);
4557         assert(f);
4558
4559         prefix = strempty(prefix);
4560
4561         fprintf(f,
4562                 "%sUMask: %04o\n"
4563                 "%sWorkingDirectory: %s\n"
4564                 "%sRootDirectory: %s\n"
4565                 "%sNonBlocking: %s\n"
4566                 "%sPrivateTmp: %s\n"
4567                 "%sPrivateDevices: %s\n"
4568                 "%sProtectKernelTunables: %s\n"
4569                 "%sProtectKernelModules: %s\n"
4570                 "%sProtectKernelLogs: %s\n"
4571                 "%sProtectClock: %s\n"
4572                 "%sProtectControlGroups: %s\n"
4573                 "%sPrivateNetwork: %s\n"
4574                 "%sPrivateUsers: %s\n"
4575                 "%sProtectHome: %s\n"
4576                 "%sProtectSystem: %s\n"
4577                 "%sMountAPIVFS: %s\n"
4578                 "%sIgnoreSIGPIPE: %s\n"
4579                 "%sMemoryDenyWriteExecute: %s\n"
4580                 "%sRestrictRealtime: %s\n"
4581                 "%sRestrictSUIDSGID: %s\n"
4582                 "%sKeyringMode: %s\n"
4583                 "%sProtectHostname: %s\n",
4584                 prefix, c->umask,
4585                 prefix, c->working_directory ? c->working_directory : "/",
4586                 prefix, c->root_directory ? c->root_directory : "/",
4587                 prefix, yes_no(c->non_blocking),
4588                 prefix, yes_no(c->private_tmp),
4589                 prefix, yes_no(c->private_devices),
4590                 prefix, yes_no(c->protect_kernel_tunables),
4591                 prefix, yes_no(c->protect_kernel_modules),
4592                 prefix, yes_no(c->protect_kernel_logs),
4593                 prefix, yes_no(c->protect_clock),
4594                 prefix, yes_no(c->protect_control_groups),
4595                 prefix, yes_no(c->private_network),
4596                 prefix, yes_no(c->private_users),
4597                 prefix, protect_home_to_string(c->protect_home),
4598                 prefix, protect_system_to_string(c->protect_system),
4599                 prefix, yes_no(c->mount_apivfs),
4600                 prefix, yes_no(c->ignore_sigpipe),
4601                 prefix, yes_no(c->memory_deny_write_execute),
4602                 prefix, yes_no(c->restrict_realtime),
4603                 prefix, yes_no(c->restrict_suid_sgid),
4604                 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4605                 prefix, yes_no(c->protect_hostname));
4606
4607         if (c->root_image)
4608                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4609
4610         STRV_FOREACH(e, c->environment)
4611                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4612
4613         STRV_FOREACH(e, c->environment_files)
4614                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4615
4616         STRV_FOREACH(e, c->pass_environment)
4617                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4618
4619         STRV_FOREACH(e, c->unset_environment)
4620                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4621
4622         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4623
4624         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4625                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4626
4627                 STRV_FOREACH(d, c->directories[dt].paths)
4628                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4629         }
4630
4631         fprintf(f,
4632                 "%sTimeoutCleanSec: %s\n",
4633                 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
4634
4635         if (c->nice_set)
4636                 fprintf(f,
4637                         "%sNice: %i\n",
4638                         prefix, c->nice);
4639
4640         if (c->oom_score_adjust_set)
4641                 fprintf(f,
4642                         "%sOOMScoreAdjust: %i\n",
4643                         prefix, c->oom_score_adjust);
4644
4645         if (c->coredump_filter_set)
4646                 fprintf(f,
4647                         "%sCoredumpFilter: 0x%"PRIx64"\n",
4648                         prefix, c->coredump_filter);
4649
4650         for (i = 0; i < RLIM_NLIMITS; i++)
4651                 if (c->rlimit[i]) {
4652                         fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
4653                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4654                         fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
4655                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4656                 }
4657
4658         if (c->ioprio_set) {
4659                 _cleanup_free_ char *class_str = NULL;
4660
4661                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4662                 if (r >= 0)
4663                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4664
4665                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4666         }
4667
4668         if (c->cpu_sched_set) {
4669                 _cleanup_free_ char *policy_str = NULL;
4670
4671                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4672                 if (r >= 0)
4673                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4674
4675                 fprintf(f,
4676                         "%sCPUSchedulingPriority: %i\n"
4677                         "%sCPUSchedulingResetOnFork: %s\n",
4678                         prefix, c->cpu_sched_priority,
4679                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4680         }
4681
4682         if (c->cpu_set.set) {
4683                 _cleanup_free_ char *affinity = NULL;
4684
4685                 affinity = cpu_set_to_range_string(&c->cpu_set);
4686                 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
4687         }
4688
4689         if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4690                 _cleanup_free_ char *nodes = NULL;
4691
4692                 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4693                 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4694                 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4695         }
4696
4697         if (c->timer_slack_nsec != NSEC_INFINITY)
4698                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4699
4700         fprintf(f,
4701                 "%sStandardInput: %s\n"
4702                 "%sStandardOutput: %s\n"
4703                 "%sStandardError: %s\n",
4704                 prefix, exec_input_to_string(c->std_input),
4705                 prefix, exec_output_to_string(c->std_output),
4706                 prefix, exec_output_to_string(c->std_error));
4707
4708         if (c->std_input == EXEC_INPUT_NAMED_FD)
4709                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4710         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4711                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4712         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4713                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4714
4715         if (c->std_input == EXEC_INPUT_FILE)
4716                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4717         if (c->std_output == EXEC_OUTPUT_FILE)
4718                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4719         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4720                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4721         if (c->std_error == EXEC_OUTPUT_FILE)
4722                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4723         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4724                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4725
4726         if (c->tty_path)
4727                 fprintf(f,
4728                         "%sTTYPath: %s\n"
4729                         "%sTTYReset: %s\n"
4730                         "%sTTYVHangup: %s\n"
4731                         "%sTTYVTDisallocate: %s\n",
4732                         prefix, c->tty_path,
4733                         prefix, yes_no(c->tty_reset),
4734                         prefix, yes_no(c->tty_vhangup),
4735                         prefix, yes_no(c->tty_vt_disallocate));
4736
4737         if (IN_SET(c->std_output,
4738                    EXEC_OUTPUT_SYSLOG,
4739                    EXEC_OUTPUT_KMSG,
4740                    EXEC_OUTPUT_JOURNAL,
4741                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4742                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4743                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4744             IN_SET(c->std_error,
4745                    EXEC_OUTPUT_SYSLOG,
4746                    EXEC_OUTPUT_KMSG,
4747                    EXEC_OUTPUT_JOURNAL,
4748                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4749                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4750                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4751
4752                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4753
4754                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4755                 if (r >= 0)
4756                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4757
4758                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4759                 if (r >= 0)
4760                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4761         }
4762
4763         if (c->log_level_max >= 0) {
4764                 _cleanup_free_ char *t = NULL;
4765
4766                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4767
4768                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4769         }
4770
4771         if (c->log_ratelimit_interval_usec > 0) {
4772                 char buf_timespan[FORMAT_TIMESPAN_MAX];
4773
4774                 fprintf(f,
4775                         "%sLogRateLimitIntervalSec: %s\n",
4776                         prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
4777         }
4778
4779         if (c->log_ratelimit_burst > 0)
4780                 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
4781
4782         if (c->n_log_extra_fields > 0) {
4783                 size_t j;
4784
4785                 for (j = 0; j < c->n_log_extra_fields; j++) {
4786                         fprintf(f, "%sLogExtraFields: ", prefix);
4787                         fwrite(c->log_extra_fields[j].iov_base,
4788                                1, c->log_extra_fields[j].iov_len,
4789                                f);
4790                         fputc('\n', f);
4791                 }
4792         }
4793
4794         if (c->log_namespace)
4795                 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
4796
4797         if (c->secure_bits) {
4798                 _cleanup_free_ char *str = NULL;
4799
4800                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4801                 if (r >= 0)
4802                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4803         }
4804
4805         if (c->capability_bounding_set != CAP_ALL) {
4806                 _cleanup_free_ char *str = NULL;
4807
4808                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4809                 if (r >= 0)
4810                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4811         }
4812
4813         if (c->capability_ambient_set != 0) {
4814                 _cleanup_free_ char *str = NULL;
4815
4816                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4817                 if (r >= 0)
4818                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4819         }
4820
4821         if (c->user)
4822                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4823         if (c->group)
4824                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4825
4826         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4827
4828         if (!strv_isempty(c->supplementary_groups)) {
4829                 fprintf(f, "%sSupplementaryGroups:", prefix);
4830                 strv_fprintf(f, c->supplementary_groups);
4831                 fputs("\n", f);
4832         }
4833
4834         if (c->pam_name)
4835                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4836
4837         if (!strv_isempty(c->read_write_paths)) {
4838                 fprintf(f, "%sReadWritePaths:", prefix);
4839                 strv_fprintf(f, c->read_write_paths);
4840                 fputs("\n", f);
4841         }
4842
4843         if (!strv_isempty(c->read_only_paths)) {
4844                 fprintf(f, "%sReadOnlyPaths:", prefix);
4845                 strv_fprintf(f, c->read_only_paths);
4846                 fputs("\n", f);
4847         }
4848
4849         if (!strv_isempty(c->inaccessible_paths)) {
4850                 fprintf(f, "%sInaccessiblePaths:", prefix);
4851                 strv_fprintf(f, c->inaccessible_paths);
4852                 fputs("\n", f);
4853         }
4854
4855         if (c->n_bind_mounts > 0)
4856                 for (i = 0; i < c->n_bind_mounts; i++)
4857                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4858                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4859                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4860                                 c->bind_mounts[i].source,
4861                                 c->bind_mounts[i].destination,
4862                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4863
4864         if (c->n_temporary_filesystems > 0)
4865                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4866                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4867
4868                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4869                                 t->path,
4870                                 isempty(t->options) ? "" : ":",
4871                                 strempty(t->options));
4872                 }
4873
4874         if (c->utmp_id)
4875                 fprintf(f,
4876                         "%sUtmpIdentifier: %s\n",
4877                         prefix, c->utmp_id);
4878
4879         if (c->selinux_context)
4880                 fprintf(f,
4881                         "%sSELinuxContext: %s%s\n",
4882                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4883
4884         if (c->apparmor_profile)
4885                 fprintf(f,
4886                         "%sAppArmorProfile: %s%s\n",
4887                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4888
4889         if (c->smack_process_label)
4890                 fprintf(f,
4891                         "%sSmackProcessLabel: %s%s\n",
4892                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4893
4894         if (c->personality != PERSONALITY_INVALID)
4895                 fprintf(f,
4896                         "%sPersonality: %s\n",
4897                         prefix, strna(personality_to_string(c->personality)));
4898
4899         fprintf(f,
4900                 "%sLockPersonality: %s\n",
4901                 prefix, yes_no(c->lock_personality));
4902
4903         if (c->syscall_filter) {
4904 #if HAVE_SECCOMP
4905                 Iterator j;
4906                 void *id, *val;
4907                 bool first = true;
4908 #endif
4909
4910                 fprintf(f,
4911                         "%sSystemCallFilter: ",
4912                         prefix);
4913
4914                 if (!c->syscall_whitelist)
4915                         fputc('~', f);
4916
4917 #if HAVE_SECCOMP
4918                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4919                         _cleanup_free_ char *name = NULL;
4920                         const char *errno_name = NULL;
4921                         int num = PTR_TO_INT(val);
4922
4923                         if (first)
4924                                 first = false;
4925                         else
4926                                 fputc(' ', f);
4927
4928                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4929                         fputs(strna(name), f);
4930
4931                         if (num >= 0) {
4932                                 errno_name = errno_to_name(num);
4933                                 if (errno_name)
4934                                         fprintf(f, ":%s", errno_name);
4935                                 else
4936                                         fprintf(f, ":%d", num);
4937                         }
4938                 }
4939 #endif
4940
4941                 fputc('\n', f);
4942         }
4943
4944         if (c->syscall_archs) {
4945 #if HAVE_SECCOMP
4946                 Iterator j;
4947                 void *id;
4948 #endif
4949
4950                 fprintf(f,
4951                         "%sSystemCallArchitectures:",
4952                         prefix);
4953
4954 #if HAVE_SECCOMP
4955                 SET_FOREACH(id, c->syscall_archs, j)
4956                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4957 #endif
4958                 fputc('\n', f);
4959         }
4960
4961         if (exec_context_restrict_namespaces_set(c)) {
4962                 _cleanup_free_ char *s = NULL;
4963
4964                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4965                 if (r >= 0)
4966                         fprintf(f, "%sRestrictNamespaces: %s\n",
4967                                 prefix, strna(s));
4968         }
4969
4970         if (c->network_namespace_path)
4971                 fprintf(f,
4972                         "%sNetworkNamespacePath: %s\n",
4973                         prefix, c->network_namespace_path);
4974
4975         if (c->syscall_errno > 0) {
4976                 const char *errno_name;
4977
4978                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4979
4980                 errno_name = errno_to_name(c->syscall_errno);
4981                 if (errno_name)
4982                         fprintf(f, "%s\n", errno_name);
4983                 else
4984                         fprintf(f, "%d\n", c->syscall_errno);
4985         }
4986 }
4987
4988 bool exec_context_maintains_privileges(const ExecContext *c) {
4989         assert(c);
4990
4991         /* Returns true if the process forked off would run under
4992          * an unchanged UID or as root. */
4993
4994         if (!c->user)
4995                 return true;
4996
4997         if (streq(c->user, "root") || streq(c->user, "0"))
4998                 return true;
4999
5000         return false;
5001 }
5002
5003 int exec_context_get_effective_ioprio(const ExecContext *c) {
5004         int p;
5005
5006         assert(c);
5007
5008         if (c->ioprio_set)
5009                 return c->ioprio;
5010
5011         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5012         if (p < 0)
5013                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5014
5015         return p;
5016 }
5017
5018 void exec_context_free_log_extra_fields(ExecContext *c) {
5019         size_t l;
5020
5021         assert(c);
5022
5023         for (l = 0; l < c->n_log_extra_fields; l++)
5024                 free(c->log_extra_fields[l].iov_base);
5025         c->log_extra_fields = mfree(c->log_extra_fields);
5026         c->n_log_extra_fields = 0;
5027 }
5028
5029 void exec_context_revert_tty(ExecContext *c) {
5030         int r;
5031
5032         assert(c);
5033
5034         /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5035         exec_context_tty_reset(c, NULL);
5036
5037         /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5038          * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5039          * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5040
5041         if (exec_context_may_touch_tty(c)) {
5042                 const char *path;
5043
5044                 path = exec_context_tty_path(c);
5045                 if (path) {
5046                         r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
5047                         if (r < 0 && r != -ENOENT)
5048                                 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5049                 }
5050         }
5051 }
5052
5053 int exec_context_get_clean_directories(
5054                 ExecContext *c,
5055                 char **prefix,
5056                 ExecCleanMask mask,
5057                 char ***ret) {
5058
5059         _cleanup_strv_free_ char **l = NULL;
5060         ExecDirectoryType t;
5061         int r;
5062
5063         assert(c);
5064         assert(prefix);
5065         assert(ret);
5066
5067         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5068                 char **i;
5069
5070                 if (!FLAGS_SET(mask, 1U << t))
5071                         continue;
5072
5073                 if (!prefix[t])
5074                         continue;
5075
5076                 STRV_FOREACH(i, c->directories[t].paths) {
5077                         char *j;
5078
5079                         j = path_join(prefix[t], *i);
5080                         if (!j)
5081                                 return -ENOMEM;
5082
5083                         r = strv_consume(&l, j);
5084                         if (r < 0)
5085                                 return r;
5086
5087                         /* Also remove private directories unconditionally. */
5088                         if (t != EXEC_DIRECTORY_CONFIGURATION) {
5089                                 j = path_join(prefix[t], "private", *i);
5090                                 if (!j)
5091                                         return -ENOMEM;
5092
5093                                 r = strv_consume(&l, j);
5094                                 if (r < 0)
5095                                         return r;
5096                         }
5097                 }
5098         }
5099
5100         *ret = TAKE_PTR(l);
5101         return 0;
5102 }
5103
5104 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5105         ExecCleanMask mask = 0;
5106
5107         assert(c);
5108         assert(ret);
5109
5110         for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5111                 if (!strv_isempty(c->directories[t].paths))
5112                         mask |= 1U << t;
5113
5114         *ret = mask;
5115         return 0;
5116 }
5117
5118 void exec_status_start(ExecStatus *s, pid_t pid) {
5119         assert(s);
5120
5121         *s = (ExecStatus) {
5122                 .pid = pid,
5123         };
5124
5125         dual_timestamp_get(&s->start_timestamp);
5126 }
5127
5128 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5129         assert(s);
5130
5131         if (s->pid != pid) {
5132                 *s = (ExecStatus) {
5133                         .pid = pid,
5134                 };
5135         }
5136
5137         dual_timestamp_get(&s->exit_timestamp);
5138
5139         s->code = code;
5140         s->status = status;
5141
5142         if (context && context->utmp_id)
5143                 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5144 }
5145
5146 void exec_status_reset(ExecStatus *s) {
5147         assert(s);
5148
5149         *s = (ExecStatus) {};
5150 }
5151
5152 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5153         char buf[FORMAT_TIMESTAMP_MAX];
5154
5155         assert(s);
5156         assert(f);
5157
5158         if (s->pid <= 0)
5159                 return;
5160
5161         prefix = strempty(prefix);
5162
5163         fprintf(f,
5164                 "%sPID: "PID_FMT"\n",
5165                 prefix, s->pid);
5166
5167         if (dual_timestamp_is_set(&s->start_timestamp))
5168                 fprintf(f,
5169                         "%sStart Timestamp: %s\n",
5170                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5171
5172         if (dual_timestamp_is_set(&s->exit_timestamp))
5173                 fprintf(f,
5174                         "%sExit Timestamp: %s\n"
5175                         "%sExit Code: %s\n"
5176                         "%sExit Status: %i\n",
5177                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5178                         prefix, sigchld_code_to_string(s->code),
5179                         prefix, s->status);
5180 }
5181
5182 static char *exec_command_line(char **argv) {
5183         size_t k;
5184         char *n, *p, **a;
5185         bool first = true;
5186
5187         assert(argv);
5188
5189         k = 1;
5190         STRV_FOREACH(a, argv)
5191                 k += strlen(*a)+3;
5192
5193         n = new(char, k);
5194         if (!n)
5195                 return NULL;
5196
5197         p = n;
5198         STRV_FOREACH(a, argv) {
5199
5200                 if (!first)
5201                         *(p++) = ' ';
5202                 else
5203                         first = false;
5204
5205                 if (strpbrk(*a, WHITESPACE)) {
5206                         *(p++) = '\'';
5207                         p = stpcpy(p, *a);
5208                         *(p++) = '\'';
5209                 } else
5210                         p = stpcpy(p, *a);
5211
5212         }
5213
5214         *p = 0;
5215
5216         /* FIXME: this doesn't really handle arguments that have
5217          * spaces and ticks in them */
5218
5219         return n;
5220 }
5221
5222 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5223         _cleanup_free_ char *cmd = NULL;
5224         const char *prefix2;
5225
5226         assert(c);
5227         assert(f);
5228
5229         prefix = strempty(prefix);
5230         prefix2 = strjoina(prefix, "\t");
5231
5232         cmd = exec_command_line(c->argv);
5233         fprintf(f,
5234                 "%sCommand Line: %s\n",
5235                 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5236
5237         exec_status_dump(&c->exec_status, f, prefix2);
5238 }
5239
5240 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
5241         assert(f);
5242
5243         prefix = strempty(prefix);
5244
5245         LIST_FOREACH(command, c, c)
5246                 exec_command_dump(c, f, prefix);
5247 }
5248
5249 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
5250         ExecCommand *end;
5251
5252         assert(l);
5253         assert(e);
5254
5255         if (*l) {
5256                 /* It's kind of important, that we keep the order here */
5257                 LIST_FIND_TAIL(command, *l, end);
5258                 LIST_INSERT_AFTER(command, *l, end, e);
5259         } else
5260               *l = e;
5261 }
5262
5263 int exec_command_set(ExecCommand *c, const char *path, ...) {
5264         va_list ap;
5265         char **l, *p;
5266
5267         assert(c);
5268         assert(path);
5269
5270         va_start(ap, path);
5271         l = strv_new_ap(path, ap);
5272         va_end(ap);
5273
5274         if (!l)
5275                 return -ENOMEM;
5276
5277         p = strdup(path);
5278         if (!p) {
5279                 strv_free(l);
5280                 return -ENOMEM;
5281         }
5282
5283         free_and_replace(c->path, p);
5284
5285         return strv_free_and_replace(c->argv, l);
5286 }
5287
5288 int exec_command_append(ExecCommand *c, const char *path, ...) {
5289         _cleanup_strv_free_ char **l = NULL;
5290         va_list ap;
5291         int r;
5292
5293         assert(c);
5294         assert(path);
5295
5296         va_start(ap, path);
5297         l = strv_new_ap(path, ap);
5298         va_end(ap);
5299
5300         if (!l)
5301                 return -ENOMEM;
5302
5303         r = strv_extend_strv(&c->argv, l, false);
5304         if (r < 0)
5305                 return r;
5306
5307         return 0;
5308 }
5309
5310 static void *remove_tmpdir_thread(void *p) {
5311         _cleanup_free_ char *path = p;
5312
5313         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5314         return NULL;
5315 }
5316
5317 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5318         int r;
5319
5320         if (!rt)
5321                 return NULL;
5322
5323         if (rt->manager)
5324                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5325
5326         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5327         if (destroy && rt->tmp_dir) {
5328                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5329
5330                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5331                 if (r < 0) {
5332                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5333                         free(rt->tmp_dir);
5334                 }
5335
5336                 rt->tmp_dir = NULL;
5337         }
5338
5339         if (destroy && rt->var_tmp_dir) {
5340                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5341
5342                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5343                 if (r < 0) {
5344                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5345                         free(rt->var_tmp_dir);
5346                 }
5347
5348                 rt->var_tmp_dir = NULL;
5349         }
5350
5351         rt->id = mfree(rt->id);
5352         rt->tmp_dir = mfree(rt->tmp_dir);
5353         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5354         safe_close_pair(rt->netns_storage_socket);
5355         return mfree(rt);
5356 }
5357
5358 static void exec_runtime_freep(ExecRuntime **rt) {
5359         (void) exec_runtime_free(*rt, false);
5360 }
5361
5362 static int exec_runtime_allocate(ExecRuntime **ret) {
5363         ExecRuntime *n;
5364
5365         assert(ret);
5366
5367         n = new(ExecRuntime, 1);
5368         if (!n)
5369                 return -ENOMEM;
5370
5371         *n = (ExecRuntime) {
5372                 .netns_storage_socket = { -1, -1 },
5373         };
5374
5375         *ret = n;
5376         return 0;
5377 }
5378
5379 static int exec_runtime_add(
5380                 Manager *m,
5381                 const char *id,
5382                 const char *tmp_dir,
5383                 const char *var_tmp_dir,
5384                 const int netns_storage_socket[2],
5385                 ExecRuntime **ret) {
5386
5387         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
5388         int r;
5389
5390         assert(m);
5391         assert(id);
5392
5393         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5394         if (r < 0)
5395                 return r;
5396
5397         r = exec_runtime_allocate(&rt);
5398         if (r < 0)
5399                 return r;
5400
5401         rt->id = strdup(id);
5402         if (!rt->id)
5403                 return -ENOMEM;
5404
5405         if (tmp_dir) {
5406                 rt->tmp_dir = strdup(tmp_dir);
5407                 if (!rt->tmp_dir)
5408                         return -ENOMEM;
5409
5410                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5411                 assert(var_tmp_dir);
5412                 rt->var_tmp_dir = strdup(var_tmp_dir);
5413                 if (!rt->var_tmp_dir)
5414                         return -ENOMEM;
5415         }
5416
5417         if (netns_storage_socket) {
5418                 rt->netns_storage_socket[0] = netns_storage_socket[0];
5419                 rt->netns_storage_socket[1] = netns_storage_socket[1];
5420         }
5421
5422         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5423         if (r < 0)
5424                 return r;
5425
5426         rt->manager = m;
5427
5428         if (ret)
5429                 *ret = rt;
5430
5431         /* do not remove created ExecRuntime object when the operation succeeds. */
5432         rt = NULL;
5433         return 0;
5434 }
5435
5436 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5437         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
5438         _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
5439         int r;
5440
5441         assert(m);
5442         assert(c);
5443         assert(id);
5444
5445         /* It is not necessary to create ExecRuntime object. */
5446         if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
5447                 return 0;
5448
5449         if (c->private_tmp &&
5450             !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
5451               (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
5452                prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
5453                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
5454                 if (r < 0)
5455                         return r;
5456         }
5457
5458         if (c->private_network || c->network_namespace_path) {
5459                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5460                         return -errno;
5461         }
5462
5463         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5464         if (r < 0)
5465                 return r;
5466
5467         /* Avoid cleanup */
5468         netns_storage_socket[0] = netns_storage_socket[1] = -1;
5469         return 1;
5470 }
5471
5472 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5473         ExecRuntime *rt;
5474         int r;
5475
5476         assert(m);
5477         assert(id);
5478         assert(ret);
5479
5480         rt = hashmap_get(m->exec_runtime_by_id, id);
5481         if (rt)
5482                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5483                 goto ref;
5484
5485         if (!create)
5486                 return 0;
5487
5488         /* If not found, then create a new object. */
5489         r = exec_runtime_make(m, c, id, &rt);
5490         if (r <= 0)
5491                 /* When r == 0, it is not necessary to create ExecRuntime object. */
5492                 return r;
5493
5494 ref:
5495         /* increment reference counter. */
5496         rt->n_ref++;
5497         *ret = rt;
5498         return 1;
5499 }
5500
5501 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5502         if (!rt)
5503                 return NULL;
5504
5505         assert(rt->n_ref > 0);
5506
5507         rt->n_ref--;
5508         if (rt->n_ref > 0)
5509                 return NULL;
5510
5511         return exec_runtime_free(rt, destroy);
5512 }
5513
5514 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5515         ExecRuntime *rt;
5516         Iterator i;
5517
5518         assert(m);
5519         assert(f);
5520         assert(fds);
5521
5522         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5523                 fprintf(f, "exec-runtime=%s", rt->id);
5524
5525                 if (rt->tmp_dir)
5526                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
5527
5528                 if (rt->var_tmp_dir)
5529                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
5530
5531                 if (rt->netns_storage_socket[0] >= 0) {
5532                         int copy;
5533
5534                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5535                         if (copy < 0)
5536                                 return copy;
5537
5538                         fprintf(f, " netns-socket-0=%i", copy);
5539                 }
5540
5541                 if (rt->netns_storage_socket[1] >= 0) {
5542                         int copy;
5543
5544                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5545                         if (copy < 0)
5546                                 return copy;
5547
5548                         fprintf(f, " netns-socket-1=%i", copy);
5549                 }
5550
5551                 fputc('\n', f);
5552         }
5553
5554         return 0;
5555 }
5556
5557 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5558         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5559         ExecRuntime *rt;
5560         int r;
5561
5562         /* This is for the migration from old (v237 or earlier) deserialization text.
5563          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5564          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5565          * so or not from the serialized text, then we always creates a new object owned by this. */
5566
5567         assert(u);
5568         assert(key);
5569         assert(value);
5570
5571         /* Manager manages ExecRuntime objects by the unit id.
5572          * So, we omit the serialized text when the unit does not have id (yet?)... */
5573         if (isempty(u->id)) {
5574                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5575                 return 0;
5576         }
5577
5578         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5579         if (r < 0) {
5580                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5581                 return 0;
5582         }
5583
5584         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5585         if (!rt) {
5586                 r = exec_runtime_allocate(&rt_create);
5587                 if (r < 0)
5588                         return log_oom();
5589
5590                 rt_create->id = strdup(u->id);
5591                 if (!rt_create->id)
5592                         return log_oom();
5593
5594                 rt = rt_create;
5595         }
5596
5597         if (streq(key, "tmp-dir")) {
5598                 char *copy;
5599
5600                 copy = strdup(value);
5601                 if (!copy)
5602                         return log_oom();
5603
5604                 free_and_replace(rt->tmp_dir, copy);
5605
5606         } else if (streq(key, "var-tmp-dir")) {
5607                 char *copy;
5608
5609                 copy = strdup(value);
5610                 if (!copy)
5611                         return log_oom();
5612
5613                 free_and_replace(rt->var_tmp_dir, copy);
5614
5615         } else if (streq(key, "netns-socket-0")) {
5616                 int fd;
5617
5618                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5619                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5620                         return 0;
5621                 }
5622
5623                 safe_close(rt->netns_storage_socket[0]);
5624                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5625
5626         } else if (streq(key, "netns-socket-1")) {
5627                 int fd;
5628
5629                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
5630                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
5631                         return 0;
5632                 }
5633
5634                 safe_close(rt->netns_storage_socket[1]);
5635                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
5636         } else
5637                 return 0;
5638
5639         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5640         if (rt_create) {
5641                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5642                 if (r < 0) {
5643                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
5644                         return 0;
5645                 }
5646
5647                 rt_create->manager = u->manager;
5648
5649                 /* Avoid cleanup */
5650                 rt_create = NULL;
5651         }
5652
5653         return 1;
5654 }
5655
5656 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5657         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5658         int r, fd0 = -1, fd1 = -1;
5659         const char *p, *v = value;
5660         size_t n;
5661
5662         assert(m);
5663         assert(value);
5664         assert(fds);
5665
5666         n = strcspn(v, " ");
5667         id = strndupa(v, n);
5668         if (v[n] != ' ')
5669                 goto finalize;
5670         p = v + n + 1;
5671
5672         v = startswith(p, "tmp-dir=");
5673         if (v) {
5674                 n = strcspn(v, " ");
5675                 tmp_dir = strndupa(v, n);
5676                 if (v[n] != ' ')
5677                         goto finalize;
5678                 p = v + n + 1;
5679         }
5680
5681         v = startswith(p, "var-tmp-dir=");
5682         if (v) {
5683                 n = strcspn(v, " ");
5684                 var_tmp_dir = strndupa(v, n);
5685                 if (v[n] != ' ')
5686                         goto finalize;
5687                 p = v + n + 1;
5688         }
5689
5690         v = startswith(p, "netns-socket-0=");
5691         if (v) {
5692                 char *buf;
5693
5694                 n = strcspn(v, " ");
5695                 buf = strndupa(v, n);
5696                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5697                         log_debug("Unable to process exec-runtime netns fd specification.");
5698                         return;
5699                 }
5700                 fd0 = fdset_remove(fds, fd0);
5701                 if (v[n] != ' ')
5702                         goto finalize;
5703                 p = v + n + 1;
5704         }
5705
5706         v = startswith(p, "netns-socket-1=");
5707         if (v) {
5708                 char *buf;
5709
5710                 n = strcspn(v, " ");
5711                 buf = strndupa(v, n);
5712                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5713                         log_debug("Unable to process exec-runtime netns fd specification.");
5714                         return;
5715                 }
5716                 fd1 = fdset_remove(fds, fd1);
5717         }
5718
5719 finalize:
5720
5721         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5722         if (r < 0)
5723                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5724 }
5725
5726 void exec_runtime_vacuum(Manager *m) {
5727         ExecRuntime *rt;
5728         Iterator i;
5729
5730         assert(m);
5731
5732         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5733
5734         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5735                 if (rt->n_ref > 0)
5736                         continue;
5737
5738                 (void) exec_runtime_free(rt, false);
5739         }
5740 }
5741
5742 void exec_params_clear(ExecParameters *p) {
5743         if (!p)
5744                 return;
5745
5746         strv_free(p->environment);
5747 }
5748
/* Maps each ExecInput value to its string name. The lookup helpers for this table are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below (presumably exec_input_to_string() and
 * exec_input_from_string(); the macro is defined outside of this file section). */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5761
/* Maps each ExecOutput value to its string name. Note that the "+console" variants are spelled with a
 * literal plus sign. The lookup helpers are generated by the DEFINE_STRING_TABLE_LOOKUP() invocation
 * below. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
        [EXEC_OUTPUT_FILE_APPEND] = "append",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5779
/* Maps each ExecUtmpMode value to its string name. The lookup helpers are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5787
/* Maps each ExecPreserveMode value to its string name. Unlike the neighbouring tables the lookup helpers are
 * generated by the _WITH_BOOLEAN flavour of the macro, with EXEC_PRESERVE_YES as extra argument.
 * NOTE(review): presumably this makes the parser also accept generic boolean strings, mapping "true" to
 * EXEC_PRESERVE_YES and "false" to the first table entry; confirm against the macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5795
/* This table maps ExecDirectoryType to the name of the unit setting it is configured with. The lookup
 * helpers are generated by the DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5806
/* This table maps ExecDirectoryType too, but to a generic term identifying the type of resource. The terms
 * are supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. Note that the table is indexed by
 * (and sized for) ExecDirectoryType, even though the generated helpers carry the "exec_resource_type"
 * prefix. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "runtime",
        [EXEC_DIRECTORY_STATE] = "state",
        [EXEC_DIRECTORY_CACHE] = "cache",
        [EXEC_DIRECTORY_LOGS] = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5819
/* This table maps ExecDirectoryType to the name of the environment variable in which the selected directory
 * is passed to the service payload. Only a file-private value-to-string helper is generated for it (see the
 * _PRIVATE_..._TO_STRING macro flavour below); there is no string-to-value direction. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5831
/* Maps each ExecKeyringMode value to its string name. The lookup helpers are generated by the
 * DEFINE_STRING_TABLE_LOOKUP() invocation below. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);