src/core/execute.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2010 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <glob.h>
  23 #include <grp.h>
  24 #include <poll.h>
  25 #include <signal.h>
  26 #include <string.h>
  27 #include <sys/capability.h>
  28 #include <sys/eventfd.h>
  29 #include <sys/mman.h>
  30 #include <sys/personality.h>
  31 #include <sys/prctl.h>
  32 #include <sys/shm.h>
  33 #include <sys/socket.h>
  34 #include <sys/stat.h>
  35 #include <sys/types.h>
  36 #include <sys/un.h>
  37 #include <unistd.h>
  38 #include <utmpx.h>
  39
  40 #if HAVE_PAM
  41 #include <security/pam_appl.h>
  42 #endif
  43
  44 #if HAVE_SELINUX
  45 #include <selinux/selinux.h>
  46 #endif
  47
  48 #if HAVE_SECCOMP
  49 #include <seccomp.h>
  50 #endif
  51
  52 #if HAVE_APPARMOR
  53 #include <sys/apparmor.h>
  54 #endif
  55
  56 #include "sd-messages.h"
  57
  58 #include "af-list.h"
  59 #include "alloc-util.h"
  60 #if HAVE_APPARMOR
  61 #include "apparmor-util.h"
  62 #endif
  63 #include "async.h"
  64 #include "barrier.h"
  65 #include "cap-list.h"
  66 #include "capability-util.h"
  67 #include "chown-recursive.h"
  68 #include "def.h"
  69 #include "env-util.h"
  70 #include "errno-list.h"
  71 #include "execute.h"
  72 #include "exit-status.h"
  73 #include "fd-util.h"
  74 #include "fileio.h"
  75 #include "format-util.h"
  76 #include "fs-util.h"
  77 #include "glob-util.h"
  78 #include "io-util.h"
  79 #include "ioprio.h"
  80 #include "label.h"
  81 #include "log.h"
  82 #include "macro.h"
  83 #include "missing.h"
  84 #include "mkdir.h"
  85 #include "namespace.h"
  86 #include "parse-util.h"
  87 #include "path-util.h"
  88 #include "process-util.h"
  89 #include "rlimit-util.h"
  90 #include "rm-rf.h"
  91 #if HAVE_SECCOMP
  92 #include "seccomp-util.h"
  93 #endif
  94 #include "securebits.h"
  95 #include "securebits-util.h"
  96 #include "selinux-util.h"
  97 #include "signal-util.h"
  98 #include "smack-util.h"
  99 #include "special.h"
 100 #include "string-table.h"
 101 #include "string-util.h"
 102 #include "strv.h"
 103 #include "syslog-util.h"
 104 #include "terminal-util.h"
 105 #include "unit.h"
 106 #include "user-util.h"
 107 #include "util.h"
 108 #include "utmp-wtmp.h"
 109
/* Two timeouts expressed in microseconds: five seconds and one second.
 * NOTE(review): their users are not visible in this part of the file; presumably they bound the
 * wait for the system to become idle before executing — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() tries to apply to a TTY: read/write for the owner, write for the group. */
/* This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the logging stream socket. */
#define SNDBUF_SIZE (8*1024*1024)
 117
 118 static int shift_fds(int fds[], unsigned n_fds) {
 119         int start, restart_from;
 120
 121         if (n_fds <= 0)
 122                 return 0;
 123
 124         /* Modifies the fds array! (sorts it) */
 125
 126         assert(fds);
 127
 128         start = 0;
 129         for (;;) {
 130                 int i;
 131
 132                 restart_from = -1;
 133
 134                 for (i = start; i < (int) n_fds; i++) {
 135                         int nfd;
 136
 137                         /* Already at right index? */
 138                         if (fds[i] == i+3)
 139                                 continue;
 140
 141                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 142                         if (nfd < 0)
 143                                 return -errno;
 144
 145                         safe_close(fds[i]);
 146                         fds[i] = nfd;
 147
 148                         /* Hmm, the fd we wanted isn't free? Then
 149                          * let's remember that and try again from here */
 150                         if (nfd != i+3 && restart_from < 0)
 151                                 restart_from = i;
 152                 }
 153
 154                 if (restart_from < 0)
 155                         break;
 156
 157                 start = restart_from;
 158         }
 159
 160         return 0;
 161 }
 162
 163 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 164         unsigned i, n_fds;
 165         int r;
 166
 167         n_fds = n_storage_fds + n_socket_fds;
 168         if (n_fds <= 0)
 169                 return 0;
 170
 171         assert(fds);
 172
 173         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 174          * O_NONBLOCK only applies to socket activation though. */
 175
 176         for (i = 0; i < n_fds; i++) {
 177
 178                 if (i < n_socket_fds) {
 179                         r = fd_nonblock(fds[i], nonblock);
 180                         if (r < 0)
 181                                 return r;
 182                 }
 183
 184                 /* We unconditionally drop FD_CLOEXEC from the fds,
 185                  * since after all we want to pass these fds to our
 186                  * children */
 187
 188                 r = fd_cloexec(fds[i], false);
 189                 if (r < 0)
 190                         return r;
 191         }
 192
 193         return 0;
 194 }
 195
 196 static const char *exec_context_tty_path(const ExecContext *context) {
 197         assert(context);
 198
 199         if (context->stdio_as_fds)
 200                 return NULL;
 201
 202         if (context->tty_path)
 203                 return context->tty_path;
 204
 205         return "/dev/console";
 206 }
 207
 208 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 209         const char *path;
 210
 211         assert(context);
 212
 213         path = exec_context_tty_path(context);
 214
 215         if (context->tty_vhangup) {
 216                 if (p && p->stdin_fd >= 0)
 217                         (void) terminal_vhangup_fd(p->stdin_fd);
 218                 else if (path)
 219                         (void) terminal_vhangup(path);
 220         }
 221
 222         if (context->tty_reset) {
 223                 if (p && p->stdin_fd >= 0)
 224                         (void) reset_terminal_fd(p->stdin_fd, true);
 225                 else if (path)
 226                         (void) reset_terminal(path);
 227         }
 228
 229         if (context->tty_vt_disallocate && path)
 230                 (void) vt_disallocate(path);
 231 }
 232
 233 static bool is_terminal_input(ExecInput i) {
 234         return IN_SET(i,
 235                       EXEC_INPUT_TTY,
 236                       EXEC_INPUT_TTY_FORCE,
 237                       EXEC_INPUT_TTY_FAIL);
 238 }
 239
 240 static bool is_terminal_output(ExecOutput o) {
 241         return IN_SET(o,
 242                       EXEC_OUTPUT_TTY,
 243                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 244                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 245                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 246 }
 247
 248 static bool is_syslog_output(ExecOutput o) {
 249         return IN_SET(o,
 250                       EXEC_OUTPUT_SYSLOG,
 251                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 252 }
 253
 254 static bool is_kmsg_output(ExecOutput o) {
 255         return IN_SET(o,
 256                       EXEC_OUTPUT_KMSG,
 257                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 258 }
 259
 260 static bool exec_context_needs_term(const ExecContext *c) {
 261         assert(c);
 262
 263         /* Return true if the execution context suggests we should set $TERM to something useful. */
 264
 265         if (is_terminal_input(c->std_input))
 266                 return true;
 267
 268         if (is_terminal_output(c->std_output))
 269                 return true;
 270
 271         if (is_terminal_output(c->std_error))
 272                 return true;
 273
 274         return !!c->tty_path;
 275 }
 276
 277 static int open_null_as(int flags, int nfd) {
 278         int fd, r;
 279
 280         assert(nfd >= 0);
 281
 282         fd = open("/dev/null", flags|O_NOCTTY);
 283         if (fd < 0)
 284                 return -errno;
 285
 286         if (fd != nfd) {
 287                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 288                 safe_close(fd);
 289         } else
 290                 r = nfd;
 291
 292         return r;
 293 }
 294
 295 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 296         static const union sockaddr_union sa = {
 297                 .un.sun_family = AF_UNIX,
 298                 .un.sun_path = "/run/systemd/journal/stdout",
 299         };
 300         uid_t olduid = UID_INVALID;
 301         gid_t oldgid = GID_INVALID;
 302         int r;
 303
 304         if (gid_is_valid(gid)) {
 305                 oldgid = getgid();
 306
 307                 if (setegid(gid) < 0)
 308                         return -errno;
 309         }
 310
 311         if (uid_is_valid(uid)) {
 312                 olduid = getuid();
 313
 314                 if (seteuid(uid) < 0) {
 315                         r = -errno;
 316                         goto restore_gid;
 317                 }
 318         }
 319
 320         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 321
 322         /* If we fail to restore the uid or gid, things will likely
 323            fail later on. This should only happen if an LSM interferes. */
 324
 325         if (uid_is_valid(uid))
 326                 (void) seteuid(olduid);
 327
 328  restore_gid:
 329         if (gid_is_valid(gid))
 330                 (void) setegid(oldgid);
 331
 332         return r;
 333 }
 334
 335 static int connect_logger_as(
 336                 Unit *unit,
 337                 const ExecContext *context,
 338                 const ExecParameters *params,
 339                 ExecOutput output,
 340                 const char *ident,
 341                 int nfd,
 342                 uid_t uid,
 343                 gid_t gid) {
 344
 345         int fd, r;
 346
 347         assert(context);
 348         assert(params);
 349         assert(output < _EXEC_OUTPUT_MAX);
 350         assert(ident);
 351         assert(nfd >= 0);
 352
 353         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 354         if (fd < 0)
 355                 return -errno;
 356
 357         r = connect_journal_socket(fd, uid, gid);
 358         if (r < 0)
 359                 return r;
 360
 361         if (shutdown(fd, SHUT_RD) < 0) {
 362                 safe_close(fd);
 363                 return -errno;
 364         }
 365
 366         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 367
 368         dprintf(fd,
 369                 "%s\n"
 370                 "%s\n"
 371                 "%i\n"
 372                 "%i\n"
 373                 "%i\n"
 374                 "%i\n"
 375                 "%i\n",
 376                 context->syslog_identifier ?: ident,
 377                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 378                 context->syslog_priority,
 379                 !!context->syslog_level_prefix,
 380                 is_syslog_output(output),
 381                 is_kmsg_output(output),
 382                 is_terminal_output(output));
 383
 384         if (fd == nfd)
 385                 return nfd;
 386
 387         r = dup2(fd, nfd) < 0 ? -errno : nfd;
 388         safe_close(fd);
 389
 390         return r;
 391 }
 392 static int open_terminal_as(const char *path, mode_t mode, int nfd) {
 393         int fd, r;
 394
 395         assert(path);
 396         assert(nfd >= 0);
 397
 398         fd = open_terminal(path, mode | O_NOCTTY);
 399         if (fd < 0)
 400                 return fd;
 401
 402         if (fd != nfd) {
 403                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 404                 safe_close(fd);
 405         } else
 406                 r = nfd;
 407
 408         return r;
 409 }
 410
 411 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
 412
 413         if (is_terminal_input(std_input) && !apply_tty_stdin)
 414                 return EXEC_INPUT_NULL;
 415
 416         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 417                 return EXEC_INPUT_NULL;
 418
 419         return std_input;
 420 }
 421
 422 static int fixup_output(ExecOutput std_output, int socket_fd) {
 423
 424         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 425                 return EXEC_OUTPUT_INHERIT;
 426
 427         return std_output;
 428 }
 429
 430 static int setup_input(
 431                 const ExecContext *context,
 432                 const ExecParameters *params,
 433                 int socket_fd,
 434                 int named_iofds[3]) {
 435
 436         ExecInput i;
 437
 438         assert(context);
 439         assert(params);
 440
 441         if (params->stdin_fd >= 0) {
 442                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 443                         return -errno;
 444
 445                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 446                 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 447                 (void) reset_terminal_fd(STDIN_FILENO, true);
 448
 449                 return STDIN_FILENO;
 450         }
 451
 452         i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 453
 454         switch (i) {
 455
 456         case EXEC_INPUT_NULL:
 457                 return open_null_as(O_RDONLY, STDIN_FILENO);
 458
 459         case EXEC_INPUT_TTY:
 460         case EXEC_INPUT_TTY_FORCE:
 461         case EXEC_INPUT_TTY_FAIL: {
 462                 int fd, r;
 463
 464                 fd = acquire_terminal(exec_context_tty_path(context),
 465                                       i == EXEC_INPUT_TTY_FAIL,
 466                                       i == EXEC_INPUT_TTY_FORCE,
 467                                       false,
 468                                       USEC_INFINITY);
 469                 if (fd < 0)
 470                         return fd;
 471
 472                 if (fd != STDIN_FILENO) {
 473                         r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 474                         safe_close(fd);
 475                 } else
 476                         r = STDIN_FILENO;
 477
 478                 return r;
 479         }
 480
 481         case EXEC_INPUT_SOCKET:
 482                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 483
 484         case EXEC_INPUT_NAMED_FD:
 485                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 486                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 487
 488         default:
 489                 assert_not_reached("Unknown input type");
 490         }
 491 }
 492
/* Sets up one output file descriptor (fileno, compared below against STDOUT_FILENO and
 * STDERR_FILENO) according to the execution context and parameters.
 *
 * An explicitly passed params->stdout_fd / params->stderr_fd wins over the configured setting.
 * Otherwise the output is inherited, duplicated from stdin/stdout, or connected to /dev/null, a
 * terminal, the journal stream, the socket fd or the named fd stored at named_iofds[fileno].
 *
 * When the fd gets connected to the journal stream, *journal_stream_dev and *journal_stream_ino
 * are updated with the device and inode number of the stream (see the comment at the fstat()
 * call for the exact rule).
 *
 * Returns fileno on success or a negative errno-style error. */
static int setup_output(
                Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds take precedence over everything configured in the context. */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* Work on the settings that can actually be honoured, the same way setup_input() does. */
        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on the stderr setting is handled by the generic switch below. */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null, inherit that... */
                if (i != EXEC_INPUT_NULL)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, simply share it. */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null. */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);
                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                /* Switch the named fd to blocking mode (best-effort) before installing it. */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        default:
                assert_not_reached("Unknown error type");
        }
}
 625
 626 static int chown_terminal(int fd, uid_t uid) {
 627         struct stat st;
 628
 629         assert(fd >= 0);
 630
 631         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 632         if (isatty(fd) < 1)
 633                 return 0;
 634
 635         /* This might fail. What matters are the results. */
 636         (void) fchown(fd, uid, -1);
 637         (void) fchmod(fd, TTY_MODE);
 638
 639         if (fstat(fd, &st) < 0)
 640                 return -errno;
 641
 642         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 643                 return -EPERM;
 644
 645         return 0;
 646 }
 647
 648 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 649         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 650         int r;
 651
 652         assert(_saved_stdin);
 653         assert(_saved_stdout);
 654
 655         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 656         if (saved_stdin < 0)
 657                 return -errno;
 658
 659         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 660         if (saved_stdout < 0)
 661                 return -errno;
 662
 663         fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
 664         if (fd < 0)
 665                 return fd;
 666
 667         r = chown_terminal(fd, getuid());
 668         if (r < 0)
 669                 return r;
 670
 671         r = reset_terminal_fd(fd, true);
 672         if (r < 0)
 673                 return r;
 674
 675         if (dup2(fd, STDIN_FILENO) < 0)
 676                 return -errno;
 677
 678         if (dup2(fd, STDOUT_FILENO) < 0)
 679                 return -errno;
 680
 681         if (fd >= 2)
 682                 safe_close(fd);
 683         fd = -1;
 684
 685         *_saved_stdin = saved_stdin;
 686         *_saved_stdout = saved_stdout;
 687
 688         saved_stdin = saved_stdout = -1;
 689
 690         return 0;
 691 }
 692
 693 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 694         assert(err < 0);
 695
 696         if (err == -ETIMEDOUT)
 697                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 698         else {
 699                 errno = -err;
 700                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 701         }
 702 }
 703
 704 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 705         _cleanup_close_ int fd = -1;
 706
 707         assert(vc);
 708
 709         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 710         if (fd < 0)
 711                 return;
 712
 713         write_confirm_error_fd(err, fd, u);
 714 }
 715
 716 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 717         int r = 0;
 718
 719         assert(saved_stdin);
 720         assert(saved_stdout);
 721
 722         release_terminal();
 723
 724         if (*saved_stdin >= 0)
 725                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 726                         r = -errno;
 727
 728         if (*saved_stdout >= 0)
 729                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 730                         r = -errno;
 731
 732         *saved_stdin = safe_close(*saved_stdin);
 733         *saved_stdout = safe_close(*saved_stdout);
 734
 735         return r;
 736 }
 737
/* Possible results of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE = 1,            /* go ahead and execute the command */
};
 743
 744 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 745         int saved_stdout = -1, saved_stdin = -1, r;
 746         _cleanup_free_ char *e = NULL;
 747         char c;
 748
 749         /* For any internal errors, assume a positive response. */
 750         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 751         if (r < 0) {
 752                 write_confirm_error(r, vc, u);
 753                 return CONFIRM_EXECUTE;
 754         }
 755
 756         /* confirm_spawn might have been disabled while we were sleeping. */
 757         if (manager_is_confirm_spawn_disabled(u->manager)) {
 758                 r = 1;
 759                 goto restore_stdio;
 760         }
 761
 762         e = ellipsize(cmdline, 60, 100);
 763         if (!e) {
 764                 log_oom();
 765                 r = CONFIRM_EXECUTE;
 766                 goto restore_stdio;
 767         }
 768
 769         for (;;) {
 770                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 771                 if (r < 0) {
 772                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 773                         r = CONFIRM_EXECUTE;
 774                         goto restore_stdio;
 775                 }
 776
 777                 switch (c) {
 778                 case 'c':
 779                         printf("Resuming normal execution.\n");
 780                         manager_disable_confirm_spawn();
 781                         r = 1;
 782                         break;
 783                 case 'D':
 784                         unit_dump(u, stdout, "  ");
 785                         continue; /* ask again */
 786                 case 'f':
 787                         printf("Failing execution.\n");
 788                         r = CONFIRM_PRETEND_FAILURE;
 789                         break;
 790                 case 'h':
 791                         printf("  c - continue, proceed without asking anymore\n"
 792                                "  D - dump, show the state of the unit\n"
 793                                "  f - fail, don't execute the command and pretend it failed\n"
 794                                "  h - help\n"
 795                                "  i - info, show a short summary of the unit\n"
 796                                "  j - jobs, show jobs that are in progress\n"
 797                                "  s - skip, don't execute the command and pretend it succeeded\n"
 798                                "  y - yes, execute the command\n");
 799                         continue; /* ask again */
 800                 case 'i':
 801                         printf("  Description: %s\n"
 802                                "  Unit:        %s\n"
 803                                "  Command:     %s\n",
 804                                u->id, u->description, cmdline);
 805                         continue; /* ask again */
 806                 case 'j':
 807                         manager_dump_jobs(u->manager, stdout, "  ");
 808                         continue; /* ask again */
 809                 case 'n':
 810                         /* 'n' was removed in favor of 'f'. */
 811                         printf("Didn't understand 'n', did you mean 'f'?\n");
 812                         continue; /* ask again */
 813                 case 's':
 814                         printf("Skipping execution.\n");
 815                         r = CONFIRM_PRETEND_SUCCESS;
 816                         break;
 817                 case 'y':
 818                         r = CONFIRM_EXECUTE;
 819                         break;
 820                 default:
 821                         assert_not_reached("Unhandled choice");
 822                 }
 823                 break;
 824         }
 825
 826 restore_stdio:
 827         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 828         return r;
 829 }
 830
 831 static int get_fixed_user(const ExecContext *c, const char **user,
 832                           uid_t *uid, gid_t *gid,
 833                           const char **home, const char **shell) {
 834         int r;
 835         const char *name;
 836
 837         assert(c);
 838
 839         if (!c->user)
 840                 return 0;
 841
 842         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 843          * (i.e. are "/" or "/bin/nologin"). */
 844
 845         name = c->user;
 846         r = get_user_creds_clean(&name, uid, gid, home, shell);
 847         if (r < 0)
 848                 return r;
 849
 850         *user = name;
 851         return 0;
 852 }
 853
 854 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 855         int r;
 856         const char *name;
 857
 858         assert(c);
 859
 860         if (!c->group)
 861                 return 0;
 862
 863         name = c->group;
 864         r = get_group_creds(&name, gid);
 865         if (r < 0)
 866                 return r;
 867
 868         *group = name;
 869         return 0;
 870 }
 871
 872 static int get_supplementary_groups(const ExecContext *c, const char *user,
 873                                     const char *group, gid_t gid,
 874                                     gid_t **supplementary_gids, int *ngids) {
 875         char **i;
 876         int r, k = 0;
 877         int ngroups_max;
 878         bool keep_groups = false;
 879         gid_t *groups = NULL;
 880         _cleanup_free_ gid_t *l_gids = NULL;
 881
 882         assert(c);
 883
 884         /*
 885          * If user is given, then lookup GID and supplementary groups list.
 886          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 887          * here and as early as possible so we keep the list of supplementary
 888          * groups of the caller.
 889          */
 890         if (user && gid_is_valid(gid) && gid != 0) {
 891                 /* First step, initialize groups from /etc/groups */
 892                 if (initgroups(user, gid) < 0)
 893                         return -errno;
 894
 895                 keep_groups = true;
 896         }
 897
 898         if (strv_isempty(c->supplementary_groups))
 899                 return 0;
 900
 901         /*
 902          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 903          * be positive, otherwise fail.
 904          */
 905         errno = 0;
 906         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 907         if (ngroups_max <= 0) {
 908                 if (errno > 0)
 909                         return -errno;
 910                 else
 911                         return -EOPNOTSUPP; /* For all other values */
 912         }
 913
 914         l_gids = new(gid_t, ngroups_max);
 915         if (!l_gids)
 916                 return -ENOMEM;
 917
 918         if (keep_groups) {
 919                 /*
 920                  * Lookup the list of groups that the user belongs to, we
 921                  * avoid NSS lookups here too for gid=0.
 922                  */
 923                 k = ngroups_max;
 924                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 925                         return -EINVAL;
 926         } else
 927                 k = 0;
 928
 929         STRV_FOREACH(i, c->supplementary_groups) {
 930                 const char *g;
 931
 932                 if (k >= ngroups_max)
 933                         return -E2BIG;
 934
 935                 g = *i;
 936                 r = get_group_creds(&g, l_gids+k);
 937                 if (r < 0)
 938                         return r;
 939
 940                 k++;
 941         }
 942
 943         /*
 944          * Sets ngids to zero to drop all supplementary groups, happens
 945          * when we are under root and SupplementaryGroups= is empty.
 946          */
 947         if (k == 0) {
 948                 *ngids = 0;
 949                 return 0;
 950         }
 951
 952         /* Otherwise get the final list of supplementary groups */
 953         groups = memdup(l_gids, sizeof(gid_t) * k);
 954         if (!groups)
 955                 return -ENOMEM;
 956
 957         *supplementary_gids = groups;
 958         *ngids = k;
 959
 960         groups = NULL;
 961
 962         return 0;
 963 }
 964
 965 static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {
 966         int r;
 967
 968         /* Handle SupplementaryGroups= if it is not empty */
 969         if (ngids > 0) {
 970                 r = maybe_setgroups(ngids, supplementary_gids);
 971                 if (r < 0)
 972                         return r;
 973         }
 974
 975         if (gid_is_valid(gid)) {
 976                 /* Then set our gids */
 977                 if (setresgid(gid, gid, gid) < 0)
 978                         return -errno;
 979         }
 980
 981         return 0;
 982 }
 983
 984 static int enforce_user(const ExecContext *context, uid_t uid) {
 985         assert(context);
 986
 987         if (!uid_is_valid(uid))
 988                 return 0;
 989
 990         /* Sets (but doesn't look up) the uid and make sure we keep the
 991          * capabilities while doing so. */
 992
 993         if (context->capability_ambient_set != 0) {
 994
 995                 /* First step: If we need to keep capabilities but
 996                  * drop privileges we need to make sure we keep our
 997                  * caps, while we drop privileges. */
 998                 if (uid != 0) {
 999                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1000
1001                         if (prctl(PR_GET_SECUREBITS) != sb)
1002                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1003                                         return -errno;
1004                 }
1005         }
1006
1007         /* Second step: actually set the uids */
1008         if (setresuid(uid, uid, uid) < 0)
1009                 return -errno;
1010
1011         /* At this point we should have all necessary capabilities but
1012            are otherwise a normal user. However, the caps might got
1013            corrupted due to the setresuid() so we need clean them up
1014            later. This is done outside of this call. */
1015
1016         return 0;
1017 }
1018
1019 #if HAVE_PAM
1020
1021 static int null_conv(
1022                 int num_msg,
1023                 const struct pam_message **msg,
1024                 struct pam_response **resp,
1025                 void *appdata_ptr) {
1026
1027         /* We don't support conversations */
1028
1029         return PAM_CONV_ERR;
1030 }
1031
1032 #endif
1033
/* Opens a PAM session for the service and forks off a helper process that closes it again later.
 *
 *   name      PAM service name, passed to pam_start()
 *   user      user name, passed to pam_start()
 *   uid, gid  credentials the helper process drops to
 *   tty       if non-NULL, set as PAM_TTY
 *   env       in: environment exported to PAM; out (on success): replaced by the environment list
 *             returned by PAM, the previous list is freed
 *   fds       file descriptors that are closed in the helper process
 *   n_fds     number of entries in fds
 *
 * Returns 0 on success. On failure returns a negative errno-style value; PAM-level errors are reported as
 * -EPERM. When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Only let PAM modules be chatty if we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a positive error
                                 * number; it never returns a negative value, and "sig" is only valid
                                 * if it returned zero. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Replace the caller's environment by the one PAM handed us */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1250
/* Renames the calling process to the last path component of "path", wrapped in parentheses. If the
 * component is longer than eight characters only its last eight are used; if it is empty the name "(...)"
 * is used instead. */
static void rename_process_from_path(const char *path) {
        /* The resulting name must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: '(' + up to 8 chars + ')', plus the trailing NUL. */
        char buf[11];
        const char *name;
        size_t n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* Keep the tail: it is usually the more interesting part, since the beginning might just
                 * be "systemd-" */
                name += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1281
1282 static bool context_has_address_families(const ExecContext *c) {
1283         assert(c);
1284
1285         return c->address_families_whitelist ||
1286                 !set_isempty(c->address_families);
1287 }
1288
1289 static bool context_has_syscall_filters(const ExecContext *c) {
1290         assert(c);
1291
1292         return c->syscall_whitelist ||
1293                 !hashmap_isempty(c->syscall_filter);
1294 }
1295
1296 static bool context_has_no_new_privileges(const ExecContext *c) {
1297         assert(c);
1298
1299         if (c->no_new_privileges)
1300                 return true;
1301
1302         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1303                 return false;
1304
1305         /* We need NNP if we have any form of seccomp and are unprivileged */
1306         return context_has_address_families(c) ||
1307                 c->memory_deny_write_execute ||
1308                 c->restrict_realtime ||
1309                 exec_context_restrict_namespaces_set(c) ||
1310                 c->protect_kernel_tunables ||
1311                 c->protect_kernel_modules ||
1312                 c->private_devices ||
1313                 context_has_syscall_filters(c) ||
1314                 !set_isempty(c->syscall_archs) ||
1315                 c->lock_personality;
1316 }
1317
1318 #if HAVE_SECCOMP
1319
1320 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1321
1322         if (is_seccomp_available())
1323                 return false;
1324
1325         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1326         return true;
1327 }
1328
1329 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1330         uint32_t negative_action, default_action, action;
1331         int r;
1332
1333         assert(u);
1334         assert(c);
1335
1336         if (!context_has_syscall_filters(c))
1337                 return 0;
1338
1339         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1340                 return 0;
1341
1342         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1343
1344         if (c->syscall_whitelist) {
1345                 default_action = negative_action;
1346                 action = SCMP_ACT_ALLOW;
1347         } else {
1348                 default_action = SCMP_ACT_ALLOW;
1349                 action = negative_action;
1350         }
1351
1352         if (needs_ambient_hack) {
1353                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1354                 if (r < 0)
1355                         return r;
1356         }
1357
1358         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1359 }
1360
1361 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1362         assert(u);
1363         assert(c);
1364
1365         if (set_isempty(c->syscall_archs))
1366                 return 0;
1367
1368         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1369                 return 0;
1370
1371         return seccomp_restrict_archs(c->syscall_archs);
1372 }
1373
1374 static int apply_address_families(const Unit* u, const ExecContext *c) {
1375         assert(u);
1376         assert(c);
1377
1378         if (!context_has_address_families(c))
1379                 return 0;
1380
1381         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1382                 return 0;
1383
1384         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1385 }
1386
1387 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1388         assert(u);
1389         assert(c);
1390
1391         if (!c->memory_deny_write_execute)
1392                 return 0;
1393
1394         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1395                 return 0;
1396
1397         return seccomp_memory_deny_write_execute();
1398 }
1399
1400 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1401         assert(u);
1402         assert(c);
1403
1404         if (!c->restrict_realtime)
1405                 return 0;
1406
1407         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1408                 return 0;
1409
1410         return seccomp_restrict_realtime();
1411 }
1412
1413 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1414         assert(u);
1415         assert(c);
1416
1417         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1418          * let's protect even those systems where this is left on in the kernel. */
1419
1420         if (!c->protect_kernel_tunables)
1421                 return 0;
1422
1423         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1424                 return 0;
1425
1426         return seccomp_protect_sysctl();
1427 }
1428
1429 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1430         assert(u);
1431         assert(c);
1432
1433         /* Turn off module syscalls on ProtectKernelModules=yes */
1434
1435         if (!c->protect_kernel_modules)
1436                 return 0;
1437
1438         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1439                 return 0;
1440
1441         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1442 }
1443
1444 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1445         assert(u);
1446         assert(c);
1447
1448         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1449
1450         if (!c->private_devices)
1451                 return 0;
1452
1453         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1454                 return 0;
1455
1456         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1457 }
1458
1459 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1460         assert(u);
1461         assert(c);
1462
1463         if (!exec_context_restrict_namespaces_set(c))
1464                 return 0;
1465
1466         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1467                 return 0;
1468
1469         return seccomp_restrict_namespaces(c->restrict_namespaces);
1470 }
1471
1472 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1473         unsigned long personality;
1474         int r;
1475
1476         assert(u);
1477         assert(c);
1478
1479         if (!c->lock_personality)
1480                 return 0;
1481
1482         if (skip_seccomp_unavailable(u, "LockPersonality="))
1483                 return 0;
1484
1485         personality = c->personality;
1486
1487         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1488         if (personality == PERSONALITY_INVALID) {
1489
1490                 r = opinionated_personality(&personality);
1491                 if (r < 0)
1492                         return r;
1493         }
1494
1495         return seccomp_lock_personality(personality);
1496 }
1497
1498 #endif
1499
1500 static void do_idle_pipe_dance(int idle_pipe[4]) {
1501         assert(idle_pipe);
1502
1503         idle_pipe[1] = safe_close(idle_pipe[1]);
1504         idle_pipe[2] = safe_close(idle_pipe[2]);
1505
1506         if (idle_pipe[0] >= 0) {
1507                 int r;
1508
1509                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1510
1511                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1512                         ssize_t n;
1513
1514                         /* Signal systemd that we are bored and want to continue. */
1515                         n = write(idle_pipe[3], "x", 1);
1516                         if (n > 0)
1517                                 /* Wait for systemd to react to the signal above. */
1518                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1519                 }
1520
1521                 idle_pipe[0] = safe_close(idle_pipe[0]);
1522
1523         }
1524
1525         idle_pipe[3] = safe_close(idle_pipe[3]);
1526 }
1527
1528 static int build_environment(
1529                 Unit *u,
1530                 const ExecContext *c,
1531                 const ExecParameters *p,
1532                 unsigned n_fds,
1533                 const char *home,
1534                 const char *username,
1535                 const char *shell,
1536                 dev_t journal_stream_dev,
1537                 ino_t journal_stream_ino,
1538                 char ***ret) {
1539
1540         _cleanup_strv_free_ char **our_env = NULL;
1541         unsigned n_env = 0;
1542         char *x;
1543
1544         assert(u);
1545         assert(c);
1546         assert(ret);
1547
1548         our_env = new0(char*, 14);
1549         if (!our_env)
1550                 return -ENOMEM;
1551
1552         if (n_fds > 0) {
1553                 _cleanup_free_ char *joined = NULL;
1554
1555                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1556                         return -ENOMEM;
1557                 our_env[n_env++] = x;
1558
1559                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1560                         return -ENOMEM;
1561                 our_env[n_env++] = x;
1562
1563                 joined = strv_join(p->fd_names, ":");
1564                 if (!joined)
1565                         return -ENOMEM;
1566
1567                 x = strjoin("LISTEN_FDNAMES=", joined);
1568                 if (!x)
1569                         return -ENOMEM;
1570                 our_env[n_env++] = x;
1571         }
1572
1573         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1574                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1575                         return -ENOMEM;
1576                 our_env[n_env++] = x;
1577
1578                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1579                         return -ENOMEM;
1580                 our_env[n_env++] = x;
1581         }
1582
1583         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1584          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1585          * check the database directly. */
1586         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1587                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1588                 if (!x)
1589                         return -ENOMEM;
1590                 our_env[n_env++] = x;
1591         }
1592
1593         if (home) {
1594                 x = strappend("HOME=", home);
1595                 if (!x)
1596                         return -ENOMEM;
1597                 our_env[n_env++] = x;
1598         }
1599
1600         if (username) {
1601                 x = strappend("LOGNAME=", username);
1602                 if (!x)
1603                         return -ENOMEM;
1604                 our_env[n_env++] = x;
1605
1606                 x = strappend("USER=", username);
1607                 if (!x)
1608                         return -ENOMEM;
1609                 our_env[n_env++] = x;
1610         }
1611
1612         if (shell) {
1613                 x = strappend("SHELL=", shell);
1614                 if (!x)
1615                         return -ENOMEM;
1616                 our_env[n_env++] = x;
1617         }
1618
1619         if (!sd_id128_is_null(u->invocation_id)) {
1620                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1621                         return -ENOMEM;
1622
1623                 our_env[n_env++] = x;
1624         }
1625
1626         if (exec_context_needs_term(c)) {
1627                 const char *tty_path, *term = NULL;
1628
1629                 tty_path = exec_context_tty_path(c);
1630
1631                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1632                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1633                  * passes to PID 1 ends up all the way in the console login shown. */
1634
1635                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1636                         term = getenv("TERM");
1637                 if (!term)
1638                         term = default_term_for_tty(tty_path);
1639
1640                 x = strappend("TERM=", term);
1641                 if (!x)
1642                         return -ENOMEM;
1643                 our_env[n_env++] = x;
1644         }
1645
1646         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1647                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1648                         return -ENOMEM;
1649
1650                 our_env[n_env++] = x;
1651         }
1652
1653         our_env[n_env++] = NULL;
1654         assert(n_env <= 12);
1655
1656         *ret = our_env;
1657         our_env = NULL;
1658
1659         return 0;
1660 }
1661
1662 static int build_pass_environment(const ExecContext *c, char ***ret) {
1663         _cleanup_strv_free_ char **pass_env = NULL;
1664         size_t n_env = 0, n_bufsize = 0;
1665         char **i;
1666
1667         STRV_FOREACH(i, c->pass_environment) {
1668                 _cleanup_free_ char *x = NULL;
1669                 char *v;
1670
1671                 v = getenv(*i);
1672                 if (!v)
1673                         continue;
1674                 x = strjoin(*i, "=", v);
1675                 if (!x)
1676                         return -ENOMEM;
1677
1678                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1679                         return -ENOMEM;
1680
1681                 pass_env[n_env++] = x;
1682                 pass_env[n_env] = NULL;
1683                 x = NULL;
1684         }
1685
1686         *ret = pass_env;
1687         pass_env = NULL;
1688
1689         return 0;
1690 }
1691
1692 static bool exec_needs_mount_namespace(
1693                 const ExecContext *context,
1694                 const ExecParameters *params,
1695                 ExecRuntime *runtime) {
1696
1697         assert(context);
1698         assert(params);
1699
1700         if (context->root_image)
1701                 return true;
1702
1703         if (!strv_isempty(context->read_write_paths) ||
1704             !strv_isempty(context->read_only_paths) ||
1705             !strv_isempty(context->inaccessible_paths))
1706                 return true;
1707
1708         if (context->n_bind_mounts > 0 ||
1709             !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1710             !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1711             !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1712             !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1713             !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1714                 return true;
1715
1716         if (context->mount_flags != 0)
1717                 return true;
1718
1719         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1720                 return true;
1721
1722         if (context->private_devices ||
1723             context->protect_system != PROTECT_SYSTEM_NO ||
1724             context->protect_home != PROTECT_HOME_NO ||
1725             context->protect_kernel_tunables ||
1726             context->protect_kernel_modules ||
1727             context->protect_control_groups)
1728                 return true;
1729
1730         if (context->mount_apivfs && (context->root_image || context->root_directory))
1731                 return true;
1732
1733         return false;
1734 }
1735
1736 static int setup_private_users(uid_t uid, gid_t gid) {
1737         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1738         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1739         _cleanup_close_ int unshare_ready_fd = -1;
1740         _cleanup_(sigkill_waitp) pid_t pid = 0;
1741         uint64_t c = 1;
1742         siginfo_t si;
1743         ssize_t n;
1744         int r;
1745
1746         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1747          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1748          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1749          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1750          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1751          * continues execution normally. */
1752
1753         if (uid != 0 && uid_is_valid(uid)) {
1754                 r = asprintf(&uid_map,
1755                              "0 0 1\n"                      /* Map root → root */
1756                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1757                              uid, uid);
1758                 if (r < 0)
1759                         return -ENOMEM;
1760         } else {
1761                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1762                 if (!uid_map)
1763                         return -ENOMEM;
1764         }
1765
1766         if (gid != 0 && gid_is_valid(gid)) {
1767                 r = asprintf(&gid_map,
1768                              "0 0 1\n"                      /* Map root → root */
1769                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1770                              gid, gid);
1771                 if (r < 0)
1772                         return -ENOMEM;
1773         } else {
1774                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1775                 if (!gid_map)
1776                         return -ENOMEM;
1777         }
1778
1779         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1780          * namespace. */
1781         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1782         if (unshare_ready_fd < 0)
1783                 return -errno;
1784
1785         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1786          * failed. */
1787         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1788                 return -errno;
1789
1790         pid = fork();
1791         if (pid < 0)
1792                 return -errno;
1793
1794         if (pid == 0) {
1795                 _cleanup_close_ int fd = -1;
1796                 const char *a;
1797                 pid_t ppid;
1798
1799                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1800                  * here, after the parent opened its own user namespace. */
1801
1802                 ppid = getppid();
1803                 errno_pipe[0] = safe_close(errno_pipe[0]);
1804
1805                 /* Wait until the parent unshared the user namespace */
1806                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1807                         r = -errno;
1808                         goto child_fail;
1809                 }
1810
1811                 /* Disable the setgroups() system call in the child user namespace, for good. */
1812                 a = procfs_file_alloca(ppid, "setgroups");
1813                 fd = open(a, O_WRONLY|O_CLOEXEC);
1814                 if (fd < 0) {
1815                         if (errno != ENOENT) {
1816                                 r = -errno;
1817                                 goto child_fail;
1818                         }
1819
1820                         /* If the file is missing the kernel is too old, let's continue anyway. */
1821                 } else {
1822                         if (write(fd, "deny\n", 5) < 0) {
1823                                 r = -errno;
1824                                 goto child_fail;
1825                         }
1826
1827                         fd = safe_close(fd);
1828                 }
1829
1830                 /* First write the GID map */
1831                 a = procfs_file_alloca(ppid, "gid_map");
1832                 fd = open(a, O_WRONLY|O_CLOEXEC);
1833                 if (fd < 0) {
1834                         r = -errno;
1835                         goto child_fail;
1836                 }
1837                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1838                         r = -errno;
1839                         goto child_fail;
1840                 }
1841                 fd = safe_close(fd);
1842
1843                 /* The write the UID map */
1844                 a = procfs_file_alloca(ppid, "uid_map");
1845                 fd = open(a, O_WRONLY|O_CLOEXEC);
1846                 if (fd < 0) {
1847                         r = -errno;
1848                         goto child_fail;
1849                 }
1850                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1851                         r = -errno;
1852                         goto child_fail;
1853                 }
1854
1855                 _exit(EXIT_SUCCESS);
1856
1857         child_fail:
1858                 (void) write(errno_pipe[1], &r, sizeof(r));
1859                 _exit(EXIT_FAILURE);
1860         }
1861
1862         errno_pipe[1] = safe_close(errno_pipe[1]);
1863
1864         if (unshare(CLONE_NEWUSER) < 0)
1865                 return -errno;
1866
1867         /* Let the child know that the namespace is ready now */
1868         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1869                 return -errno;
1870
1871         /* Try to read an error code from the child */
1872         n = read(errno_pipe[0], &r, sizeof(r));
1873         if (n < 0)
1874                 return -errno;
1875         if (n == sizeof(r)) { /* an error code was sent to us */
1876                 if (r < 0)
1877                         return r;
1878                 return -EIO;
1879         }
1880         if (n != 0) /* on success we should have read 0 bytes */
1881                 return -EIO;
1882
1883         r = wait_for_terminate(pid, &si);
1884         if (r < 0)
1885                 return r;
1886         pid = 0;
1887
1888         /* If something strange happened with the child, let's consider this fatal, too */
1889         if (si.si_code != CLD_EXITED || si.si_status != 0)
1890                 return -EIO;
1891
1892         return 0;
1893 }
1894
1895 static int setup_exec_directory(
1896                 const ExecContext *context,
1897                 const ExecParameters *params,
1898                 uid_t uid,
1899                 gid_t gid,
1900                 ExecDirectoryType type,
1901                 int *exit_status) {
1902
1903         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1904                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1905                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1906                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1907                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1908                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1909         };
1910         char **rt;
1911         int r;
1912
1913         assert(context);
1914         assert(params);
1915         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1916         assert(exit_status);
1917
1918         if (!params->prefix[type])
1919                 return 0;
1920
1921         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1922                 if (!uid_is_valid(uid))
1923                         uid = 0;
1924                 if (!gid_is_valid(gid))
1925                         gid = 0;
1926         }
1927
1928         STRV_FOREACH(rt, context->directories[type].paths) {
1929                 _cleanup_free_ char *p = NULL, *pp = NULL;
1930                 const char *effective;
1931
1932                 p = strjoin(params->prefix[type], "/", *rt);
1933                 if (!p) {
1934                         r = -ENOMEM;
1935                         goto fail;
1936                 }
1937
1938                 r = mkdir_parents_label(p, 0755);
1939                 if (r < 0)
1940                         goto fail;
1941
1942                 if (context->dynamic_user &&
1943                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
1944                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
1945
1946                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
1947                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
1948                          * whose UID is later on reused. To lock this down we use the same trick used by container
1949                          * managers to prohibit host users to get access to files of the same UID in containers: we
1950                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
1951                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
1952                          * to make this directory permeable for the service itself.
1953                          *
1954                          * Specifically: for a service which wants a special directory "foo/" we first create a
1955                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
1956                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
1957                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
1958                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
1959                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
1960                          * disabling the access boundary for the service and making sure it only gets access to the
1961                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
1962                          *
1963                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
1964                          * owned by the service itself.
1965                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
1966                          * files or sockets with other services. */
1967
1968                         private_root = strjoin(params->prefix[type], "/private");
1969                         if (!private_root) {
1970                                 r = -ENOMEM;
1971                                 goto fail;
1972                         }
1973
1974                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
1975                         r = mkdir_safe_label(private_root, 0700, 0, 0, false);
1976                         if (r < 0)
1977                                 goto fail;
1978
1979                         pp = strjoin(private_root, "/", *rt);
1980                         if (!pp) {
1981                                 r = -ENOMEM;
1982                                 goto fail;
1983                         }
1984
1985                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
1986                         r = mkdir_parents_label(pp, 0755);
1987                         if (r < 0)
1988                                 goto fail;
1989
1990                         /* Finally, create the actual directory for the service */
1991                         r = mkdir_label(pp, context->directories[type].mode);
1992                         if (r < 0 && r != -EEXIST)
1993                                 goto fail;
1994
1995                         parent = dirname_malloc(p);
1996                         if (!parent) {
1997                                 r = -ENOMEM;
1998                                 goto fail;
1999                         }
2000
2001                         r = path_make_relative(parent, pp, &relative);
2002                         if (r < 0)
2003                                 goto fail;
2004
2005                         /* And link it up from the original place */
2006                         r = symlink_idempotent(relative, p);
2007                         if (r < 0)
2008                                 goto fail;
2009
2010                         effective = pp;
2011
2012                 } else {
2013                         r = mkdir_label(p, context->directories[type].mode);
2014                         if (r < 0 && r != -EEXIST)
2015                                 goto fail;
2016
2017                         effective = p;
2018                 }
2019
2020                 /* First lock down the access mode */
2021                 if (chmod(effective, context->directories[type].mode) < 0) {
2022                         r = -errno;
2023                         goto fail;
2024                 }
2025
2026                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2027                  * a service, and shall not be writable. */
2028                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2029                         continue;
2030
2031                 /* Then, change the ownership of the whole tree, if necessary */
2032                 r = path_chown_recursive(effective, uid, gid);
2033                 if (r < 0)
2034                         goto fail;
2035         }
2036
2037         return 0;
2038
2039 fail:
2040         *exit_status = exit_status_table[type];
2041         return r;
2042 }
2043
2044 static int setup_smack(
2045                 const ExecContext *context,
2046                 const ExecCommand *command) {
2047
2048         int r;
2049
2050         assert(context);
2051         assert(command);
2052
2053         if (context->smack_process_label) {
2054                 r = mac_smack_apply_pid(0, context->smack_process_label);
2055                 if (r < 0)
2056                         return r;
2057         }
2058 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2059         else {
2060                 _cleanup_free_ char *exec_label = NULL;
2061
2062                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2063                 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2064                         return r;
2065
2066                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2067                 if (r < 0)
2068                         return r;
2069         }
2070 #endif
2071
2072         return 0;
2073 }
2074
2075 static int compile_bind_mounts(
2076                 const ExecContext *context,
2077                 const ExecParameters *params,
2078                 BindMount **ret_bind_mounts,
2079                 unsigned *ret_n_bind_mounts,
2080                 char ***ret_empty_directories) {
2081
2082         _cleanup_strv_free_ char **empty_directories = NULL;
2083         BindMount *bind_mounts;
2084         unsigned n, h = 0, i;
2085         ExecDirectoryType t;
2086         int r;
2087
2088         assert(context);
2089         assert(params);
2090         assert(ret_bind_mounts);
2091         assert(ret_n_bind_mounts);
2092         assert(ret_empty_directories);
2093
2094         n = context->n_bind_mounts;
2095         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2096                 if (!params->prefix[t])
2097                         continue;
2098
2099                 n += strv_length(context->directories[t].paths);
2100         }
2101
2102         if (n <= 0) {
2103                 *ret_bind_mounts = NULL;
2104                 *ret_n_bind_mounts = 0;
2105                 *ret_empty_directories = NULL;
2106                 return 0;
2107         }
2108
2109         bind_mounts = new(BindMount, n);
2110         if (!bind_mounts)
2111                 return -ENOMEM;
2112
2113         for (i = 0; i < context->n_bind_mounts; i++) {
2114                 BindMount *item = context->bind_mounts + i;
2115                 char *s, *d;
2116
2117                 s = strdup(item->source);
2118                 if (!s) {
2119                         r = -ENOMEM;
2120                         goto finish;
2121                 }
2122
2123                 d = strdup(item->destination);
2124                 if (!d) {
2125                         free(s);
2126                         r = -ENOMEM;
2127                         goto finish;
2128                 }
2129
2130                 bind_mounts[h++] = (BindMount) {
2131                         .source = s,
2132                         .destination = d,
2133                         .read_only = item->read_only,
2134                         .recursive = item->recursive,
2135                         .ignore_enoent = item->ignore_enoent,
2136                 };
2137         }
2138
2139         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2140                 char **suffix;
2141
2142                 if (!params->prefix[t])
2143                         continue;
2144
2145                 if (strv_isempty(context->directories[t].paths))
2146                         continue;
2147
2148                 if (context->dynamic_user &&
2149                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2150                         char *private_root;
2151
2152                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2153                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2154                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2155
2156                         private_root = strjoin(params->prefix[t], "/private");
2157                         if (!private_root) {
2158                                 r = -ENOMEM;
2159                                 goto finish;
2160                         }
2161
2162                         r = strv_consume(&empty_directories, private_root);
2163                         if (r < 0) {
2164                                 r = -ENOMEM;
2165                                 goto finish;
2166                         }
2167                 }
2168
2169                 STRV_FOREACH(suffix, context->directories[t].paths) {
2170                         char *s, *d;
2171
2172                         if (context->dynamic_user &&
2173                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2174                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2175                         else
2176                                 s = strjoin(params->prefix[t], "/", *suffix);
2177                         if (!s) {
2178                                 r = -ENOMEM;
2179                                 goto finish;
2180                         }
2181
2182                         d = strdup(s);
2183                         if (!d) {
2184                                 free(s);
2185                                 r = -ENOMEM;
2186                                 goto finish;
2187                         }
2188
2189                         bind_mounts[h++] = (BindMount) {
2190                                 .source = s,
2191                                 .destination = d,
2192                                 .read_only = false,
2193                                 .recursive = true,
2194                                 .ignore_enoent = false,
2195                         };
2196                 }
2197         }
2198
2199         assert(h == n);
2200
2201         *ret_bind_mounts = bind_mounts;
2202         *ret_n_bind_mounts = n;
2203         *ret_empty_directories = empty_directories;
2204
2205         empty_directories = NULL;
2206
2207         return (int) n;
2208
2209 finish:
2210         bind_mount_free_many(bind_mounts, h);
2211         return r;
2212 }
2213
2214 static int apply_mount_namespace(
2215                 Unit *u,
2216                 ExecCommand *command,
2217                 const ExecContext *context,
2218                 const ExecParameters *params,
2219                 ExecRuntime *runtime) {
2220
2221         _cleanup_strv_free_ char **empty_directories = NULL;
2222         char *tmp = NULL, *var = NULL;
2223         const char *root_dir = NULL, *root_image = NULL;
2224         NamespaceInfo ns_info = {
2225                 .ignore_protect_paths = false,
2226                 .private_dev = context->private_devices,
2227                 .protect_control_groups = context->protect_control_groups,
2228                 .protect_kernel_tunables = context->protect_kernel_tunables,
2229                 .protect_kernel_modules = context->protect_kernel_modules,
2230                 .mount_apivfs = context->mount_apivfs,
2231         };
2232         bool needs_sandboxing;
2233         BindMount *bind_mounts = NULL;
2234         unsigned n_bind_mounts = 0;
2235         int r;
2236
2237         assert(context);
2238
2239         /* The runtime struct only contains the parent of the private /tmp,
2240          * which is non-accessible to world users. Inside of it there's a /tmp
2241          * that is sticky, and that's the one we want to use here. */
2242
2243         if (context->private_tmp && runtime) {
2244                 if (runtime->tmp_dir)
2245                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2246                 if (runtime->var_tmp_dir)
2247                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2248         }
2249
2250         if (params->flags & EXEC_APPLY_CHROOT) {
2251                 root_image = context->root_image;
2252
2253                 if (!root_image)
2254                         root_dir = context->root_directory;
2255         }
2256
2257         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2258         if (r < 0)
2259                 return r;
2260
2261         /*
2262          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2263          * sandbox info, otherwise enforce it, don't ignore protected paths and
2264          * fail if we are enable to apply the sandbox inside the mount namespace.
2265          */
2266         if (!context->dynamic_user && root_dir)
2267                 ns_info.ignore_protect_paths = true;
2268
2269         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2270
2271         r = setup_namespace(root_dir, root_image,
2272                             &ns_info, context->read_write_paths,
2273                             needs_sandboxing ? context->read_only_paths : NULL,
2274                             needs_sandboxing ? context->inaccessible_paths : NULL,
2275                             empty_directories,
2276                             bind_mounts,
2277                             n_bind_mounts,
2278                             tmp,
2279                             var,
2280                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2281                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2282                             context->mount_flags,
2283                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2284
2285         bind_mount_free_many(bind_mounts, n_bind_mounts);
2286
2287         /* If we couldn't set up the namespace this is probably due to a
2288          * missing capability. In this case, silently proceeed. */
2289         if (IN_SET(r, -EPERM, -EACCES)) {
2290                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2291                 return 0;
2292         }
2293
2294         return r;
2295 }
2296
2297 static int apply_working_directory(
2298                 const ExecContext *context,
2299                 const ExecParameters *params,
2300                 const char *home,
2301                 const bool needs_mount_ns,
2302                 int *exit_status) {
2303
2304         const char *d, *wd;
2305
2306         assert(context);
2307         assert(exit_status);
2308
2309         if (context->working_directory_home) {
2310
2311                 if (!home) {
2312                         *exit_status = EXIT_CHDIR;
2313                         return -ENXIO;
2314                 }
2315
2316                 wd = home;
2317
2318         } else if (context->working_directory)
2319                 wd = context->working_directory;
2320         else
2321                 wd = "/";
2322
2323         if (params->flags & EXEC_APPLY_CHROOT) {
2324                 if (!needs_mount_ns && context->root_directory)
2325                         if (chroot(context->root_directory) < 0) {
2326                                 *exit_status = EXIT_CHROOT;
2327                                 return -errno;
2328                         }
2329
2330                 d = wd;
2331         } else
2332                 d = prefix_roota(context->root_directory, wd);
2333
2334         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2335                 *exit_status = EXIT_CHDIR;
2336                 return -errno;
2337         }
2338
2339         return 0;
2340 }
2341
2342 static int setup_keyring(
2343                 Unit *u,
2344                 const ExecContext *context,
2345                 const ExecParameters *p,
2346                 uid_t uid, gid_t gid) {
2347
2348         key_serial_t keyring;
2349         int r;
2350
2351         assert(u);
2352         assert(context);
2353         assert(p);
2354
2355         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2356          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2357          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2358          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2359          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2360          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2361
2362         if (!(p->flags & EXEC_NEW_KEYRING))
2363                 return 0;
2364
2365         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2366                 return 0;
2367
2368         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2369         if (keyring == -1) {
2370                 if (errno == ENOSYS)
2371                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2372                 else if (IN_SET(errno, EACCES, EPERM))
2373                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2374                 else if (errno == EDQUOT)
2375                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2376                 else
2377                         return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2378
2379                 return 0;
2380         }
2381
2382         /* Populate they keyring with the invocation ID by default. */
2383         if (!sd_id128_is_null(u->invocation_id)) {
2384                 key_serial_t key;
2385
2386                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2387                 if (key == -1)
2388                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2389                 else {
2390                         if (keyctl(KEYCTL_SETPERM, key,
2391                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2392                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2393                                 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2394                 }
2395         }
2396
2397         /* And now, make the keyring owned by the service's user */
2398         if (uid_is_valid(uid) || gid_is_valid(gid))
2399                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2400                         return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2401
2402         /* When requested link the user keyring into the session keyring. */
2403         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2404                 uid_t saved_uid;
2405                 gid_t saved_gid;
2406
2407                 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2408                  * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2409                  * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2410
2411                 saved_uid = getuid();
2412                 saved_gid = getgid();
2413
2414                 if (gid_is_valid(gid) && gid != saved_gid) {
2415                         if (setregid(gid, -1) < 0)
2416                                 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2417                 }
2418
2419                 if (uid_is_valid(uid) && uid != saved_uid) {
2420                         if (setreuid(uid, -1) < 0) {
2421                                 (void) setregid(saved_gid, -1);
2422                                 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2423                         }
2424                 }
2425
2426                 if (keyctl(KEYCTL_LINK,
2427                            KEY_SPEC_USER_KEYRING,
2428                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2429
2430                         r = -errno;
2431
2432                         (void) setreuid(saved_uid, -1);
2433                         (void) setregid(saved_gid, -1);
2434
2435                         return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2436                 }
2437
2438                 if (uid_is_valid(uid) && uid != saved_uid) {
2439                         if (setreuid(saved_uid, -1) < 0) {
2440                                 (void) setregid(saved_gid, -1);
2441                                 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2442                         }
2443                 }
2444
2445                 if (gid_is_valid(gid) && gid != saved_gid) {
2446                         if (setregid(saved_gid, -1) < 0)
2447                                 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2448                 }
2449         }
2450
2451         return 0;
2452 }
2453
/* Appends the valid (non-negative) file descriptors of a socket pair to array, advancing *n by one for each entry
 * stored. Does nothing if pair is NULL. The caller has to make sure array has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned side;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
2466
2467 static int close_remaining_fds(
2468                 const ExecParameters *params,
2469                 ExecRuntime *runtime,
2470                 DynamicCreds *dcreds,
2471                 int user_lookup_fd,
2472                 int socket_fd,
2473                 int *fds, unsigned n_fds) {
2474
2475         unsigned n_dont_close = 0;
2476         int dont_close[n_fds + 12];
2477
2478         assert(params);
2479
2480         if (params->stdin_fd >= 0)
2481                 dont_close[n_dont_close++] = params->stdin_fd;
2482         if (params->stdout_fd >= 0)
2483                 dont_close[n_dont_close++] = params->stdout_fd;
2484         if (params->stderr_fd >= 0)
2485                 dont_close[n_dont_close++] = params->stderr_fd;
2486
2487         if (socket_fd >= 0)
2488                 dont_close[n_dont_close++] = socket_fd;
2489         if (n_fds > 0) {
2490                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2491                 n_dont_close += n_fds;
2492         }
2493
2494         if (runtime)
2495                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2496
2497         if (dcreds) {
2498                 if (dcreds->user)
2499                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2500                 if (dcreds->group)
2501                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2502         }
2503
2504         if (user_lookup_fd >= 0)
2505                 dont_close[n_dont_close++] = user_lookup_fd;
2506
2507         return close_all_fds(dont_close, n_dont_close);
2508 }
2509
2510 static int send_user_lookup(
2511                 Unit *unit,
2512                 int user_lookup_fd,
2513                 uid_t uid,
2514                 gid_t gid) {
2515
2516         assert(unit);
2517
2518         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2519          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2520          * specified. */
2521
2522         if (user_lookup_fd < 0)
2523                 return 0;
2524
2525         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2526                 return 0;
2527
2528         if (writev(user_lookup_fd,
2529                (struct iovec[]) {
2530                            IOVEC_INIT(&uid, sizeof(uid)),
2531                            IOVEC_INIT(&gid, sizeof(gid)),
2532                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2533                 return -errno;
2534
2535         return 0;
2536 }
2537
2538 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2539         int r;
2540
2541         assert(c);
2542         assert(home);
2543         assert(buf);
2544
2545         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2546
2547         if (*home)
2548                 return 0;
2549
2550         if (!c->working_directory_home)
2551                 return 0;
2552
2553         if (uid == 0) {
2554                 /* Hardcode /root as home directory for UID 0 */
2555                 *home = "/root";
2556                 return 1;
2557         }
2558
2559         r = get_home_dir(buf);
2560         if (r < 0)
2561                 return r;
2562
2563         *home = *buf;
2564         return 1;
2565 }
2566
2567 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2568         _cleanup_strv_free_ char ** list = NULL;
2569         ExecDirectoryType t;
2570         int r;
2571
2572         assert(c);
2573         assert(p);
2574         assert(ret);
2575
2576         assert(c->dynamic_user);
2577
2578         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2579          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2580          * directories. */
2581
2582         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2583                 char **i;
2584
2585                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2586                         continue;
2587
2588                 if (!p->prefix[t])
2589                         continue;
2590
2591                 STRV_FOREACH(i, c->directories[t].paths) {
2592                         char *e;
2593
2594                         if (t == EXEC_DIRECTORY_RUNTIME)
2595                                 e = strjoin(p->prefix[t], "/", *i);
2596                         else
2597                                 e = strjoin(p->prefix[t], "/private/", *i);
2598                         if (!e)
2599                                 return -ENOMEM;
2600
2601                         r = strv_consume(&list, e);
2602                         if (r < 0)
2603                                 return r;
2604                 }
2605         }
2606
2607         *ret = list;
2608         list = NULL;
2609
2610         return 0;
2611 }
2612
2613 static int exec_child(
2614                 Unit *unit,
2615                 ExecCommand *command,
2616                 const ExecContext *context,
2617                 const ExecParameters *params,
2618                 ExecRuntime *runtime,
2619                 DynamicCreds *dcreds,
2620                 char **argv,
2621                 int socket_fd,
2622                 int named_iofds[3],
2623                 int *fds,
2624                 unsigned n_storage_fds,
2625                 unsigned n_socket_fds,
2626                 char **files_env,
2627                 int user_lookup_fd,
2628                 int *exit_status) {
2629
2630         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2631         _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2632         _cleanup_free_ gid_t *supplementary_gids = NULL;
2633         const char *username = NULL, *groupname = NULL;
2634         const char *home = NULL, *shell = NULL;
2635         dev_t journal_stream_dev = 0;
2636         ino_t journal_stream_ino = 0;
2637         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2638                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2639                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2640                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2641 #if HAVE_SELINUX
2642         bool use_selinux = false;
2643 #endif
2644 #if ENABLE_SMACK
2645         bool use_smack = false;
2646 #endif
2647 #if HAVE_APPARMOR
2648         bool use_apparmor = false;
2649 #endif
2650         uid_t uid = UID_INVALID;
2651         gid_t gid = GID_INVALID;
2652         int i, r, ngids = 0;
2653         unsigned n_fds;
2654         ExecDirectoryType dt;
2655         int secure_bits;
2656
2657         assert(unit);
2658         assert(command);
2659         assert(context);
2660         assert(params);
2661         assert(exit_status);
2662
2663         rename_process_from_path(command->path);
2664
2665         /* We reset exactly these signals, since they are the
2666          * only ones we set to SIG_IGN in the main daemon. All
2667          * others we leave untouched because we set them to
2668          * SIG_DFL or a valid handler initially, both of which
2669          * will be demoted to SIG_DFL. */
2670         (void) default_signals(SIGNALS_CRASH_HANDLER,
2671                                SIGNALS_IGNORE, -1);
2672
2673         if (context->ignore_sigpipe)
2674                 (void) ignore_signals(SIGPIPE, -1);
2675
2676         r = reset_signal_mask();
2677         if (r < 0) {
2678                 *exit_status = EXIT_SIGNAL_MASK;
2679                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2680         }
2681
2682         if (params->idle_pipe)
2683                 do_idle_pipe_dance(params->idle_pipe);
2684
2685         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2686          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2687          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2688          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2689
2690         log_forget_fds();
2691         log_set_open_when_needed(true);
2692
2693         /* In case anything used libc syslog(), close this here, too */
2694         closelog();
2695
2696         n_fds = n_storage_fds + n_socket_fds;
2697         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2698         if (r < 0) {
2699                 *exit_status = EXIT_FDS;
2700                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2701         }
2702
2703         if (!context->same_pgrp)
2704                 if (setsid() < 0) {
2705                         *exit_status = EXIT_SETSID;
2706                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2707                 }
2708
2709         exec_context_tty_reset(context, params);
2710
2711         if (unit_shall_confirm_spawn(unit)) {
2712                 const char *vc = params->confirm_spawn;
2713                 _cleanup_free_ char *cmdline = NULL;
2714
2715                 cmdline = exec_command_line(argv);
2716                 if (!cmdline) {
2717                         *exit_status = EXIT_MEMORY;
2718                         return log_oom();
2719                 }
2720
2721                 r = ask_for_confirmation(vc, unit, cmdline);
2722                 if (r != CONFIRM_EXECUTE) {
2723                         if (r == CONFIRM_PRETEND_SUCCESS) {
2724                                 *exit_status = EXIT_SUCCESS;
2725                                 return 0;
2726                         }
2727                         *exit_status = EXIT_CONFIRM;
2728                         log_unit_error(unit, "Execution cancelled by the user");
2729                         return -ECANCELED;
2730                 }
2731         }
2732
2733         if (context->dynamic_user && dcreds) {
2734                 _cleanup_strv_free_ char **suggested_paths = NULL;
2735
2736                 /* Make sure we bypass our own NSS module for any NSS checks */
2737                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2738                         *exit_status = EXIT_USER;
2739                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2740                 }
2741
2742                 r = compile_suggested_paths(context, params, &suggested_paths);
2743                 if (r < 0) {
2744                         *exit_status = EXIT_MEMORY;
2745                         return log_oom();
2746                 }
2747
2748                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2749                 if (r < 0) {
2750                         *exit_status = EXIT_USER;
2751                         if (r == -EILSEQ) {
2752                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2753                                 return -EOPNOTSUPP;
2754                         }
2755                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2756                 }
2757
2758                 if (!uid_is_valid(uid)) {
2759                         *exit_status = EXIT_USER;
2760                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2761                         return -ESRCH;
2762                 }
2763
2764                 if (!gid_is_valid(gid)) {
2765                         *exit_status = EXIT_USER;
2766                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2767                         return -ESRCH;
2768                 }
2769
2770                 if (dcreds->user)
2771                         username = dcreds->user->name;
2772
2773         } else {
2774                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2775                 if (r < 0) {
2776                         *exit_status = EXIT_USER;
2777                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2778                 }
2779
2780                 r = get_fixed_group(context, &groupname, &gid);
2781                 if (r < 0) {
2782                         *exit_status = EXIT_GROUP;
2783                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2784                 }
2785         }
2786
2787         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2788         r = get_supplementary_groups(context, username, groupname, gid,
2789                                      &supplementary_gids, &ngids);
2790         if (r < 0) {
2791                 *exit_status = EXIT_GROUP;
2792                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2793         }
2794
2795         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2796         if (r < 0) {
2797                 *exit_status = EXIT_USER;
2798                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2799         }
2800
2801         user_lookup_fd = safe_close(user_lookup_fd);
2802
2803         r = acquire_home(context, uid, &home, &home_buffer);
2804         if (r < 0) {
2805                 *exit_status = EXIT_CHDIR;
2806                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2807         }
2808
2809         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2810          * must be sure to drop O_NONBLOCK */
2811         if (socket_fd >= 0)
2812                 (void) fd_nonblock(socket_fd, false);
2813
2814         r = setup_input(context, params, socket_fd, named_iofds);
2815         if (r < 0) {
2816                 *exit_status = EXIT_STDIN;
2817                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2818         }
2819
2820         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2821         if (r < 0) {
2822                 *exit_status = EXIT_STDOUT;
2823                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2824         }
2825
2826         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2827         if (r < 0) {
2828                 *exit_status = EXIT_STDERR;
2829                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2830         }
2831
2832         if (params->cgroup_path) {
2833                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2834                 if (r < 0) {
2835                         *exit_status = EXIT_CGROUP;
2836                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2837                 }
2838         }
2839
2840         if (context->oom_score_adjust_set) {
2841                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2842
2843                 /* When we can't make this change due to EPERM, then
2844                  * let's silently skip over it. User namespaces
2845                  * prohibit write access to this file, and we
2846                  * shouldn't trip up over that. */
2847
2848                 sprintf(t, "%i", context->oom_score_adjust);
2849                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2850                 if (IN_SET(r, -EPERM, -EACCES))
2851                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2852                 else if (r < 0) {
2853                         *exit_status = EXIT_OOM_ADJUST;
2854                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2855                 }
2856         }
2857
2858         if (context->nice_set)
2859                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2860                         *exit_status = EXIT_NICE;
2861                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2862                 }
2863
2864         if (context->cpu_sched_set) {
2865                 struct sched_param param = {
2866                         .sched_priority = context->cpu_sched_priority,
2867                 };
2868
2869                 r = sched_setscheduler(0,
2870                                        context->cpu_sched_policy |
2871                                        (context->cpu_sched_reset_on_fork ?
2872                                         SCHED_RESET_ON_FORK : 0),
2873                                        &param);
2874                 if (r < 0) {
2875                         *exit_status = EXIT_SETSCHEDULER;
2876                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2877                 }
2878         }
2879
2880         if (context->cpuset)
2881                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2882                         *exit_status = EXIT_CPUAFFINITY;
2883                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2884                 }
2885
2886         if (context->ioprio_set)
2887                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2888                         *exit_status = EXIT_IOPRIO;
2889                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2890                 }
2891
2892         if (context->timer_slack_nsec != NSEC_INFINITY)
2893                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2894                         *exit_status = EXIT_TIMERSLACK;
2895                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2896                 }
2897
2898         if (context->personality != PERSONALITY_INVALID) {
2899                 r = safe_personality(context->personality);
2900                 if (r < 0) {
2901                         *exit_status = EXIT_PERSONALITY;
2902                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2903                 }
2904         }
2905
2906         if (context->utmp_id)
2907                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2908                                       context->tty_path,
2909                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
2910                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2911                                       USER_PROCESS,
2912                                       username);
2913
2914         if (context->user) {
2915                 r = chown_terminal(STDIN_FILENO, uid);
2916                 if (r < 0) {
2917                         *exit_status = EXIT_STDIN;
2918                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2919                 }
2920         }
2921
2922         /* If delegation is enabled we'll pass ownership of the cgroup
2923          * (but only in systemd's own controller hierarchy!) to the
2924          * user of the new process. */
2925         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2926                 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2927                 if (r < 0) {
2928                         *exit_status = EXIT_CGROUP;
2929                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2930                 }
2931
2932                 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2933                 if (r < 0) {
2934                         *exit_status = EXIT_CGROUP;
2935                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
2936                 }
2937         }
2938
2939         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2940                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2941                 if (r < 0)
2942                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
2943         }
2944
2945         r = build_environment(
2946                         unit,
2947                         context,
2948                         params,
2949                         n_fds,
2950                         home,
2951                         username,
2952                         shell,
2953                         journal_stream_dev,
2954                         journal_stream_ino,
2955                         &our_env);
2956         if (r < 0) {
2957                 *exit_status = EXIT_MEMORY;
2958                 return log_oom();
2959         }
2960
2961         r = build_pass_environment(context, &pass_env);
2962         if (r < 0) {
2963                 *exit_status = EXIT_MEMORY;
2964                 return log_oom();
2965         }
2966
2967         accum_env = strv_env_merge(5,
2968                                    params->environment,
2969                                    our_env,
2970                                    pass_env,
2971                                    context->environment,
2972                                    files_env,
2973                                    NULL);
2974         if (!accum_env) {
2975                 *exit_status = EXIT_MEMORY;
2976                 return log_oom();
2977         }
2978         accum_env = strv_env_clean(accum_env);
2979
2980         (void) umask(context->umask);
2981
2982         r = setup_keyring(unit, context, params, uid, gid);
2983         if (r < 0) {
2984                 *exit_status = EXIT_KEYRING;
2985                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
2986         }
2987
2988         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2989         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2990
2991         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2992         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2993
2994         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2995         if (needs_ambient_hack)
2996                 needs_setuid = false;
2997         else
2998                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
2999
3000         if (needs_sandboxing) {
3001                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3002                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3003                  * impacting our own code paths. */
3004
3005 #if HAVE_SELINUX
3006                 use_selinux = mac_selinux_use();
3007 #endif
3008 #if ENABLE_SMACK
3009                 use_smack = mac_smack_use();
3010 #endif
3011 #if HAVE_APPARMOR
3012                 use_apparmor = mac_apparmor_use();
3013 #endif
3014         }
3015
3016         if (needs_setuid) {
3017                 if (context->pam_name && username) {
3018                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3019                         if (r < 0) {
3020                                 *exit_status = EXIT_PAM;
3021                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3022                         }
3023                 }
3024         }
3025
3026         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3027                 if (ns_type_supported(NAMESPACE_NET)) {
3028                         r = setup_netns(runtime->netns_storage_socket);
3029                         if (r < 0) {
3030                                 *exit_status = EXIT_NETWORK;
3031                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3032                         }
3033                 } else
3034                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3035         }
3036
3037         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3038         if (needs_mount_namespace) {
3039                 r = apply_mount_namespace(unit, command, context, params, runtime);
3040                 if (r < 0) {
3041                         *exit_status = EXIT_NAMESPACE;
3042                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3043                 }
3044         }
3045
3046         /* Apply just after mount namespace setup */
3047         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3048         if (r < 0)
3049                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3050
3051         /* Drop groups as early as possible */
3052         if (needs_setuid) {
3053                 r = enforce_groups(gid, supplementary_gids, ngids);
3054                 if (r < 0) {
3055                         *exit_status = EXIT_GROUP;
3056                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3057                 }
3058         }
3059
3060         if (needs_sandboxing) {
3061 #if HAVE_SELINUX
3062                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3063                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3064                         if (r < 0) {
3065                                 *exit_status = EXIT_SELINUX_CONTEXT;
3066                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3067                         }
3068                 }
3069 #endif
3070
3071                 if (context->private_users) {
3072                         r = setup_private_users(uid, gid);
3073                         if (r < 0) {
3074                                 *exit_status = EXIT_USER;
3075                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3076                         }
3077                 }
3078         }
3079
3080         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3081          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3082          * was needed to upload the policy and can now be closed as well. */
3083         r = close_all_fds(fds, n_fds);
3084         if (r >= 0)
3085                 r = shift_fds(fds, n_fds);
3086         if (r >= 0)
3087                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3088         if (r < 0) {
3089                 *exit_status = EXIT_FDS;
3090                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3091         }
3092
3093         secure_bits = context->secure_bits;
3094
3095         if (needs_sandboxing) {
3096                 uint64_t bset;
3097
3098                 for (i = 0; i < _RLIMIT_MAX; i++) {
3099
3100                         if (!context->rlimit[i])
3101                                 continue;
3102
3103                         r = setrlimit_closest(i, context->rlimit[i]);
3104                         if (r < 0) {
3105                                 *exit_status = EXIT_LIMITS;
3106                                 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3107                         }
3108                 }
3109
3110                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3111                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3112                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3113                                 *exit_status = EXIT_LIMITS;
3114                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3115                         }
3116                 }
3117
3118                 bset = context->capability_bounding_set;
3119                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3120                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3121                  * instead of us doing that */
3122                 if (needs_ambient_hack)
3123                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3124                                 (UINT64_C(1) << CAP_SETUID) |
3125                                 (UINT64_C(1) << CAP_SETGID);
3126
3127                 if (!cap_test_all(bset)) {
3128                         r = capability_bounding_set_drop(bset, false);
3129                         if (r < 0) {
3130                                 *exit_status = EXIT_CAPABILITIES;
3131                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3132                         }
3133                 }
3134
3135                 /* This is done before enforce_user, but ambient set
3136                  * does not survive over setresuid() if keep_caps is not set. */
3137                 if (!needs_ambient_hack &&
3138                     context->capability_ambient_set != 0) {
3139                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3140                         if (r < 0) {
3141                                 *exit_status = EXIT_CAPABILITIES;
3142                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3143                         }
3144                 }
3145         }
3146
3147         if (needs_setuid) {
3148                 if (context->user) {
3149                         r = enforce_user(context, uid);
3150                         if (r < 0) {
3151                                 *exit_status = EXIT_USER;
3152                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3153                         }
3154
3155                         if (!needs_ambient_hack &&
3156                             context->capability_ambient_set != 0) {
3157
3158                                 /* Fix the ambient capabilities after user change. */
3159                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3160                                 if (r < 0) {
3161                                         *exit_status = EXIT_CAPABILITIES;
3162                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3163                                 }
3164
3165                                 /* If we were asked to change user and ambient capabilities
3166                                  * were requested, we had to add keep-caps to the securebits
3167                                  * so that we would maintain the inherited capability set
3168                                  * through the setresuid(). Make sure that the bit is added
3169                                  * also to the context secure_bits so that we don't try to
3170                                  * drop the bit away next. */
3171
3172                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3173                         }
3174                 }
3175         }
3176
3177         if (needs_sandboxing) {
3178                 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3179                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3180                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3181                  * are restricted. */
3182
3183 #if HAVE_SELINUX
3184                 if (use_selinux) {
3185                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3186
3187                         if (exec_context) {
3188                                 r = setexeccon(exec_context);
3189                                 if (r < 0) {
3190                                         *exit_status = EXIT_SELINUX_CONTEXT;
3191                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3192                                 }
3193                         }
3194                 }
3195 #endif
3196
3197 #if ENABLE_SMACK
3198                 if (use_smack) {
3199                         r = setup_smack(context, command);
3200                         if (r < 0) {
3201                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3202                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3203                         }
3204                 }
3205 #endif
3206
3207 #if HAVE_APPARMOR
3208                 if (use_apparmor && context->apparmor_profile) {
3209                         r = aa_change_onexec(context->apparmor_profile);
3210                         if (r < 0 && !context->apparmor_profile_ignore) {
3211                                 *exit_status = EXIT_APPARMOR_PROFILE;
3212                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3213                         }
3214                 }
3215 #endif
3216
3217                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3218                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3219                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3220                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3221                                 *exit_status = EXIT_SECUREBITS;
3222                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3223                         }
3224
3225                 if (context_has_no_new_privileges(context))
3226                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3227                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3228                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3229                         }
3230
3231 #if HAVE_SECCOMP
3232                 r = apply_address_families(unit, context);
3233                 if (r < 0) {
3234                         *exit_status = EXIT_ADDRESS_FAMILIES;
3235                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3236                 }
3237
3238                 r = apply_memory_deny_write_execute(unit, context);
3239                 if (r < 0) {
3240                         *exit_status = EXIT_SECCOMP;
3241                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3242                 }
3243
3244                 r = apply_restrict_realtime(unit, context);
3245                 if (r < 0) {
3246                         *exit_status = EXIT_SECCOMP;
3247                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3248                 }
3249
3250                 r = apply_restrict_namespaces(unit, context);
3251                 if (r < 0) {
3252                         *exit_status = EXIT_SECCOMP;
3253                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3254                 }
3255
3256                 r = apply_protect_sysctl(unit, context);
3257                 if (r < 0) {
3258                         *exit_status = EXIT_SECCOMP;
3259                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3260                 }
3261
3262                 r = apply_protect_kernel_modules(unit, context);
3263                 if (r < 0) {
3264                         *exit_status = EXIT_SECCOMP;
3265                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3266                 }
3267
3268                 r = apply_private_devices(unit, context);
3269                 if (r < 0) {
3270                         *exit_status = EXIT_SECCOMP;
3271                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3272                 }
3273
3274                 r = apply_syscall_archs(unit, context);
3275                 if (r < 0) {
3276                         *exit_status = EXIT_SECCOMP;
3277                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3278                 }
3279
3280                 r = apply_lock_personality(unit, context);
3281                 if (r < 0) {
3282                         *exit_status = EXIT_SECCOMP;
3283                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3284                 }
3285
3286                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3287                  * by the filter as little as possible. */
3288                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3289                 if (r < 0) {
3290                         *exit_status = EXIT_SECCOMP;
3291                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3292                 }
3293 #endif
3294         }
3295
3296         if (!strv_isempty(context->unset_environment)) {
3297                 char **ee = NULL;
3298
3299                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3300                 if (!ee) {
3301                         *exit_status = EXIT_MEMORY;
3302                         return log_oom();
3303                 }
3304
3305                 strv_free(accum_env);
3306                 accum_env = ee;
3307         }
3308
3309         final_argv = replace_env_argv(argv, accum_env);
3310         if (!final_argv) {
3311                 *exit_status = EXIT_MEMORY;
3312                 return log_oom();
3313         }
3314
3315         if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3316                 _cleanup_free_ char *line;
3317
3318                 line = exec_command_line(final_argv);
3319                 if (line) {
3320                         log_struct(LOG_DEBUG,
3321                                    "EXECUTABLE=%s", command->path,
3322                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3323                                    LOG_UNIT_ID(unit),
3324                                    LOG_UNIT_INVOCATION_ID(unit),
3325                                    NULL);
3326                 }
3327         }
3328
3329         execve(command->path, final_argv, accum_env);
3330
3331         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3332
3333                 log_struct_errno(LOG_INFO, errno,
3334                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3335                                  LOG_UNIT_ID(unit),
3336                                  LOG_UNIT_INVOCATION_ID(unit),
3337                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3338                                                   command->path),
3339                                  "EXECUTABLE=%s", command->path,
3340                                  NULL);
3341
3342                 return 0;
3343         }
3344
3345         *exit_status = EXIT_EXEC;
3346         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3347 }
3348
3349 int exec_spawn(Unit *unit,
3350                ExecCommand *command,
3351                const ExecContext *context,
3352                const ExecParameters *params,
3353                ExecRuntime *runtime,
3354                DynamicCreds *dcreds,
3355                pid_t *ret) {
3356
3357         _cleanup_strv_free_ char **files_env = NULL;
3358         int *fds = NULL;
3359         unsigned n_storage_fds = 0, n_socket_fds = 0;
3360         _cleanup_free_ char *line = NULL;
3361         int socket_fd, r;
3362         int named_iofds[3] = { -1, -1, -1 };
3363         char **argv;
3364         pid_t pid;
3365
3366         assert(unit);
3367         assert(command);
3368         assert(context);
3369         assert(ret);
3370         assert(params);
3371         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3372
3373         if (context->std_input == EXEC_INPUT_SOCKET ||
3374             context->std_output == EXEC_OUTPUT_SOCKET ||
3375             context->std_error == EXEC_OUTPUT_SOCKET) {
3376
3377                 if (params->n_socket_fds > 1) {
3378                         log_unit_error(unit, "Got more than one socket.");
3379                         return -EINVAL;
3380                 }
3381
3382                 if (params->n_socket_fds == 0) {
3383                         log_unit_error(unit, "Got no socket.");
3384                         return -EINVAL;
3385                 }
3386
3387                 socket_fd = params->fds[0];
3388         } else {
3389                 socket_fd = -1;
3390                 fds = params->fds;
3391                 n_storage_fds = params->n_storage_fds;
3392                 n_socket_fds = params->n_socket_fds;
3393         }
3394
3395         r = exec_context_named_iofds(unit, context, params, named_iofds);
3396         if (r < 0)
3397                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3398
3399         r = exec_context_load_environment(unit, context, &files_env);
3400         if (r < 0)
3401                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3402
3403         argv = params->argv ?: command->argv;
3404         line = exec_command_line(argv);
3405         if (!line)
3406                 return log_oom();
3407
3408         log_struct(LOG_DEBUG,
3409                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3410                    "EXECUTABLE=%s", command->path,
3411                    LOG_UNIT_ID(unit),
3412                    LOG_UNIT_INVOCATION_ID(unit),
3413                    NULL);
3414
3415         pid = fork();
3416         if (pid < 0)
3417                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3418
3419         if (pid == 0) {
3420                 int exit_status = EXIT_SUCCESS;
3421
3422                 r = exec_child(unit,
3423                                command,
3424                                context,
3425                                params,
3426                                runtime,
3427                                dcreds,
3428                                argv,
3429                                socket_fd,
3430                                named_iofds,
3431                                fds,
3432                                n_storage_fds,
3433                                n_socket_fds,
3434                                files_env,
3435                                unit->manager->user_lookup_fds[1],
3436                                &exit_status);
3437
3438                 if (r < 0) {
3439                         log_struct_errno(LOG_ERR, r,
3440                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3441                                          LOG_UNIT_ID(unit),
3442                                          LOG_UNIT_INVOCATION_ID(unit),
3443                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3444                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3445                                                           command->path),
3446                                          "EXECUTABLE=%s", command->path,
3447                                          NULL);
3448                 }
3449
3450                 _exit(exit_status);
3451         }
3452
3453         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3454
3455         /* We add the new process to the cgroup both in the child (so
3456          * that we can be sure that no user code is ever executed
3457          * outside of the cgroup) and in the parent (so that we can be
3458          * sure that when we kill the cgroup the process will be
3459          * killed too). */
3460         if (params->cgroup_path)
3461                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3462
3463         exec_status_start(&command->exec_status, pid);
3464
3465         *ret = pid;
3466         return 0;
3467 }
3468
3469 void exec_context_init(ExecContext *c) {
3470         ExecDirectoryType i;
3471
3472         assert(c);
3473
3474         c->umask = 0022;
3475         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3476         c->cpu_sched_policy = SCHED_OTHER;
3477         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3478         c->syslog_level_prefix = true;
3479         c->ignore_sigpipe = true;
3480         c->timer_slack_nsec = NSEC_INFINITY;
3481         c->personality = PERSONALITY_INVALID;
3482         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3483                 c->directories[i].mode = 0755;
3484         c->capability_bounding_set = CAP_ALL;
3485         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3486         c->log_level_max = -1;
3487 }
3488
3489 void exec_context_done(ExecContext *c) {
3490         ExecDirectoryType i;
3491         size_t l;
3492
3493         assert(c);
3494
3495         c->environment = strv_free(c->environment);
3496         c->environment_files = strv_free(c->environment_files);
3497         c->pass_environment = strv_free(c->pass_environment);
3498         c->unset_environment = strv_free(c->unset_environment);
3499
3500         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3501                 c->rlimit[l] = mfree(c->rlimit[l]);
3502
3503         for (l = 0; l < 3; l++)
3504                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3505
3506         c->working_directory = mfree(c->working_directory);
3507         c->root_directory = mfree(c->root_directory);
3508         c->root_image = mfree(c->root_image);
3509         c->tty_path = mfree(c->tty_path);
3510         c->syslog_identifier = mfree(c->syslog_identifier);
3511         c->user = mfree(c->user);
3512         c->group = mfree(c->group);
3513
3514         c->supplementary_groups = strv_free(c->supplementary_groups);
3515
3516         c->pam_name = mfree(c->pam_name);
3517
3518         c->read_only_paths = strv_free(c->read_only_paths);
3519         c->read_write_paths = strv_free(c->read_write_paths);
3520         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3521
3522         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3523
3524         if (c->cpuset)
3525                 CPU_FREE(c->cpuset);
3526
3527         c->utmp_id = mfree(c->utmp_id);
3528         c->selinux_context = mfree(c->selinux_context);
3529         c->apparmor_profile = mfree(c->apparmor_profile);
3530         c->smack_process_label = mfree(c->smack_process_label);
3531
3532         c->syscall_filter = hashmap_free(c->syscall_filter);
3533         c->syscall_archs = set_free(c->syscall_archs);
3534         c->address_families = set_free(c->address_families);
3535
3536         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3537                 c->directories[i].paths = strv_free(c->directories[i].paths);
3538
3539         c->log_level_max = -1;
3540
3541         exec_context_free_log_extra_fields(c);
3542 }
3543
3544 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3545         char **i;
3546
3547         assert(c);
3548
3549         if (!runtime_prefix)
3550                 return 0;
3551
3552         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3553                 _cleanup_free_ char *p;
3554
3555                 p = strjoin(runtime_prefix, "/", *i);
3556                 if (!p)
3557                         return -ENOMEM;
3558
3559                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3560                  * next. */
3561                 (void) rm_rf(p, REMOVE_ROOT);
3562         }
3563
3564         return 0;
3565 }
3566
3567 void exec_command_done(ExecCommand *c) {
3568         assert(c);
3569
3570         c->path = mfree(c->path);
3571
3572         c->argv = strv_free(c->argv);
3573 }
3574
3575 void exec_command_done_array(ExecCommand *c, unsigned n) {
3576         unsigned i;
3577
3578         for (i = 0; i < n; i++)
3579                 exec_command_done(c+i);
3580 }
3581
3582 ExecCommand* exec_command_free_list(ExecCommand *c) {
3583         ExecCommand *i;
3584
3585         while ((i = c)) {
3586                 LIST_REMOVE(command, c, i);
3587                 exec_command_done(i);
3588                 free(i);
3589         }
3590
3591         return NULL;
3592 }
3593
3594 void exec_command_free_array(ExecCommand **c, unsigned n) {
3595         unsigned i;
3596
3597         for (i = 0; i < n; i++)
3598                 c[i] = exec_command_free_list(c[i]);
3599 }
3600
/* Context handed to invalid_env() through its userdata pointer. */
typedef struct InvalidEnvInfo {
        /* Unit that invalid_env() logs against. */
        Unit *unit;
        /* String appended to the log message after the offending assignment;
         * exec_context_load_environment() sets it to the path of the environment file being read. */
        const char *path;
} InvalidEnvInfo;
3605
3606 static void invalid_env(const char *p, void *userdata) {
3607         InvalidEnvInfo *info = userdata;
3608
3609         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3610 }
3611
3612 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3613         assert(c);
3614
3615         switch (fd_index) {
3616         case STDIN_FILENO:
3617                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3618                         return NULL;
3619                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3620         case STDOUT_FILENO:
3621                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3622                         return NULL;
3623                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3624         case STDERR_FILENO:
3625                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3626                         return NULL;
3627                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3628         default:
3629                 return NULL;
3630         }
3631 }
3632
3633 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3634         unsigned i, targets;
3635         const char* stdio_fdname[3];
3636         unsigned n_fds;
3637
3638         assert(c);
3639         assert(p);
3640
3641         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3642                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3643                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3644
3645         for (i = 0; i < 3; i++)
3646                 stdio_fdname[i] = exec_context_fdname(c, i);
3647
3648         n_fds = p->n_storage_fds + p->n_socket_fds;
3649
3650         for (i = 0; i < n_fds  && targets > 0; i++)
3651                 if (named_iofds[STDIN_FILENO] < 0 &&
3652                     c->std_input == EXEC_INPUT_NAMED_FD &&
3653                     stdio_fdname[STDIN_FILENO] &&
3654                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3655
3656                         named_iofds[STDIN_FILENO] = p->fds[i];
3657                         targets--;
3658
3659                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3660                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3661                            stdio_fdname[STDOUT_FILENO] &&
3662                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3663
3664                         named_iofds[STDOUT_FILENO] = p->fds[i];
3665                         targets--;
3666
3667                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3668                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3669                            stdio_fdname[STDERR_FILENO] &&
3670                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3671
3672                         named_iofds[STDERR_FILENO] = p->fds[i];
3673                         targets--;
3674                 }
3675
3676         return targets == 0 ? 0 : -ENOENT;
3677 }
3678
3679 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3680         char **i, **r = NULL;
3681
3682         assert(c);
3683         assert(l);
3684
3685         STRV_FOREACH(i, c->environment_files) {
3686                 char *fn;
3687                 int k;
3688                 unsigned n;
3689                 bool ignore = false;
3690                 char **p;
3691                 _cleanup_globfree_ glob_t pglob = {};
3692
3693                 fn = *i;
3694
3695                 if (fn[0] == '-') {
3696                         ignore = true;
3697                         fn++;
3698                 }
3699
3700                 if (!path_is_absolute(fn)) {
3701                         if (ignore)
3702                                 continue;
3703
3704                         strv_free(r);
3705                         return -EINVAL;
3706                 }
3707
3708                 /* Filename supports globbing, take all matching files */
3709                 k = safe_glob(fn, 0, &pglob);
3710                 if (k < 0) {
3711                         if (ignore)
3712                                 continue;
3713
3714                         strv_free(r);
3715                         return k;
3716                 }
3717
3718                 /* When we don't match anything, -ENOENT should be returned */
3719                 assert(pglob.gl_pathc > 0);
3720
3721                 for (n = 0; n < pglob.gl_pathc; n++) {
3722                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3723                         if (k < 0) {
3724                                 if (ignore)
3725                                         continue;
3726
3727                                 strv_free(r);
3728                                 return k;
3729                         }
3730                         /* Log invalid environment variables with filename */
3731                         if (p) {
3732                                 InvalidEnvInfo info = {
3733                                         .unit = unit,
3734                                         .path = pglob.gl_pathv[n]
3735                                 };
3736
3737                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3738                         }
3739
3740                         if (r == NULL)
3741                                 r = p;
3742                         else {
3743                                 char **m;
3744
3745                                 m = strv_env_merge(2, r, p);
3746                                 strv_free(r);
3747                                 strv_free(p);
3748                                 if (!m)
3749                                         return -ENOMEM;
3750
3751                                 r = m;
3752                         }
3753                 }
3754         }
3755
3756         *l = r;
3757
3758         return 0;
3759 }
3760
3761 static bool tty_may_match_dev_console(const char *tty) {
3762         _cleanup_free_ char *active = NULL;
3763         char *console;
3764
3765         if (!tty)
3766                 return true;
3767
3768         tty = skip_dev_prefix(tty);
3769
3770         /* trivial identity? */
3771         if (streq(tty, "console"))
3772                 return true;
3773
3774         console = resolve_dev_console(&active);
3775         /* if we could not resolve, assume it may */
3776         if (!console)
3777                 return true;
3778
3779         /* "tty0" means the active VC, so it may be the same sometimes */
3780         return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3781 }
3782
3783 bool exec_context_may_touch_console(ExecContext *ec) {
3784
3785         return (ec->tty_reset ||
3786                 ec->tty_vhangup ||
3787                 ec->tty_vt_disallocate ||
3788                 is_terminal_input(ec->std_input) ||
3789                 is_terminal_output(ec->std_output) ||
3790                 is_terminal_output(ec->std_error)) &&
3791                tty_may_match_dev_console(exec_context_tty_path(ec));
3792 }
3793
3794 static void strv_fprintf(FILE *f, char **l) {
3795         char **g;
3796
3797         assert(f);
3798
3799         STRV_FOREACH(g, l)
3800                 fprintf(f, " %s", *g);
3801 }
3802
3803 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3804         ExecDirectoryType dt;
3805         char **e, **d;
3806         unsigned i;
3807         int r;
3808
3809         assert(c);
3810         assert(f);
3811
3812         prefix = strempty(prefix);
3813
3814         fprintf(f,
3815                 "%sUMask: %04o\n"
3816                 "%sWorkingDirectory: %s\n"
3817                 "%sRootDirectory: %s\n"
3818                 "%sNonBlocking: %s\n"
3819                 "%sPrivateTmp: %s\n"
3820                 "%sPrivateDevices: %s\n"
3821                 "%sProtectKernelTunables: %s\n"
3822                 "%sProtectKernelModules: %s\n"
3823                 "%sProtectControlGroups: %s\n"
3824                 "%sPrivateNetwork: %s\n"
3825                 "%sPrivateUsers: %s\n"
3826                 "%sProtectHome: %s\n"
3827                 "%sProtectSystem: %s\n"
3828                 "%sMountAPIVFS: %s\n"
3829                 "%sIgnoreSIGPIPE: %s\n"
3830                 "%sMemoryDenyWriteExecute: %s\n"
3831                 "%sRestrictRealtime: %s\n"
3832                 "%sKeyringMode: %s\n",
3833                 prefix, c->umask,
3834                 prefix, c->working_directory ? c->working_directory : "/",
3835                 prefix, c->root_directory ? c->root_directory : "/",
3836                 prefix, yes_no(c->non_blocking),
3837                 prefix, yes_no(c->private_tmp),
3838                 prefix, yes_no(c->private_devices),
3839                 prefix, yes_no(c->protect_kernel_tunables),
3840                 prefix, yes_no(c->protect_kernel_modules),
3841                 prefix, yes_no(c->protect_control_groups),
3842                 prefix, yes_no(c->private_network),
3843                 prefix, yes_no(c->private_users),
3844                 prefix, protect_home_to_string(c->protect_home),
3845                 prefix, protect_system_to_string(c->protect_system),
3846                 prefix, yes_no(c->mount_apivfs),
3847                 prefix, yes_no(c->ignore_sigpipe),
3848                 prefix, yes_no(c->memory_deny_write_execute),
3849                 prefix, yes_no(c->restrict_realtime),
3850                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3851
3852         if (c->root_image)
3853                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3854
3855         STRV_FOREACH(e, c->environment)
3856                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3857
3858         STRV_FOREACH(e, c->environment_files)
3859                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3860
3861         STRV_FOREACH(e, c->pass_environment)
3862                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3863
3864         STRV_FOREACH(e, c->unset_environment)
3865                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3866
3867         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3868
3869         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3870                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3871
3872                 STRV_FOREACH(d, c->directories[dt].paths)
3873                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3874         }
3875
3876         if (c->nice_set)
3877                 fprintf(f,
3878                         "%sNice: %i\n",
3879                         prefix, c->nice);
3880
3881         if (c->oom_score_adjust_set)
3882                 fprintf(f,
3883                         "%sOOMScoreAdjust: %i\n",
3884                         prefix, c->oom_score_adjust);
3885
3886         for (i = 0; i < RLIM_NLIMITS; i++)
3887                 if (c->rlimit[i]) {
3888                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3889                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3890                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3891                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3892                 }
3893
3894         if (c->ioprio_set) {
3895                 _cleanup_free_ char *class_str = NULL;
3896
3897                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3898                 if (r >= 0)
3899                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3900
3901                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3902         }
3903
3904         if (c->cpu_sched_set) {
3905                 _cleanup_free_ char *policy_str = NULL;
3906
3907                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3908                 if (r >= 0)
3909                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3910
3911                 fprintf(f,
3912                         "%sCPUSchedulingPriority: %i\n"
3913                         "%sCPUSchedulingResetOnFork: %s\n",
3914                         prefix, c->cpu_sched_priority,
3915                         prefix, yes_no(c->cpu_sched_reset_on_fork));
3916         }
3917
3918         if (c->cpuset) {
3919                 fprintf(f, "%sCPUAffinity:", prefix);
3920                 for (i = 0; i < c->cpuset_ncpus; i++)
3921                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3922                                 fprintf(f, " %u", i);
3923                 fputs("\n", f);
3924         }
3925
3926         if (c->timer_slack_nsec != NSEC_INFINITY)
3927                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3928
3929         fprintf(f,
3930                 "%sStandardInput: %s\n"
3931                 "%sStandardOutput: %s\n"
3932                 "%sStandardError: %s\n",
3933                 prefix, exec_input_to_string(c->std_input),
3934                 prefix, exec_output_to_string(c->std_output),
3935                 prefix, exec_output_to_string(c->std_error));
3936
3937         if (c->tty_path)
3938                 fprintf(f,
3939                         "%sTTYPath: %s\n"
3940                         "%sTTYReset: %s\n"
3941                         "%sTTYVHangup: %s\n"
3942                         "%sTTYVTDisallocate: %s\n",
3943                         prefix, c->tty_path,
3944                         prefix, yes_no(c->tty_reset),
3945                         prefix, yes_no(c->tty_vhangup),
3946                         prefix, yes_no(c->tty_vt_disallocate));
3947
3948         if (IN_SET(c->std_output,
3949                    EXEC_OUTPUT_SYSLOG,
3950                    EXEC_OUTPUT_KMSG,
3951                    EXEC_OUTPUT_JOURNAL,
3952                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3953                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3954                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3955             IN_SET(c->std_error,
3956                    EXEC_OUTPUT_SYSLOG,
3957                    EXEC_OUTPUT_KMSG,
3958                    EXEC_OUTPUT_JOURNAL,
3959                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3960                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3961                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3962
3963                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3964
3965                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3966                 if (r >= 0)
3967                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3968
3969                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3970                 if (r >= 0)
3971                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3972         }
3973
3974         if (c->log_level_max >= 0) {
3975                 _cleanup_free_ char *t = NULL;
3976
3977                 (void) log_level_to_string_alloc(c->log_level_max, &t);
3978
3979                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
3980         }
3981
3982         if (c->n_log_extra_fields > 0) {
3983                 size_t j;
3984
3985                 for (j = 0; j < c->n_log_extra_fields; j++) {
3986                         fprintf(f, "%sLogExtraFields: ", prefix);
3987                         fwrite(c->log_extra_fields[j].iov_base,
3988                                1, c->log_extra_fields[j].iov_len,
3989                                f);
3990                         fputc('\n', f);
3991                 }
3992         }
3993
3994         if (c->secure_bits) {
3995                 _cleanup_free_ char *str = NULL;
3996
3997                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3998                 if (r >= 0)
3999                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4000         }
4001
4002         if (c->capability_bounding_set != CAP_ALL) {
4003                 _cleanup_free_ char *str = NULL;
4004
4005                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4006                 if (r >= 0)
4007                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4008         }
4009
4010         if (c->capability_ambient_set != 0) {
4011                 _cleanup_free_ char *str = NULL;
4012
4013                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4014                 if (r >= 0)
4015                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4016         }
4017
4018         if (c->user)
4019                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4020         if (c->group)
4021                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4022
4023         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4024
4025         if (!strv_isempty(c->supplementary_groups)) {
4026                 fprintf(f, "%sSupplementaryGroups:", prefix);
4027                 strv_fprintf(f, c->supplementary_groups);
4028                 fputs("\n", f);
4029         }
4030
4031         if (c->pam_name)
4032                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4033
4034         if (strv_length(c->read_write_paths) > 0) {
4035                 fprintf(f, "%sReadWritePaths:", prefix);
4036                 strv_fprintf(f, c->read_write_paths);
4037                 fputs("\n", f);
4038         }
4039
4040         if (strv_length(c->read_only_paths) > 0) {
4041                 fprintf(f, "%sReadOnlyPaths:", prefix);
4042                 strv_fprintf(f, c->read_only_paths);
4043                 fputs("\n", f);
4044         }
4045
4046         if (strv_length(c->inaccessible_paths) > 0) {
4047                 fprintf(f, "%sInaccessiblePaths:", prefix);
4048                 strv_fprintf(f, c->inaccessible_paths);
4049                 fputs("\n", f);
4050         }
4051
4052         if (c->n_bind_mounts > 0)
4053                 for (i = 0; i < c->n_bind_mounts; i++) {
4054                         fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4055                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4056                                 c->bind_mounts[i].source,
4057                                 c->bind_mounts[i].destination,
4058                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4059                 }
4060
4061         if (c->utmp_id)
4062                 fprintf(f,
4063                         "%sUtmpIdentifier: %s\n",
4064                         prefix, c->utmp_id);
4065
4066         if (c->selinux_context)
4067                 fprintf(f,
4068                         "%sSELinuxContext: %s%s\n",
4069                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4070
4071         if (c->apparmor_profile)
4072                 fprintf(f,
4073                         "%sAppArmorProfile: %s%s\n",
4074                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4075
4076         if (c->smack_process_label)
4077                 fprintf(f,
4078                         "%sSmackProcessLabel: %s%s\n",
4079                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4080
4081         if (c->personality != PERSONALITY_INVALID)
4082                 fprintf(f,
4083                         "%sPersonality: %s\n",
4084                         prefix, strna(personality_to_string(c->personality)));
4085
4086         fprintf(f,
4087                 "%sLockPersonality: %s\n",
4088                 prefix, yes_no(c->lock_personality));
4089
4090         if (c->syscall_filter) {
4091 #if HAVE_SECCOMP
4092                 Iterator j;
4093                 void *id, *val;
4094                 bool first = true;
4095 #endif
4096
4097                 fprintf(f,
4098                         "%sSystemCallFilter: ",
4099                         prefix);
4100
4101                 if (!c->syscall_whitelist)
4102                         fputc('~', f);
4103
4104 #if HAVE_SECCOMP
4105                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4106                         _cleanup_free_ char *name = NULL;
4107                         const char *errno_name = NULL;
4108                         int num = PTR_TO_INT(val);
4109
4110                         if (first)
4111                                 first = false;
4112                         else
4113                                 fputc(' ', f);
4114
4115                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4116                         fputs(strna(name), f);
4117
4118                         if (num >= 0) {
4119                                 errno_name = errno_to_name(num);
4120                                 if (errno_name)
4121                                         fprintf(f, ":%s", errno_name);
4122                                 else
4123                                         fprintf(f, ":%d", num);
4124                         }
4125                 }
4126 #endif
4127
4128                 fputc('\n', f);
4129         }
4130
4131         if (c->syscall_archs) {
4132 #if HAVE_SECCOMP
4133                 Iterator j;
4134                 void *id;
4135 #endif
4136
4137                 fprintf(f,
4138                         "%sSystemCallArchitectures:",
4139                         prefix);
4140
4141 #if HAVE_SECCOMP
4142                 SET_FOREACH(id, c->syscall_archs, j)
4143                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4144 #endif
4145                 fputc('\n', f);
4146         }
4147
4148         if (exec_context_restrict_namespaces_set(c)) {
4149                 _cleanup_free_ char *s = NULL;
4150
4151                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4152                 if (r >= 0)
4153                         fprintf(f, "%sRestrictNamespaces: %s\n",
4154                                 prefix, s);
4155         }
4156
4157         if (c->syscall_errno > 0) {
4158                 const char *errno_name;
4159
4160                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4161
4162                 errno_name = errno_to_name(c->syscall_errno);
4163                 if (errno_name)
4164                         fprintf(f, "%s\n", errno_name);
4165                 else
4166                         fprintf(f, "%d\n", c->syscall_errno);
4167         }
4168
4169         if (c->apparmor_profile)
4170                 fprintf(f,
4171                         "%sAppArmorProfile: %s%s\n",
4172                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4173 }
4174
4175 bool exec_context_maintains_privileges(ExecContext *c) {
4176         assert(c);
4177
4178         /* Returns true if the process forked off would run under
4179          * an unchanged UID or as root. */
4180
4181         if (!c->user)
4182                 return true;
4183
4184         if (streq(c->user, "root") || streq(c->user, "0"))
4185                 return true;
4186
4187         return false;
4188 }
4189
4190 int exec_context_get_effective_ioprio(ExecContext *c) {
4191         int p;
4192
4193         assert(c);
4194
4195         if (c->ioprio_set)
4196                 return c->ioprio;
4197
4198         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4199         if (p < 0)
4200                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4201
4202         return p;
4203 }
4204
4205 void exec_context_free_log_extra_fields(ExecContext *c) {
4206         size_t l;
4207
4208         assert(c);
4209
4210         for (l = 0; l < c->n_log_extra_fields; l++)
4211                 free(c->log_extra_fields[l].iov_base);
4212         c->log_extra_fields = mfree(c->log_extra_fields);
4213         c->n_log_extra_fields = 0;
4214 }
4215
4216 void exec_status_start(ExecStatus *s, pid_t pid) {
4217         assert(s);
4218
4219         zero(*s);
4220         s->pid = pid;
4221         dual_timestamp_get(&s->start_timestamp);
4222 }
4223
4224 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4225         assert(s);
4226
4227         if (s->pid && s->pid != pid)
4228                 zero(*s);
4229
4230         s->pid = pid;
4231         dual_timestamp_get(&s->exit_timestamp);
4232
4233         s->code = code;
4234         s->status = status;
4235
4236         if (context) {
4237                 if (context->utmp_id)
4238                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4239
4240                 exec_context_tty_reset(context, NULL);
4241         }
4242 }
4243
4244 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4245         char buf[FORMAT_TIMESTAMP_MAX];
4246
4247         assert(s);
4248         assert(f);
4249
4250         if (s->pid <= 0)
4251                 return;
4252
4253         prefix = strempty(prefix);
4254
4255         fprintf(f,
4256                 "%sPID: "PID_FMT"\n",
4257                 prefix, s->pid);
4258
4259         if (dual_timestamp_is_set(&s->start_timestamp))
4260                 fprintf(f,
4261                         "%sStart Timestamp: %s\n",
4262                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4263
4264         if (dual_timestamp_is_set(&s->exit_timestamp))
4265                 fprintf(f,
4266                         "%sExit Timestamp: %s\n"
4267                         "%sExit Code: %s\n"
4268                         "%sExit Status: %i\n",
4269                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4270                         prefix, sigchld_code_to_string(s->code),
4271                         prefix, s->status);
4272 }
4273
4274 char *exec_command_line(char **argv) {
4275         size_t k;
4276         char *n, *p, **a;
4277         bool first = true;
4278
4279         assert(argv);
4280
4281         k = 1;
4282         STRV_FOREACH(a, argv)
4283                 k += strlen(*a)+3;
4284
4285         n = new(char, k);
4286         if (!n)
4287                 return NULL;
4288
4289         p = n;
4290         STRV_FOREACH(a, argv) {
4291
4292                 if (!first)
4293                         *(p++) = ' ';
4294                 else
4295                         first = false;
4296
4297                 if (strpbrk(*a, WHITESPACE)) {
4298                         *(p++) = '\'';
4299                         p = stpcpy(p, *a);
4300                         *(p++) = '\'';
4301                 } else
4302                         p = stpcpy(p, *a);
4303
4304         }
4305
4306         *p = 0;
4307
4308         /* FIXME: this doesn't really handle arguments that have
4309          * spaces and ticks in them */
4310
4311         return n;
4312 }
4313
4314 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4315         _cleanup_free_ char *cmd = NULL;
4316         const char *prefix2;
4317
4318         assert(c);
4319         assert(f);
4320
4321         prefix = strempty(prefix);
4322         prefix2 = strjoina(prefix, "\t");
4323
4324         cmd = exec_command_line(c->argv);
4325         fprintf(f,
4326                 "%sCommand Line: %s\n",
4327                 prefix, cmd ? cmd : strerror(ENOMEM));
4328
4329         exec_status_dump(&c->exec_status, f, prefix2);
4330 }
4331
4332 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4333         assert(f);
4334
4335         prefix = strempty(prefix);
4336
4337         LIST_FOREACH(command, c, c)
4338                 exec_command_dump(c, f, prefix);
4339 }
4340
4341 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4342         ExecCommand *end;
4343
4344         assert(l);
4345         assert(e);
4346
4347         if (*l) {
4348                 /* It's kind of important, that we keep the order here */
4349                 LIST_FIND_TAIL(command, *l, end);
4350                 LIST_INSERT_AFTER(command, *l, end, e);
4351         } else
4352               *l = e;
4353 }
4354
4355 int exec_command_set(ExecCommand *c, const char *path, ...) {
4356         va_list ap;
4357         char **l, *p;
4358
4359         assert(c);
4360         assert(path);
4361
4362         va_start(ap, path);
4363         l = strv_new_ap(path, ap);
4364         va_end(ap);
4365
4366         if (!l)
4367                 return -ENOMEM;
4368
4369         p = strdup(path);
4370         if (!p) {
4371                 strv_free(l);
4372                 return -ENOMEM;
4373         }
4374
4375         free(c->path);
4376         c->path = p;
4377
4378         strv_free(c->argv);
4379         c->argv = l;
4380
4381         return 0;
4382 }
4383
4384 int exec_command_append(ExecCommand *c, const char *path, ...) {
4385         _cleanup_strv_free_ char **l = NULL;
4386         va_list ap;
4387         int r;
4388
4389         assert(c);
4390         assert(path);
4391
4392         va_start(ap, path);
4393         l = strv_new_ap(path, ap);
4394         va_end(ap);
4395
4396         if (!l)
4397                 return -ENOMEM;
4398
4399         r = strv_extend_strv(&c->argv, l, false);
4400         if (r < 0)
4401                 return r;
4402
4403         return 0;
4404 }
4405
4406
4407 static int exec_runtime_allocate(ExecRuntime **rt) {
4408
4409         if (*rt)
4410                 return 0;
4411
4412         *rt = new0(ExecRuntime, 1);
4413         if (!*rt)
4414                 return -ENOMEM;
4415
4416         (*rt)->n_ref = 1;
4417         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4418
4419         return 0;
4420 }
4421
4422 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4423         int r;
4424
4425         assert(rt);
4426         assert(c);
4427         assert(id);
4428
4429         if (*rt)
4430                 return 1;
4431
4432         if (!c->private_network && !c->private_tmp)
4433                 return 0;
4434
4435         r = exec_runtime_allocate(rt);
4436         if (r < 0)
4437                 return r;
4438
4439         if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4440                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4441                         return -errno;
4442         }
4443
4444         if (c->private_tmp && !(*rt)->tmp_dir) {
4445                 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4446                 if (r < 0)
4447                         return r;
4448         }
4449
4450         return 1;
4451 }
4452
4453 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4454         assert(r);
4455         assert(r->n_ref > 0);
4456
4457         r->n_ref++;
4458         return r;
4459 }
4460
4461 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4462
4463         if (!r)
4464                 return NULL;
4465
4466         assert(r->n_ref > 0);
4467
4468         r->n_ref--;
4469         if (r->n_ref > 0)
4470                 return NULL;
4471
4472         free(r->tmp_dir);
4473         free(r->var_tmp_dir);
4474         safe_close_pair(r->netns_storage_socket);
4475         return mfree(r);
4476 }
4477
4478 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4479         assert(u);
4480         assert(f);
4481         assert(fds);
4482
4483         if (!rt)
4484                 return 0;
4485
4486         if (rt->tmp_dir)
4487                 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4488
4489         if (rt->var_tmp_dir)
4490                 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4491
4492         if (rt->netns_storage_socket[0] >= 0) {
4493                 int copy;
4494
4495                 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4496                 if (copy < 0)
4497                         return copy;
4498
4499                 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4500         }
4501
4502         if (rt->netns_storage_socket[1] >= 0) {
4503                 int copy;
4504
4505                 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4506                 if (copy < 0)
4507                         return copy;
4508
4509                 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4510         }
4511
4512         return 0;
4513 }
4514
4515 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4516         int r;
4517
4518         assert(rt);
4519         assert(key);
4520         assert(value);
4521
4522         if (streq(key, "tmp-dir")) {
4523                 char *copy;
4524
4525                 r = exec_runtime_allocate(rt);
4526                 if (r < 0)
4527                         return log_oom();
4528
4529                 copy = strdup(value);
4530                 if (!copy)
4531                         return log_oom();
4532
4533                 free((*rt)->tmp_dir);
4534                 (*rt)->tmp_dir = copy;
4535
4536         } else if (streq(key, "var-tmp-dir")) {
4537                 char *copy;
4538
4539                 r = exec_runtime_allocate(rt);
4540                 if (r < 0)
4541                         return log_oom();
4542
4543                 copy = strdup(value);
4544                 if (!copy)
4545                         return log_oom();
4546
4547                 free((*rt)->var_tmp_dir);
4548                 (*rt)->var_tmp_dir = copy;
4549
4550         } else if (streq(key, "netns-socket-0")) {
4551                 int fd;
4552
4553                 r = exec_runtime_allocate(rt);
4554                 if (r < 0)
4555                         return log_oom();
4556
4557                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4558                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4559                 else {
4560                         safe_close((*rt)->netns_storage_socket[0]);
4561                         (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4562                 }
4563         } else if (streq(key, "netns-socket-1")) {
4564                 int fd;
4565
4566                 r = exec_runtime_allocate(rt);
4567                 if (r < 0)
4568                         return log_oom();
4569
4570                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4571                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4572                 else {
4573                         safe_close((*rt)->netns_storage_socket[1]);
4574                         (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4575                 }
4576         } else
4577                 return 0;
4578
4579         return 1;
4580 }
4581
4582 static void *remove_tmpdir_thread(void *p) {
4583         _cleanup_free_ char *path = p;
4584
4585         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4586         return NULL;
4587 }
4588
4589 void exec_runtime_destroy(ExecRuntime *rt) {
4590         int r;
4591
4592         if (!rt)
4593                 return;
4594
4595         /* If there are multiple users of this, let's leave the stuff around */
4596         if (rt->n_ref > 1)
4597                 return;
4598
4599         if (rt->tmp_dir) {
4600                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4601
4602                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4603                 if (r < 0) {
4604                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4605                         free(rt->tmp_dir);
4606                 }
4607
4608                 rt->tmp_dir = NULL;
4609         }
4610
4611         if (rt->var_tmp_dir) {
4612                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4613
4614                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4615                 if (r < 0) {
4616                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4617                         free(rt->var_tmp_dir);
4618                 }
4619
4620                 rt->var_tmp_dir = NULL;
4621         }
4622
4623         safe_close_pair(rt->netns_storage_socket);
4624 }
4625
4626 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4627         [EXEC_INPUT_NULL] = "null",
4628         [EXEC_INPUT_TTY] = "tty",
4629         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4630         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4631         [EXEC_INPUT_SOCKET] = "socket",
4632         [EXEC_INPUT_NAMED_FD] = "fd",
4633 };
4634
4635 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4636
4637 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4638         [EXEC_OUTPUT_INHERIT] = "inherit",
4639         [EXEC_OUTPUT_NULL] = "null",
4640         [EXEC_OUTPUT_TTY] = "tty",
4641         [EXEC_OUTPUT_SYSLOG] = "syslog",
4642         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4643         [EXEC_OUTPUT_KMSG] = "kmsg",
4644         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4645         [EXEC_OUTPUT_JOURNAL] = "journal",
4646         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4647         [EXEC_OUTPUT_SOCKET] = "socket",
4648         [EXEC_OUTPUT_NAMED_FD] = "fd",
4649 };
4650
4651 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4652
4653 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4654         [EXEC_UTMP_INIT] = "init",
4655         [EXEC_UTMP_LOGIN] = "login",
4656         [EXEC_UTMP_USER] = "user",
4657 };
4658
4659 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4660
4661 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4662         [EXEC_PRESERVE_NO] = "no",
4663         [EXEC_PRESERVE_YES] = "yes",
4664         [EXEC_PRESERVE_RESTART] = "restart",
4665 };
4666
4667 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4668
4669 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
4670         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4671         [EXEC_DIRECTORY_STATE] = "StateDirectory",
4672         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4673         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4674         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4675 };
4676
4677 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4678
4679 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4680         [EXEC_KEYRING_INHERIT] = "inherit",
4681         [EXEC_KEYRING_PRIVATE] = "private",
4682         [EXEC_KEYRING_SHARED] = "shared",
4683 };
4684
4685 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);