src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2010 Lennart Poettering
   6 ***/
   7
   8 #include <errno.h>
   9 #include <fcntl.h>
  10 #include <glob.h>
  11 #include <grp.h>
  12 #include <poll.h>
  13 #include <signal.h>
  14 #include <string.h>
  15 #include <sys/capability.h>
  16 #include <sys/eventfd.h>
  17 #include <sys/mman.h>
  18 #include <sys/personality.h>
  19 #include <sys/prctl.h>
  20 #include <sys/shm.h>
  21 #include <sys/socket.h>
  22 #include <sys/stat.h>
  23 #include <sys/types.h>
  24 #include <sys/un.h>
  25 #include <unistd.h>
  26 #include <utmpx.h>
  27
  28 #if HAVE_PAM
  29 #include <security/pam_appl.h>
  30 #endif
  31
  32 #if HAVE_SELINUX
  33 #include <selinux/selinux.h>
  34 #endif
  35
  36 #if HAVE_SECCOMP
  37 #include <seccomp.h>
  38 #endif
  39
  40 #if HAVE_APPARMOR
  41 #include <sys/apparmor.h>
  42 #endif
  43
  44 #include "sd-messages.h"
  45
  46 #include "af-list.h"
  47 #include "alloc-util.h"
  48 #if HAVE_APPARMOR
  49 #include "apparmor-util.h"
  50 #endif
  51 #include "async.h"
  52 #include "barrier.h"
  53 #include "cap-list.h"
  54 #include "capability-util.h"
  55 #include "chown-recursive.h"
  56 #include "cpu-set-util.h"
  57 #include "def.h"
  58 #include "env-util.h"
  59 #include "errno-list.h"
  60 #include "execute.h"
  61 #include "exit-status.h"
  62 #include "fd-util.h"
  63 #include "fileio.h"
  64 #include "format-util.h"
  65 #include "fs-util.h"
  66 #include "glob-util.h"
  67 #include "io-util.h"
  68 #include "ioprio.h"
  69 #include "label.h"
  70 #include "log.h"
  71 #include "macro.h"
  72 #include "manager.h"
  73 #include "missing.h"
  74 #include "mkdir.h"
  75 #include "namespace.h"
  76 #include "parse-util.h"
  77 #include "path-util.h"
  78 #include "process-util.h"
  79 #include "rlimit-util.h"
  80 #include "rm-rf.h"
  81 #if HAVE_SECCOMP
  82 #include "seccomp-util.h"
  83 #endif
  84 #include "securebits.h"
  85 #include "securebits-util.h"
  86 #include "selinux-util.h"
  87 #include "signal-util.h"
  88 #include "smack-util.h"
  89 #include "socket-util.h"
  90 #include "special.h"
  91 #include "stat-util.h"
  92 #include "string-table.h"
  93 #include "string-util.h"
  94 #include "strv.h"
  95 #include "syslog-util.h"
  96 #include "terminal-util.h"
  97 #include "unit.h"
  98 #include "user-util.h"
  99 #include "util.h"
 100 #include "utmp-wtmp.h"
 101
 102 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
 103 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 104
 105 /* This assumes there is a 'tty' group */
 106 #define TTY_MODE 0620
 107
 108 #define SNDBUF_SIZE (8*1024*1024)
 109
 110 static int shift_fds(int fds[], size_t n_fds) {
 111         int start, restart_from;
 112
 113         if (n_fds <= 0)
 114                 return 0;
 115
 116         /* Modifies the fds array! (sorts it) */
 117
 118         assert(fds);
 119
 120         start = 0;
 121         for (;;) {
 122                 int i;
 123
 124                 restart_from = -1;
 125
 126                 for (i = start; i < (int) n_fds; i++) {
 127                         int nfd;
 128
 129                         /* Already at right index? */
 130                         if (fds[i] == i+3)
 131                                 continue;
 132
 133                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 134                         if (nfd < 0)
 135                                 return -errno;
 136
 137                         safe_close(fds[i]);
 138                         fds[i] = nfd;
 139
 140                         /* Hmm, the fd we wanted isn't free? Then
 141                          * let's remember that and try again from here */
 142                         if (nfd != i+3 && restart_from < 0)
 143                                 restart_from = i;
 144                 }
 145
 146                 if (restart_from < 0)
 147                         break;
 148
 149                 start = restart_from;
 150         }
 151
 152         return 0;
 153 }
 154
 155 static int flags_fds(const int fds[], size_t n_storage_fds, size_t n_socket_fds, bool nonblock) {
 156         size_t i, n_fds;
 157         int r;
 158
 159         n_fds = n_storage_fds + n_socket_fds;
 160         if (n_fds <= 0)
 161                 return 0;
 162
 163         assert(fds);
 164
 165         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 166          * O_NONBLOCK only applies to socket activation though. */
 167
 168         for (i = 0; i < n_fds; i++) {
 169
 170                 if (i < n_socket_fds) {
 171                         r = fd_nonblock(fds[i], nonblock);
 172                         if (r < 0)
 173                                 return r;
 174                 }
 175
 176                 /* We unconditionally drop FD_CLOEXEC from the fds,
 177                  * since after all we want to pass these fds to our
 178                  * children */
 179
 180                 r = fd_cloexec(fds[i], false);
 181                 if (r < 0)
 182                         return r;
 183         }
 184
 185         return 0;
 186 }
 187
 188 static const char *exec_context_tty_path(const ExecContext *context) {
 189         assert(context);
 190
 191         if (context->stdio_as_fds)
 192                 return NULL;
 193
 194         if (context->tty_path)
 195                 return context->tty_path;
 196
 197         return "/dev/console";
 198 }
 199
 200 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 201         const char *path;
 202
 203         assert(context);
 204
 205         path = exec_context_tty_path(context);
 206
 207         if (context->tty_vhangup) {
 208                 if (p && p->stdin_fd >= 0)
 209                         (void) terminal_vhangup_fd(p->stdin_fd);
 210                 else if (path)
 211                         (void) terminal_vhangup(path);
 212         }
 213
 214         if (context->tty_reset) {
 215                 if (p && p->stdin_fd >= 0)
 216                         (void) reset_terminal_fd(p->stdin_fd, true);
 217                 else if (path)
 218                         (void) reset_terminal(path);
 219         }
 220
 221         if (context->tty_vt_disallocate && path)
 222                 (void) vt_disallocate(path);
 223 }
 224
 225 static bool is_terminal_input(ExecInput i) {
 226         return IN_SET(i,
 227                       EXEC_INPUT_TTY,
 228                       EXEC_INPUT_TTY_FORCE,
 229                       EXEC_INPUT_TTY_FAIL);
 230 }
 231
 232 static bool is_terminal_output(ExecOutput o) {
 233         return IN_SET(o,
 234                       EXEC_OUTPUT_TTY,
 235                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 236                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 237                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 238 }
 239
 240 static bool is_syslog_output(ExecOutput o) {
 241         return IN_SET(o,
 242                       EXEC_OUTPUT_SYSLOG,
 243                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 244 }
 245
 246 static bool is_kmsg_output(ExecOutput o) {
 247         return IN_SET(o,
 248                       EXEC_OUTPUT_KMSG,
 249                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 250 }
 251
 252 static bool exec_context_needs_term(const ExecContext *c) {
 253         assert(c);
 254
 255         /* Return true if the execution context suggests we should set $TERM to something useful. */
 256
 257         if (is_terminal_input(c->std_input))
 258                 return true;
 259
 260         if (is_terminal_output(c->std_output))
 261                 return true;
 262
 263         if (is_terminal_output(c->std_error))
 264                 return true;
 265
 266         return !!c->tty_path;
 267 }
 268
 269 static int open_null_as(int flags, int nfd) {
 270         int fd;
 271
 272         assert(nfd >= 0);
 273
 274         fd = open("/dev/null", flags|O_NOCTTY);
 275         if (fd < 0)
 276                 return -errno;
 277
 278         return move_fd(fd, nfd, false);
 279 }
 280
 281 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 282         static const union sockaddr_union sa = {
 283                 .un.sun_family = AF_UNIX,
 284                 .un.sun_path = "/run/systemd/journal/stdout",
 285         };
 286         uid_t olduid = UID_INVALID;
 287         gid_t oldgid = GID_INVALID;
 288         int r;
 289
 290         if (gid_is_valid(gid)) {
 291                 oldgid = getgid();
 292
 293                 if (setegid(gid) < 0)
 294                         return -errno;
 295         }
 296
 297         if (uid_is_valid(uid)) {
 298                 olduid = getuid();
 299
 300                 if (seteuid(uid) < 0) {
 301                         r = -errno;
 302                         goto restore_gid;
 303                 }
 304         }
 305
 306         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 307
 308         /* If we fail to restore the uid or gid, things will likely
 309            fail later on. This should only happen if an LSM interferes. */
 310
 311         if (uid_is_valid(uid))
 312                 (void) seteuid(olduid);
 313
 314  restore_gid:
 315         if (gid_is_valid(gid))
 316                 (void) setegid(oldgid);
 317
 318         return r;
 319 }
 320
 321 static int connect_logger_as(
 322                 const Unit *unit,
 323                 const ExecContext *context,
 324                 const ExecParameters *params,
 325                 ExecOutput output,
 326                 const char *ident,
 327                 int nfd,
 328                 uid_t uid,
 329                 gid_t gid) {
 330
 331         int fd, r;
 332
 333         assert(context);
 334         assert(params);
 335         assert(output < _EXEC_OUTPUT_MAX);
 336         assert(ident);
 337         assert(nfd >= 0);
 338
 339         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 340         if (fd < 0)
 341                 return -errno;
 342
 343         r = connect_journal_socket(fd, uid, gid);
 344         if (r < 0)
 345                 return r;
 346
 347         if (shutdown(fd, SHUT_RD) < 0) {
 348                 safe_close(fd);
 349                 return -errno;
 350         }
 351
 352         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 353
 354         dprintf(fd,
 355                 "%s\n"
 356                 "%s\n"
 357                 "%i\n"
 358                 "%i\n"
 359                 "%i\n"
 360                 "%i\n"
 361                 "%i\n",
 362                 context->syslog_identifier ?: ident,
 363                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 364                 context->syslog_priority,
 365                 !!context->syslog_level_prefix,
 366                 is_syslog_output(output),
 367                 is_kmsg_output(output),
 368                 is_terminal_output(output));
 369
 370         return move_fd(fd, nfd, false);
 371 }
 372 static int open_terminal_as(const char *path, int flags, int nfd) {
 373         int fd;
 374
 375         assert(path);
 376         assert(nfd >= 0);
 377
 378         fd = open_terminal(path, flags | O_NOCTTY);
 379         if (fd < 0)
 380                 return fd;
 381
 382         return move_fd(fd, nfd, false);
 383 }
 384
 385 static int acquire_path(const char *path, int flags, mode_t mode) {
 386         union sockaddr_union sa = {
 387                 .sa.sa_family = AF_UNIX,
 388         };
 389         int fd, r;
 390
 391         assert(path);
 392
 393         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 394                 flags |= O_CREAT;
 395
 396         fd = open(path, flags|O_NOCTTY, mode);
 397         if (fd >= 0)
 398                 return fd;
 399
 400         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 401                 return -errno;
 402         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 403                 return -ENXIO;
 404
 405         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 406
 407         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 408         if (fd < 0)
 409                 return -errno;
 410
 411         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 412         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 413                 safe_close(fd);
 414                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 415                                                            * indication that his wasn't an AF_UNIX socket after all */
 416         }
 417
 418         if ((flags & O_ACCMODE) == O_RDONLY)
 419                 r = shutdown(fd, SHUT_WR);
 420         else if ((flags & O_ACCMODE) == O_WRONLY)
 421                 r = shutdown(fd, SHUT_RD);
 422         else
 423                 return fd;
 424         if (r < 0) {
 425                 safe_close(fd);
 426                 return -errno;
 427         }
 428
 429         return fd;
 430 }
 431
 432 static int fixup_input(
 433                 const ExecContext *context,
 434                 int socket_fd,
 435                 bool apply_tty_stdin) {
 436
 437         ExecInput std_input;
 438
 439         assert(context);
 440
 441         std_input = context->std_input;
 442
 443         if (is_terminal_input(std_input) && !apply_tty_stdin)
 444                 return EXEC_INPUT_NULL;
 445
 446         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 447                 return EXEC_INPUT_NULL;
 448
 449         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 450                 return EXEC_INPUT_NULL;
 451
 452         return std_input;
 453 }
 454
 455 static int fixup_output(ExecOutput std_output, int socket_fd) {
 456
 457         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 458                 return EXEC_OUTPUT_INHERIT;
 459
 460         return std_output;
 461 }
 462
 463 static int setup_input(
 464                 const ExecContext *context,
 465                 const ExecParameters *params,
 466                 int socket_fd,
 467                 int named_iofds[3]) {
 468
 469         ExecInput i;
 470
 471         assert(context);
 472         assert(params);
 473
 474         if (params->stdin_fd >= 0) {
 475                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 476                         return -errno;
 477
 478                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 479                 if (isatty(STDIN_FILENO)) {
 480                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 481                         (void) reset_terminal_fd(STDIN_FILENO, true);
 482                 }
 483
 484                 return STDIN_FILENO;
 485         }
 486
 487         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 488
 489         switch (i) {
 490
 491         case EXEC_INPUT_NULL:
 492                 return open_null_as(O_RDONLY, STDIN_FILENO);
 493
 494         case EXEC_INPUT_TTY:
 495         case EXEC_INPUT_TTY_FORCE:
 496         case EXEC_INPUT_TTY_FAIL: {
 497                 int fd;
 498
 499                 fd = acquire_terminal(exec_context_tty_path(context),
 500                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 501                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 502                                                                   ACQUIRE_TERMINAL_WAIT,
 503                                       USEC_INFINITY);
 504                 if (fd < 0)
 505                         return fd;
 506
 507                 return move_fd(fd, STDIN_FILENO, false);
 508         }
 509
 510         case EXEC_INPUT_SOCKET:
 511                 assert(socket_fd >= 0);
 512
 513                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 514
 515         case EXEC_INPUT_NAMED_FD:
 516                 assert(named_iofds[STDIN_FILENO] >= 0);
 517
 518                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 519                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 520
 521         case EXEC_INPUT_DATA: {
 522                 int fd;
 523
 524                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 525                 if (fd < 0)
 526                         return fd;
 527
 528                 return move_fd(fd, STDIN_FILENO, false);
 529         }
 530
 531         case EXEC_INPUT_FILE: {
 532                 bool rw;
 533                 int fd;
 534
 535                 assert(context->stdio_file[STDIN_FILENO]);
 536
 537                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 538                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 539
 540                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 541                 if (fd < 0)
 542                         return fd;
 543
 544                 return move_fd(fd, STDIN_FILENO, false);
 545         }
 546
 547         default:
 548                 assert_not_reached("Unknown input type");
 549         }
 550 }
 551
/* Sets up stdout or stderr (selected by 'fileno') of the process according to the execution context.
 *
 * An explicit fd in params->stdout_fd or params->stderr_fd takes precedence for the respective stream.
 * Otherwise the configured output types (after fixup_output()) and the effective input type (after
 * fixup_input()) decide what gets installed.
 *
 * unit               - unit we are operating for, used for logging and passed on to connect_logger_as()
 * socket_fd          - socket to use for EXEC_OUTPUT_SOCKET
 * named_iofds        - pre-opened fds to use for EXEC_OUTPUT_NAMED_FD, indexed by stdio fd number
 * ident, uid, gid    - passed on to connect_logger_as() for the log stream output types
 * journal_stream_dev,
 * journal_stream_ino - updated with the device/inode of the fd if it got connected to the journal
 *
 * Returns a negative errno-style value on failure. On success the paths that duplicate an fd directly
 * return 'fileno', the others return whatever open_null_as()/open_terminal_as()/move_fd() report. */
static int setup_output(
                const Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds win over anything configured in the context */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on operate on the stderr setting */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
                if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal already, share it */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not being able to log is not fatal: warn and fall back to /dev/null */
                        log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not.
                         *
                         * If both stdout and stderr are connected to a stream then let's make sure to store the data
                         * about STDERR as that's usually the best way to do logging. */

                        if (fstat(fileno, &st) >= 0 &&
                            (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);

                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                assert(named_iofds[fileno] >= 0);

                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_FILE: {
                bool rw;
                int fd;

                assert(context->stdio_file[fileno]);

                /* If stdin is configured to come from the very same file, duplicate stdin instead of
                 * opening the file a second time */
                rw = context->std_input == EXEC_INPUT_FILE &&
                        streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);

                if (rw)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
                if (fd < 0)
                        return fd;

                return move_fd(fd, fileno, false);
        }

        default:
                assert_not_reached("Unknown error type");
        }
}
 706
 707 static int chown_terminal(int fd, uid_t uid) {
 708         struct stat st;
 709
 710         assert(fd >= 0);
 711
 712         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 713         if (isatty(fd) < 1)
 714                 return 0;
 715
 716         /* This might fail. What matters are the results. */
 717         (void) fchown(fd, uid, -1);
 718         (void) fchmod(fd, TTY_MODE);
 719
 720         if (fstat(fd, &st) < 0)
 721                 return -errno;
 722
 723         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 724                 return -EPERM;
 725
 726         return 0;
 727 }
 728
 729 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 730         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 731         int r;
 732
 733         assert(_saved_stdin);
 734         assert(_saved_stdout);
 735
 736         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 737         if (saved_stdin < 0)
 738                 return -errno;
 739
 740         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 741         if (saved_stdout < 0)
 742                 return -errno;
 743
 744         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 745         if (fd < 0)
 746                 return fd;
 747
 748         r = chown_terminal(fd, getuid());
 749         if (r < 0)
 750                 return r;
 751
 752         r = reset_terminal_fd(fd, true);
 753         if (r < 0)
 754                 return r;
 755
 756         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 757         fd = -1;
 758         if (r < 0)
 759                 return r;
 760
 761         *_saved_stdin = saved_stdin;
 762         *_saved_stdout = saved_stdout;
 763
 764         saved_stdin = saved_stdout = -1;
 765
 766         return 0;
 767 }
 768
 769 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 770         assert(err < 0);
 771
 772         if (err == -ETIMEDOUT)
 773                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 774         else {
 775                 errno = -err;
 776                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 777         }
 778 }
 779
 780 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 781         _cleanup_close_ int fd = -1;
 782
 783         assert(vc);
 784
 785         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 786         if (fd < 0)
 787                 return;
 788
 789         write_confirm_error_fd(err, fd, u);
 790 }
 791
 792 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 793         int r = 0;
 794
 795         assert(saved_stdin);
 796         assert(saved_stdout);
 797
 798         release_terminal();
 799
 800         if (*saved_stdin >= 0)
 801                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 802                         r = -errno;
 803
 804         if (*saved_stdout >= 0)
 805                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 806                         r = -errno;
 807
 808         *saved_stdin = safe_close(*saved_stdin);
 809         *saved_stdout = safe_close(*saved_stdout);
 810
 811         return r;
 812 }
 813
 814 enum {
 815         CONFIRM_PRETEND_FAILURE = -1,
 816         CONFIRM_PRETEND_SUCCESS =  0,
 817         CONFIRM_EXECUTE = 1,
 818 };
 819
 820 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 821         int saved_stdout = -1, saved_stdin = -1, r;
 822         _cleanup_free_ char *e = NULL;
 823         char c;
 824
 825         /* For any internal errors, assume a positive response. */
 826         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 827         if (r < 0) {
 828                 write_confirm_error(r, vc, u);
 829                 return CONFIRM_EXECUTE;
 830         }
 831
 832         /* confirm_spawn might have been disabled while we were sleeping. */
 833         if (manager_is_confirm_spawn_disabled(u->manager)) {
 834                 r = 1;
 835                 goto restore_stdio;
 836         }
 837
 838         e = ellipsize(cmdline, 60, 100);
 839         if (!e) {
 840                 log_oom();
 841                 r = CONFIRM_EXECUTE;
 842                 goto restore_stdio;
 843         }
 844
 845         for (;;) {
 846                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 847                 if (r < 0) {
 848                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 849                         r = CONFIRM_EXECUTE;
 850                         goto restore_stdio;
 851                 }
 852
 853                 switch (c) {
 854                 case 'c':
 855                         printf("Resuming normal execution.\n");
 856                         manager_disable_confirm_spawn();
 857                         r = 1;
 858                         break;
 859                 case 'D':
 860                         unit_dump(u, stdout, "  ");
 861                         continue; /* ask again */
 862                 case 'f':
 863                         printf("Failing execution.\n");
 864                         r = CONFIRM_PRETEND_FAILURE;
 865                         break;
 866                 case 'h':
 867                         printf("  c - continue, proceed without asking anymore\n"
 868                                "  D - dump, show the state of the unit\n"
 869                                "  f - fail, don't execute the command and pretend it failed\n"
 870                                "  h - help\n"
 871                                "  i - info, show a short summary of the unit\n"
 872                                "  j - jobs, show jobs that are in progress\n"
 873                                "  s - skip, don't execute the command and pretend it succeeded\n"
 874                                "  y - yes, execute the command\n");
 875                         continue; /* ask again */
 876                 case 'i':
 877                         printf("  Description: %s\n"
 878                                "  Unit:        %s\n"
 879                                "  Command:     %s\n",
 880                                u->id, u->description, cmdline);
 881                         continue; /* ask again */
 882                 case 'j':
 883                         manager_dump_jobs(u->manager, stdout, "  ");
 884                         continue; /* ask again */
 885                 case 'n':
 886                         /* 'n' was removed in favor of 'f'. */
 887                         printf("Didn't understand 'n', did you mean 'f'?\n");
 888                         continue; /* ask again */
 889                 case 's':
 890                         printf("Skipping execution.\n");
 891                         r = CONFIRM_PRETEND_SUCCESS;
 892                         break;
 893                 case 'y':
 894                         r = CONFIRM_EXECUTE;
 895                         break;
 896                 default:
 897                         assert_not_reached("Unhandled choice");
 898                 }
 899                 break;
 900         }
 901
 902 restore_stdio:
 903         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 904         return r;
 905 }
 906
 907 static int get_fixed_user(const ExecContext *c, const char **user,
 908                           uid_t *uid, gid_t *gid,
 909                           const char **home, const char **shell) {
 910         int r;
 911         const char *name;
 912
 913         assert(c);
 914
 915         if (!c->user)
 916                 return 0;
 917
 918         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 919          * (i.e. are "/" or "/bin/nologin"). */
 920
 921         name = c->user;
 922         r = get_user_creds_clean(&name, uid, gid, home, shell);
 923         if (r < 0)
 924                 return r;
 925
 926         *user = name;
 927         return 0;
 928 }
 929
 930 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 931         int r;
 932         const char *name;
 933
 934         assert(c);
 935
 936         if (!c->group)
 937                 return 0;
 938
 939         name = c->group;
 940         r = get_group_creds(&name, gid);
 941         if (r < 0)
 942                 return r;
 943
 944         *group = name;
 945         return 0;
 946 }
 947
 948 static int get_supplementary_groups(const ExecContext *c, const char *user,
 949                                     const char *group, gid_t gid,
 950                                     gid_t **supplementary_gids, int *ngids) {
 951         char **i;
 952         int r, k = 0;
 953         int ngroups_max;
 954         bool keep_groups = false;
 955         gid_t *groups = NULL;
 956         _cleanup_free_ gid_t *l_gids = NULL;
 957
 958         assert(c);
 959
 960         /*
 961          * If user is given, then lookup GID and supplementary groups list.
 962          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 963          * here and as early as possible so we keep the list of supplementary
 964          * groups of the caller.
 965          */
 966         if (user && gid_is_valid(gid) && gid != 0) {
 967                 /* First step, initialize groups from /etc/groups */
 968                 if (initgroups(user, gid) < 0)
 969                         return -errno;
 970
 971                 keep_groups = true;
 972         }
 973
 974         if (strv_isempty(c->supplementary_groups))
 975                 return 0;
 976
 977         /*
 978          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 979          * be positive, otherwise fail.
 980          */
 981         errno = 0;
 982         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 983         if (ngroups_max <= 0) {
 984                 if (errno > 0)
 985                         return -errno;
 986                 else
 987                         return -EOPNOTSUPP; /* For all other values */
 988         }
 989
 990         l_gids = new(gid_t, ngroups_max);
 991         if (!l_gids)
 992                 return -ENOMEM;
 993
 994         if (keep_groups) {
 995                 /*
 996                  * Lookup the list of groups that the user belongs to, we
 997                  * avoid NSS lookups here too for gid=0.
 998                  */
 999                 k = ngroups_max;
1000                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1001                         return -EINVAL;
1002         } else
1003                 k = 0;
1004
1005         STRV_FOREACH(i, c->supplementary_groups) {
1006                 const char *g;
1007
1008                 if (k >= ngroups_max)
1009                         return -E2BIG;
1010
1011                 g = *i;
1012                 r = get_group_creds(&g, l_gids+k);
1013                 if (r < 0)
1014                         return r;
1015
1016                 k++;
1017         }
1018
1019         /*
1020          * Sets ngids to zero to drop all supplementary groups, happens
1021          * when we are under root and SupplementaryGroups= is empty.
1022          */
1023         if (k == 0) {
1024                 *ngids = 0;
1025                 return 0;
1026         }
1027
1028         /* Otherwise get the final list of supplementary groups */
1029         groups = memdup(l_gids, sizeof(gid_t) * k);
1030         if (!groups)
1031                 return -ENOMEM;
1032
1033         *supplementary_gids = groups;
1034         *ngids = k;
1035
1036         groups = NULL;
1037
1038         return 0;
1039 }
1040
/* Applies the supplementary group list (if ngids > 0) and then the real, effective and saved GID (if gid
 * is valid) to the calling process. Returns 0 on success, a negative errno-style value otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1059
1060 static int enforce_user(const ExecContext *context, uid_t uid) {
1061         assert(context);
1062
1063         if (!uid_is_valid(uid))
1064                 return 0;
1065
1066         /* Sets (but doesn't look up) the uid and make sure we keep the
1067          * capabilities while doing so. */
1068
1069         if (context->capability_ambient_set != 0) {
1070
1071                 /* First step: If we need to keep capabilities but
1072                  * drop privileges we need to make sure we keep our
1073                  * caps, while we drop privileges. */
1074                 if (uid != 0) {
1075                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1076
1077                         if (prctl(PR_GET_SECUREBITS) != sb)
1078                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1079                                         return -errno;
1080                 }
1081         }
1082
1083         /* Second step: actually set the uids */
1084         if (setresuid(uid, uid, uid) < 0)
1085                 return -errno;
1086
1087         /* At this point we should have all necessary capabilities but
1088            are otherwise a normal user. However, the caps might got
1089            corrupted due to the setresuid() so we need clean them up
1090            later. This is done outside of this call. */
1091
1092         return 0;
1093 }
1094
1095 #if HAVE_PAM
1096
1097 static int null_conv(
1098                 int num_msg,
1099                 const struct pam_message **msg,
1100                 struct pam_response **resp,
1101                 void *appdata_ptr) {
1102
1103         /* We don't support conversations */
1104
1105         return PAM_CONV_ERR;
1106 }
1107
1108 #endif
1109
/* Opens a PAM session for the service's user and forks off a small "(sd-pam)" helper process that closes
 * the session again once this process has gone away.
 *
 *   name       PAM service name handed to pam_start()
 *   user       user name handed to pam_start()
 *   uid, gid   credentials the helper process switches to before it starts waiting
 *   tty        optional TTY path; set as PAM_TTY if non-NULL
 *   env        in: variables exported to PAM via pam_putenv(); out: replaced by the list returned by
 *              pam_getenvlist() on success
 *   fds, n_fds file descriptors that are closed in the helper process
 *
 * On failure a negative errno-style value is returned (-EPERM for any PAM error code). On success the
 * result of strv_free_and_replace() is returned. When built without PAM support this is a no-op that
 * returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment assembled so far to the PAM modules. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. */
                                k = sigwait(&ss, &sig);
                                if (k == 0)
                                        break;
                                if (k == EINTR)
                                        continue;

                                goto child_finish;
                        }

                        /* Only reached after a successful sigwait(), hence sig is initialized. */
                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* A PAM error code takes precedence over r, which is then overridden with -EPERM. */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1315
/* Renames the current process to "(<name>)", where <name> is at most the last 8 characters of the
 * final component of path. If that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *tail;
        size_t n;

        /* This resulting string must fit in 10 chars (i.e. the length
         * of "/sbin/init") to look pretty in /bin/ps */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* The end of the process name is usually more
                 * interesting, since the first bit might just be
                 * "systemd-" */
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1346
1347 static bool context_has_address_families(const ExecContext *c) {
1348         assert(c);
1349
1350         return c->address_families_whitelist ||
1351                 !set_isempty(c->address_families);
1352 }
1353
1354 static bool context_has_syscall_filters(const ExecContext *c) {
1355         assert(c);
1356
1357         return c->syscall_whitelist ||
1358                 !hashmap_isempty(c->syscall_filter);
1359 }
1360
/* Decides whether "no new privileges" has to be in effect for this context: either because it was
 * requested explicitly, or because we lack CAP_SYS_ADMIN and one of the settings checked below is
 * enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* An explicit request always wins. */
        if (c->no_new_privileges)
                return true;

        /* if we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We need NNP if we have any form of seccomp and are unprivileged */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1382
1383 #if HAVE_SECCOMP
1384
1385 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1386
1387         if (is_seccomp_available())
1388                 return false;
1389
1390         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1391         return true;
1392 }
1393
1394 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1395         uint32_t negative_action, default_action, action;
1396         int r;
1397
1398         assert(u);
1399         assert(c);
1400
1401         if (!context_has_syscall_filters(c))
1402                 return 0;
1403
1404         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1405                 return 0;
1406
1407         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1408
1409         if (c->syscall_whitelist) {
1410                 default_action = negative_action;
1411                 action = SCMP_ACT_ALLOW;
1412         } else {
1413                 default_action = SCMP_ACT_ALLOW;
1414                 action = negative_action;
1415         }
1416
1417         if (needs_ambient_hack) {
1418                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1419                 if (r < 0)
1420                         return r;
1421         }
1422
1423         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1424 }
1425
1426 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1427         assert(u);
1428         assert(c);
1429
1430         if (set_isempty(c->syscall_archs))
1431                 return 0;
1432
1433         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1434                 return 0;
1435
1436         return seccomp_restrict_archs(c->syscall_archs);
1437 }
1438
1439 static int apply_address_families(const Unit* u, const ExecContext *c) {
1440         assert(u);
1441         assert(c);
1442
1443         if (!context_has_address_families(c))
1444                 return 0;
1445
1446         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1447                 return 0;
1448
1449         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1450 }
1451
1452 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1453         assert(u);
1454         assert(c);
1455
1456         if (!c->memory_deny_write_execute)
1457                 return 0;
1458
1459         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1460                 return 0;
1461
1462         return seccomp_memory_deny_write_execute();
1463 }
1464
1465 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1466         assert(u);
1467         assert(c);
1468
1469         if (!c->restrict_realtime)
1470                 return 0;
1471
1472         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1473                 return 0;
1474
1475         return seccomp_restrict_realtime();
1476 }
1477
1478 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1479         assert(u);
1480         assert(c);
1481
1482         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1483          * let's protect even those systems where this is left on in the kernel. */
1484
1485         if (!c->protect_kernel_tunables)
1486                 return 0;
1487
1488         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1489                 return 0;
1490
1491         return seccomp_protect_sysctl();
1492 }
1493
1494 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1495         assert(u);
1496         assert(c);
1497
1498         /* Turn off module syscalls on ProtectKernelModules=yes */
1499
1500         if (!c->protect_kernel_modules)
1501                 return 0;
1502
1503         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1504                 return 0;
1505
1506         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1507 }
1508
1509 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1510         assert(u);
1511         assert(c);
1512
1513         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1514
1515         if (!c->private_devices)
1516                 return 0;
1517
1518         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1519                 return 0;
1520
1521         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1522 }
1523
1524 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1525         assert(u);
1526         assert(c);
1527
1528         if (!exec_context_restrict_namespaces_set(c))
1529                 return 0;
1530
1531         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1532                 return 0;
1533
1534         return seccomp_restrict_namespaces(c->restrict_namespaces);
1535 }
1536
1537 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1538         unsigned long personality;
1539         int r;
1540
1541         assert(u);
1542         assert(c);
1543
1544         if (!c->lock_personality)
1545                 return 0;
1546
1547         if (skip_seccomp_unavailable(u, "LockPersonality="))
1548                 return 0;
1549
1550         personality = c->personality;
1551
1552         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1553         if (personality == PERSONALITY_INVALID) {
1554
1555                 r = opinionated_personality(&personality);
1556                 if (r < 0)
1557                         return r;
1558         }
1559
1560         return seccomp_lock_personality(personality);
1561 }
1562
1563 #endif
1564
1565 static void do_idle_pipe_dance(int idle_pipe[4]) {
1566         assert(idle_pipe);
1567
1568         idle_pipe[1] = safe_close(idle_pipe[1]);
1569         idle_pipe[2] = safe_close(idle_pipe[2]);
1570
1571         if (idle_pipe[0] >= 0) {
1572                 int r;
1573
1574                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1575
1576                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1577                         ssize_t n;
1578
1579                         /* Signal systemd that we are bored and want to continue. */
1580                         n = write(idle_pipe[3], "x", 1);
1581                         if (n > 0)
1582                                 /* Wait for systemd to react to the signal above. */
1583                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1584                 }
1585
1586                 idle_pipe[0] = safe_close(idle_pipe[0]);
1587
1588         }
1589
1590         idle_pipe[3] = safe_close(idle_pipe[3]);
1591 }
1592
1593 static int build_environment(
1594                 const Unit *u,
1595                 const ExecContext *c,
1596                 const ExecParameters *p,
1597                 size_t n_fds,
1598                 const char *home,
1599                 const char *username,
1600                 const char *shell,
1601                 dev_t journal_stream_dev,
1602                 ino_t journal_stream_ino,
1603                 char ***ret) {
1604
1605         _cleanup_strv_free_ char **our_env = NULL;
1606         size_t n_env = 0;
1607         char *x;
1608
1609         assert(u);
1610         assert(c);
1611         assert(ret);
1612
1613         our_env = new0(char*, 14);
1614         if (!our_env)
1615                 return -ENOMEM;
1616
1617         if (n_fds > 0) {
1618                 _cleanup_free_ char *joined = NULL;
1619
1620                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1621                         return -ENOMEM;
1622                 our_env[n_env++] = x;
1623
1624                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1625                         return -ENOMEM;
1626                 our_env[n_env++] = x;
1627
1628                 joined = strv_join(p->fd_names, ":");
1629                 if (!joined)
1630                         return -ENOMEM;
1631
1632                 x = strjoin("LISTEN_FDNAMES=", joined);
1633                 if (!x)
1634                         return -ENOMEM;
1635                 our_env[n_env++] = x;
1636         }
1637
1638         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1639                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1640                         return -ENOMEM;
1641                 our_env[n_env++] = x;
1642
1643                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1644                         return -ENOMEM;
1645                 our_env[n_env++] = x;
1646         }
1647
1648         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1649          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1650          * check the database directly. */
1651         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1652                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1653                 if (!x)
1654                         return -ENOMEM;
1655                 our_env[n_env++] = x;
1656         }
1657
1658         if (home) {
1659                 x = strappend("HOME=", home);
1660                 if (!x)
1661                         return -ENOMEM;
1662                 our_env[n_env++] = x;
1663         }
1664
1665         if (username) {
1666                 x = strappend("LOGNAME=", username);
1667                 if (!x)
1668                         return -ENOMEM;
1669                 our_env[n_env++] = x;
1670
1671                 x = strappend("USER=", username);
1672                 if (!x)
1673                         return -ENOMEM;
1674                 our_env[n_env++] = x;
1675         }
1676
1677         if (shell) {
1678                 x = strappend("SHELL=", shell);
1679                 if (!x)
1680                         return -ENOMEM;
1681                 our_env[n_env++] = x;
1682         }
1683
1684         if (!sd_id128_is_null(u->invocation_id)) {
1685                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1686                         return -ENOMEM;
1687
1688                 our_env[n_env++] = x;
1689         }
1690
1691         if (exec_context_needs_term(c)) {
1692                 const char *tty_path, *term = NULL;
1693
1694                 tty_path = exec_context_tty_path(c);
1695
1696                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1697                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1698                  * passes to PID 1 ends up all the way in the console login shown. */
1699
1700                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1701                         term = getenv("TERM");
1702                 if (!term)
1703                         term = default_term_for_tty(tty_path);
1704
1705                 x = strappend("TERM=", term);
1706                 if (!x)
1707                         return -ENOMEM;
1708                 our_env[n_env++] = x;
1709         }
1710
1711         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1712                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1713                         return -ENOMEM;
1714
1715                 our_env[n_env++] = x;
1716         }
1717
1718         our_env[n_env++] = NULL;
1719         assert(n_env <= 12);
1720
1721         *ret = TAKE_PTR(our_env);
1722
1723         return 0;
1724 }
1725
1726 static int build_pass_environment(const ExecContext *c, char ***ret) {
1727         _cleanup_strv_free_ char **pass_env = NULL;
1728         size_t n_env = 0, n_bufsize = 0;
1729         char **i;
1730
1731         STRV_FOREACH(i, c->pass_environment) {
1732                 _cleanup_free_ char *x = NULL;
1733                 char *v;
1734
1735                 v = getenv(*i);
1736                 if (!v)
1737                         continue;
1738                 x = strjoin(*i, "=", v);
1739                 if (!x)
1740                         return -ENOMEM;
1741
1742                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1743                         return -ENOMEM;
1744
1745                 pass_env[n_env++] = TAKE_PTR(x);
1746                 pass_env[n_env] = NULL;
1747         }
1748
1749         *ret = TAKE_PTR(pass_env);
1750
1751         return 0;
1752 }
1753
1754 static bool exec_needs_mount_namespace(
1755                 const ExecContext *context,
1756                 const ExecParameters *params,
1757                 const ExecRuntime *runtime) {
1758
1759         assert(context);
1760         assert(params);
1761
1762         if (context->root_image)
1763                 return true;
1764
1765         if (!strv_isempty(context->read_write_paths) ||
1766             !strv_isempty(context->read_only_paths) ||
1767             !strv_isempty(context->inaccessible_paths))
1768                 return true;
1769
1770         if (context->n_bind_mounts > 0)
1771                 return true;
1772
1773         if (context->n_temporary_filesystems > 0)
1774                 return true;
1775
1776         if (context->mount_flags != 0)
1777                 return true;
1778
1779         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1780                 return true;
1781
1782         if (context->private_devices ||
1783             context->protect_system != PROTECT_SYSTEM_NO ||
1784             context->protect_home != PROTECT_HOME_NO ||
1785             context->protect_kernel_tunables ||
1786             context->protect_kernel_modules ||
1787             context->protect_control_groups)
1788                 return true;
1789
1790         if (context->mount_apivfs && (context->root_image || context->root_directory))
1791                 return true;
1792
1793         if (context->dynamic_user &&
1794             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1795              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1796              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1797                 return true;
1798
1799         return false;
1800 }
1801
1802 static int setup_private_users(uid_t uid, gid_t gid) {
1803         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1804         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1805         _cleanup_close_ int unshare_ready_fd = -1;
1806         _cleanup_(sigkill_waitp) pid_t pid = 0;
1807         uint64_t c = 1;
1808         ssize_t n;
1809         int r;
1810
1811         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1812          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1813          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1814          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1815          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1816          * continues execution normally. */
1817
1818         if (uid != 0 && uid_is_valid(uid)) {
1819                 r = asprintf(&uid_map,
1820                              "0 0 1\n"                      /* Map root → root */
1821                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1822                              uid, uid);
1823                 if (r < 0)
1824                         return -ENOMEM;
1825         } else {
1826                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1827                 if (!uid_map)
1828                         return -ENOMEM;
1829         }
1830
1831         if (gid != 0 && gid_is_valid(gid)) {
1832                 r = asprintf(&gid_map,
1833                              "0 0 1\n"                      /* Map root → root */
1834                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1835                              gid, gid);
1836                 if (r < 0)
1837                         return -ENOMEM;
1838         } else {
1839                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1840                 if (!gid_map)
1841                         return -ENOMEM;
1842         }
1843
1844         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1845          * namespace. */
1846         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1847         if (unshare_ready_fd < 0)
1848                 return -errno;
1849
1850         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1851          * failed. */
1852         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1853                 return -errno;
1854
1855         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1856         if (r < 0)
1857                 return r;
1858         if (r == 0) {
1859                 _cleanup_close_ int fd = -1;
1860                 const char *a;
1861                 pid_t ppid;
1862
1863                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1864                  * here, after the parent opened its own user namespace. */
1865
1866                 ppid = getppid();
1867                 errno_pipe[0] = safe_close(errno_pipe[0]);
1868
1869                 /* Wait until the parent unshared the user namespace */
1870                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1871                         r = -errno;
1872                         goto child_fail;
1873                 }
1874
1875                 /* Disable the setgroups() system call in the child user namespace, for good. */
1876                 a = procfs_file_alloca(ppid, "setgroups");
1877                 fd = open(a, O_WRONLY|O_CLOEXEC);
1878                 if (fd < 0) {
1879                         if (errno != ENOENT) {
1880                                 r = -errno;
1881                                 goto child_fail;
1882                         }
1883
1884                         /* If the file is missing the kernel is too old, let's continue anyway. */
1885                 } else {
1886                         if (write(fd, "deny\n", 5) < 0) {
1887                                 r = -errno;
1888                                 goto child_fail;
1889                         }
1890
1891                         fd = safe_close(fd);
1892                 }
1893
1894                 /* First write the GID map */
1895                 a = procfs_file_alloca(ppid, "gid_map");
1896                 fd = open(a, O_WRONLY|O_CLOEXEC);
1897                 if (fd < 0) {
1898                         r = -errno;
1899                         goto child_fail;
1900                 }
1901                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1902                         r = -errno;
1903                         goto child_fail;
1904                 }
1905                 fd = safe_close(fd);
1906
1907                 /* The write the UID map */
1908                 a = procfs_file_alloca(ppid, "uid_map");
1909                 fd = open(a, O_WRONLY|O_CLOEXEC);
1910                 if (fd < 0) {
1911                         r = -errno;
1912                         goto child_fail;
1913                 }
1914                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1915                         r = -errno;
1916                         goto child_fail;
1917                 }
1918
1919                 _exit(EXIT_SUCCESS);
1920
1921         child_fail:
1922                 (void) write(errno_pipe[1], &r, sizeof(r));
1923                 _exit(EXIT_FAILURE);
1924         }
1925
1926         errno_pipe[1] = safe_close(errno_pipe[1]);
1927
1928         if (unshare(CLONE_NEWUSER) < 0)
1929                 return -errno;
1930
1931         /* Let the child know that the namespace is ready now */
1932         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1933                 return -errno;
1934
1935         /* Try to read an error code from the child */
1936         n = read(errno_pipe[0], &r, sizeof(r));
1937         if (n < 0)
1938                 return -errno;
1939         if (n == sizeof(r)) { /* an error code was sent to us */
1940                 if (r < 0)
1941                         return r;
1942                 return -EIO;
1943         }
1944         if (n != 0) /* on success we should have read 0 bytes */
1945                 return -EIO;
1946
1947         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1948         pid = 0;
1949         if (r < 0)
1950                 return r;
1951         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1952                 return -EIO;
1953
1954         return 0;
1955 }
1956
1957 static int setup_exec_directory(
1958                 const ExecContext *context,
1959                 const ExecParameters *params,
1960                 uid_t uid,
1961                 gid_t gid,
1962                 ExecDirectoryType type,
1963                 int *exit_status) {
1964
1965         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1966                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1967                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1968                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1969                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1970                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1971         };
1972         char **rt;
1973         int r;
1974
1975         assert(context);
1976         assert(params);
1977         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1978         assert(exit_status);
1979
1980         if (!params->prefix[type])
1981                 return 0;
1982
1983         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1984                 if (!uid_is_valid(uid))
1985                         uid = 0;
1986                 if (!gid_is_valid(gid))
1987                         gid = 0;
1988         }
1989
1990         STRV_FOREACH(rt, context->directories[type].paths) {
1991                 _cleanup_free_ char *p = NULL, *pp = NULL;
1992
1993                 p = strjoin(params->prefix[type], "/", *rt);
1994                 if (!p) {
1995                         r = -ENOMEM;
1996                         goto fail;
1997                 }
1998
1999                 r = mkdir_parents_label(p, 0755);
2000                 if (r < 0)
2001                         goto fail;
2002
2003                 if (context->dynamic_user &&
2004                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2005                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2006
2007                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2008                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2009                          * whose UID is later on reused. To lock this down we use the same trick used by container
2010                          * managers to prohibit host users to get access to files of the same UID in containers: we
2011                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2012                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2013                          * to make this directory permeable for the service itself.
2014                          *
2015                          * Specifically: for a service which wants a special directory "foo/" we first create a
2016                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2017                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2018                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2019                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2020                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2021                          * disabling the access boundary for the service and making sure it only gets access to the
2022                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2023                          *
2024                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2025                          * owned by the service itself.
2026                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2027                          * files or sockets with other services. */
2028
2029                         private_root = strjoin(params->prefix[type], "/private");
2030                         if (!private_root) {
2031                                 r = -ENOMEM;
2032                                 goto fail;
2033                         }
2034
2035                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2036                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2037                         if (r < 0)
2038                                 goto fail;
2039
2040                         pp = strjoin(private_root, "/", *rt);
2041                         if (!pp) {
2042                                 r = -ENOMEM;
2043                                 goto fail;
2044                         }
2045
2046                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2047                         r = mkdir_parents_label(pp, 0755);
2048                         if (r < 0)
2049                                 goto fail;
2050
2051                         if (is_dir(p, false) > 0 &&
2052                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2053
2054                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2055                                  * it over. Most likely the service has been upgraded from one that didn't use
2056                                  * DynamicUser=1, to one that does. */
2057
2058                                 if (rename(p, pp) < 0) {
2059                                         r = -errno;
2060                                         goto fail;
2061                                 }
2062                         } else {
2063                                 /* Otherwise, create the actual directory for the service */
2064
2065                                 r = mkdir_label(pp, context->directories[type].mode);
2066                                 if (r < 0 && r != -EEXIST)
2067                                         goto fail;
2068                         }
2069
2070                         parent = dirname_malloc(p);
2071                         if (!parent) {
2072                                 r = -ENOMEM;
2073                                 goto fail;
2074                         }
2075
2076                         r = path_make_relative(parent, pp, &relative);
2077                         if (r < 0)
2078                                 goto fail;
2079
2080                         /* And link it up from the original place */
2081                         r = symlink_idempotent(relative, p);
2082                         if (r < 0)
2083                                 goto fail;
2084
2085                         /* Lock down the access mode */
2086                         if (chmod(pp, context->directories[type].mode) < 0) {
2087                                 r = -errno;
2088                                 goto fail;
2089                         }
2090                 } else {
2091                         r = mkdir_label(p, context->directories[type].mode);
2092                         if (r < 0 && r != -EEXIST)
2093                                 goto fail;
2094                         if (r == -EEXIST && !context->dynamic_user)
2095                                 continue;
2096                 }
2097
2098                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2099                  * a service, and shall not be writable. */
2100                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2101                         continue;
2102
2103                 /* Then, change the ownership of the whole tree, if necessary */
2104                 r = path_chown_recursive(pp ?: p, uid, gid);
2105                 if (r < 0)
2106                         goto fail;
2107         }
2108
2109         return 0;
2110
2111 fail:
2112         *exit_status = exit_status_table[type];
2113         return r;
2114 }
2115
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label it is used. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from the command's
 * binary is used, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read.
 *
 * Returns 0 on success, a negative errno-style error on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r = 0;

        assert(context);
        assert(command);

        if (context->smack_process_label)
                r = mac_smack_apply_pid(0, context->smack_process_label);
#ifdef SMACK_DEFAULT_PROCESS_LABEL
        else {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing support for it is fine, we use the default label then */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
        }
#endif

        return r < 0 ? r : 0;
}
#endif
2148
2149 static int compile_bind_mounts(
2150                 const ExecContext *context,
2151                 const ExecParameters *params,
2152                 BindMount **ret_bind_mounts,
2153                 size_t *ret_n_bind_mounts,
2154                 char ***ret_empty_directories) {
2155
2156         _cleanup_strv_free_ char **empty_directories = NULL;
2157         BindMount *bind_mounts;
2158         size_t n, h = 0, i;
2159         ExecDirectoryType t;
2160         int r;
2161
2162         assert(context);
2163         assert(params);
2164         assert(ret_bind_mounts);
2165         assert(ret_n_bind_mounts);
2166         assert(ret_empty_directories);
2167
2168         n = context->n_bind_mounts;
2169         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2170                 if (!params->prefix[t])
2171                         continue;
2172
2173                 n += strv_length(context->directories[t].paths);
2174         }
2175
2176         if (n <= 0) {
2177                 *ret_bind_mounts = NULL;
2178                 *ret_n_bind_mounts = 0;
2179                 *ret_empty_directories = NULL;
2180                 return 0;
2181         }
2182
2183         bind_mounts = new(BindMount, n);
2184         if (!bind_mounts)
2185                 return -ENOMEM;
2186
2187         for (i = 0; i < context->n_bind_mounts; i++) {
2188                 BindMount *item = context->bind_mounts + i;
2189                 char *s, *d;
2190
2191                 s = strdup(item->source);
2192                 if (!s) {
2193                         r = -ENOMEM;
2194                         goto finish;
2195                 }
2196
2197                 d = strdup(item->destination);
2198                 if (!d) {
2199                         free(s);
2200                         r = -ENOMEM;
2201                         goto finish;
2202                 }
2203
2204                 bind_mounts[h++] = (BindMount) {
2205                         .source = s,
2206                         .destination = d,
2207                         .read_only = item->read_only,
2208                         .recursive = item->recursive,
2209                         .ignore_enoent = item->ignore_enoent,
2210                 };
2211         }
2212
2213         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2214                 char **suffix;
2215
2216                 if (!params->prefix[t])
2217                         continue;
2218
2219                 if (strv_isempty(context->directories[t].paths))
2220                         continue;
2221
2222                 if (context->dynamic_user &&
2223                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2224                         char *private_root;
2225
2226                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2227                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2228                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2229
2230                         private_root = strjoin(params->prefix[t], "/private");
2231                         if (!private_root) {
2232                                 r = -ENOMEM;
2233                                 goto finish;
2234                         }
2235
2236                         r = strv_consume(&empty_directories, private_root);
2237                         if (r < 0)
2238                                 goto finish;
2239                 }
2240
2241                 STRV_FOREACH(suffix, context->directories[t].paths) {
2242                         char *s, *d;
2243
2244                         if (context->dynamic_user &&
2245                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2246                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2247                         else
2248                                 s = strjoin(params->prefix[t], "/", *suffix);
2249                         if (!s) {
2250                                 r = -ENOMEM;
2251                                 goto finish;
2252                         }
2253
2254                         d = strdup(s);
2255                         if (!d) {
2256                                 free(s);
2257                                 r = -ENOMEM;
2258                                 goto finish;
2259                         }
2260
2261                         bind_mounts[h++] = (BindMount) {
2262                                 .source = s,
2263                                 .destination = d,
2264                                 .read_only = false,
2265                                 .recursive = true,
2266                                 .ignore_enoent = false,
2267                         };
2268                 }
2269         }
2270
2271         assert(h == n);
2272
2273         *ret_bind_mounts = bind_mounts;
2274         *ret_n_bind_mounts = n;
2275         *ret_empty_directories = TAKE_PTR(empty_directories);
2276
2277         return (int) n;
2278
2279 finish:
2280         bind_mount_free_many(bind_mounts, h);
2281         return r;
2282 }
2283
2284 static int apply_mount_namespace(
2285                 const Unit *u,
2286                 const ExecCommand *command,
2287                 const ExecContext *context,
2288                 const ExecParameters *params,
2289                 const ExecRuntime *runtime) {
2290
2291         _cleanup_strv_free_ char **empty_directories = NULL;
2292         char *tmp = NULL, *var = NULL;
2293         const char *root_dir = NULL, *root_image = NULL;
2294         NamespaceInfo ns_info = {};
2295         bool needs_sandboxing;
2296         BindMount *bind_mounts = NULL;
2297         size_t n_bind_mounts = 0;
2298         int r;
2299
2300         assert(context);
2301
2302         /* The runtime struct only contains the parent of the private /tmp,
2303          * which is non-accessible to world users. Inside of it there's a /tmp
2304          * that is sticky, and that's the one we want to use here. */
2305
2306         if (context->private_tmp && runtime) {
2307                 if (runtime->tmp_dir)
2308                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2309                 if (runtime->var_tmp_dir)
2310                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2311         }
2312
2313         if (params->flags & EXEC_APPLY_CHROOT) {
2314                 root_image = context->root_image;
2315
2316                 if (!root_image)
2317                         root_dir = context->root_directory;
2318         }
2319
2320         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2321         if (r < 0)
2322                 return r;
2323
2324         /*
2325          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2326          * sandbox info, otherwise enforce it, don't ignore protected paths and
2327          * fail if we are enable to apply the sandbox inside the mount namespace.
2328          */
2329         if (!context->dynamic_user && root_dir)
2330                 ns_info.ignore_protect_paths = true;
2331
2332         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2333
2334         if (needs_sandboxing)
2335                 ns_info = (NamespaceInfo) {
2336                         .ignore_protect_paths = false,
2337                         .private_dev = context->private_devices,
2338                         .protect_control_groups = context->protect_control_groups,
2339                         .protect_kernel_tunables = context->protect_kernel_tunables,
2340                         .protect_kernel_modules = context->protect_kernel_modules,
2341                         .mount_apivfs = context->mount_apivfs,
2342                 };
2343
2344         r = setup_namespace(root_dir, root_image,
2345                             &ns_info, context->read_write_paths,
2346                             needs_sandboxing ? context->read_only_paths : NULL,
2347                             needs_sandboxing ? context->inaccessible_paths : NULL,
2348                             empty_directories,
2349                             bind_mounts,
2350                             n_bind_mounts,
2351                             context->temporary_filesystems,
2352                             context->n_temporary_filesystems,
2353                             tmp,
2354                             var,
2355                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2356                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2357                             context->mount_flags,
2358                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2359
2360         bind_mount_free_many(bind_mounts, n_bind_mounts);
2361
2362         /* If we couldn't set up the namespace this is probably due to a
2363          * missing capability. In this case, silently proceeed. */
2364         if (IN_SET(r, -EPERM, -EACCES)) {
2365                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2366                 return 0;
2367         }
2368
2369         return r;
2370 }
2371
2372 static int apply_working_directory(
2373                 const ExecContext *context,
2374                 const ExecParameters *params,
2375                 const char *home,
2376                 const bool needs_mount_ns,
2377                 int *exit_status) {
2378
2379         const char *d, *wd;
2380
2381         assert(context);
2382         assert(exit_status);
2383
2384         if (context->working_directory_home) {
2385
2386                 if (!home) {
2387                         *exit_status = EXIT_CHDIR;
2388                         return -ENXIO;
2389                 }
2390
2391                 wd = home;
2392
2393         } else if (context->working_directory)
2394                 wd = context->working_directory;
2395         else
2396                 wd = "/";
2397
2398         if (params->flags & EXEC_APPLY_CHROOT) {
2399                 if (!needs_mount_ns && context->root_directory)
2400                         if (chroot(context->root_directory) < 0) {
2401                                 *exit_status = EXIT_CHROOT;
2402                                 return -errno;
2403                         }
2404
2405                 d = wd;
2406         } else
2407                 d = prefix_roota(context->root_directory, wd);
2408
2409         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2410                 *exit_status = EXIT_CHDIR;
2411                 return -errno;
2412         }
2413
2414         return 0;
2415 }
2416
2417 static int setup_keyring(
2418                 const Unit *u,
2419                 const ExecContext *context,
2420                 const ExecParameters *p,
2421                 uid_t uid, gid_t gid) {
2422
2423         key_serial_t keyring;
2424         int r = 0;
2425         uid_t saved_uid;
2426         gid_t saved_gid;
2427
2428         assert(u);
2429         assert(context);
2430         assert(p);
2431
2432         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2433          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2434          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2435          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2436          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2437          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2438
2439         if (!(p->flags & EXEC_NEW_KEYRING))
2440                 return 0;
2441
2442         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2443                 return 0;
2444
2445         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2446          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2447          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2448          * & group is just as nasty as acquiring a reference to the user keyring. */
2449
2450         saved_uid = getuid();
2451         saved_gid = getgid();
2452
2453         if (gid_is_valid(gid) && gid != saved_gid) {
2454                 if (setregid(gid, -1) < 0)
2455                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2456         }
2457
2458         if (uid_is_valid(uid) && uid != saved_uid) {
2459                 if (setreuid(uid, -1) < 0) {
2460                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2461                         goto out;
2462                 }
2463         }
2464
2465         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2466         if (keyring == -1) {
2467                 if (errno == ENOSYS)
2468                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2469                 else if (IN_SET(errno, EACCES, EPERM))
2470                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2471                 else if (errno == EDQUOT)
2472                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2473                 else
2474                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2475
2476                 goto out;
2477         }
2478
2479         /* When requested link the user keyring into the session keyring. */
2480         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2481
2482                 if (keyctl(KEYCTL_LINK,
2483                            KEY_SPEC_USER_KEYRING,
2484                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2485                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2486                         goto out;
2487                 }
2488         }
2489
2490         /* Restore uid/gid back */
2491         if (uid_is_valid(uid) && uid != saved_uid) {
2492                 if (setreuid(saved_uid, -1) < 0) {
2493                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2494                         goto out;
2495                 }
2496         }
2497
2498         if (gid_is_valid(gid) && gid != saved_gid) {
2499                 if (setregid(saved_gid, -1) < 0)
2500                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2501         }
2502
2503         /* Populate they keyring with the invocation ID by default, as original saved_uid. */
2504         if (!sd_id128_is_null(u->invocation_id)) {
2505                 key_serial_t key;
2506
2507                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2508                 if (key == -1)
2509                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2510                 else {
2511                         if (keyctl(KEYCTL_SETPERM, key,
2512                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2513                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2514                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2515                 }
2516         }
2517
2518 out:
2519         /* Revert back uid & gid for the last time, and exit */
2520         /* no extra logging, as only the first already reported error matters */
2521         if (getuid() != saved_uid)
2522                 (void) setreuid(saved_uid, -1);
2523
2524         if (getgid() != saved_gid)
2525                 (void) setregid(saved_gid, -1);
2526
2527         return r;
2528 }
2529
2530 static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
2531         assert(array);
2532         assert(n);
2533
2534         if (!pair)
2535                 return;
2536
2537         if (pair[0] >= 0)
2538                 array[(*n)++] = pair[0];
2539         if (pair[1] >= 0)
2540                 array[(*n)++] = pair[1];
2541 }
2542
2543 static int close_remaining_fds(
2544                 const ExecParameters *params,
2545                 const ExecRuntime *runtime,
2546                 const DynamicCreds *dcreds,
2547                 int user_lookup_fd,
2548                 int socket_fd,
2549                 int *fds, size_t n_fds) {
2550
2551         size_t n_dont_close = 0;
2552         int dont_close[n_fds + 12];
2553
2554         assert(params);
2555
2556         if (params->stdin_fd >= 0)
2557                 dont_close[n_dont_close++] = params->stdin_fd;
2558         if (params->stdout_fd >= 0)
2559                 dont_close[n_dont_close++] = params->stdout_fd;
2560         if (params->stderr_fd >= 0)
2561                 dont_close[n_dont_close++] = params->stderr_fd;
2562
2563         if (socket_fd >= 0)
2564                 dont_close[n_dont_close++] = socket_fd;
2565         if (n_fds > 0) {
2566                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2567                 n_dont_close += n_fds;
2568         }
2569
2570         if (runtime)
2571                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2572
2573         if (dcreds) {
2574                 if (dcreds->user)
2575                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2576                 if (dcreds->group)
2577                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2578         }
2579
2580         if (user_lookup_fd >= 0)
2581                 dont_close[n_dont_close++] = user_lookup_fd;
2582
2583         return close_all_fds(dont_close, n_dont_close);
2584 }
2585
2586 static int send_user_lookup(
2587                 Unit *unit,
2588                 int user_lookup_fd,
2589                 uid_t uid,
2590                 gid_t gid) {
2591
2592         assert(unit);
2593
2594         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2595          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2596          * specified. */
2597
2598         if (user_lookup_fd < 0)
2599                 return 0;
2600
2601         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2602                 return 0;
2603
2604         if (writev(user_lookup_fd,
2605                (struct iovec[]) {
2606                            IOVEC_INIT(&uid, sizeof(uid)),
2607                            IOVEC_INIT(&gid, sizeof(gid)),
2608                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2609                 return -errno;
2610
2611         return 0;
2612 }
2613
2614 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2615         int r;
2616
2617         assert(c);
2618         assert(home);
2619         assert(buf);
2620
2621         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2622
2623         if (*home)
2624                 return 0;
2625
2626         if (!c->working_directory_home)
2627                 return 0;
2628
2629         if (uid == 0) {
2630                 /* Hardcode /root as home directory for UID 0 */
2631                 *home = "/root";
2632                 return 1;
2633         }
2634
2635         r = get_home_dir(buf);
2636         if (r < 0)
2637                 return r;
2638
2639         *home = *buf;
2640         return 1;
2641 }
2642
2643 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2644         _cleanup_strv_free_ char ** list = NULL;
2645         ExecDirectoryType t;
2646         int r;
2647
2648         assert(c);
2649         assert(p);
2650         assert(ret);
2651
2652         assert(c->dynamic_user);
2653
2654         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2655          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2656          * directories. */
2657
2658         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2659                 char **i;
2660
2661                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2662                         continue;
2663
2664                 if (!p->prefix[t])
2665                         continue;
2666
2667                 STRV_FOREACH(i, c->directories[t].paths) {
2668                         char *e;
2669
2670                         if (t == EXEC_DIRECTORY_RUNTIME)
2671                                 e = strjoin(p->prefix[t], "/", *i);
2672                         else
2673                                 e = strjoin(p->prefix[t], "/private/", *i);
2674                         if (!e)
2675                                 return -ENOMEM;
2676
2677                         r = strv_consume(&list, e);
2678                         if (r < 0)
2679                                 return r;
2680                 }
2681         }
2682
2683         *ret = TAKE_PTR(list);
2684
2685         return 0;
2686 }
2687
/* Forward declaration: exec_child() below calls this to turn argv into the command line string that is passed
 * to ask_for_confirmation() when spawn confirmation is enabled for the unit. */
static char *exec_command_line(char **argv);
2689
2690 static int exec_child(
2691                 Unit *unit,
2692                 const ExecCommand *command,
2693                 const ExecContext *context,
2694                 const ExecParameters *params,
2695                 ExecRuntime *runtime,
2696                 DynamicCreds *dcreds,
2697                 char **argv,
2698                 int socket_fd,
2699                 int named_iofds[3],
2700                 int *fds,
2701                 size_t n_storage_fds,
2702                 size_t n_socket_fds,
2703                 char **files_env,
2704                 int user_lookup_fd,
2705                 int *exit_status) {
2706
2707         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2708         _cleanup_free_ char *home_buffer = NULL;
2709         _cleanup_free_ gid_t *supplementary_gids = NULL;
2710         const char *username = NULL, *groupname = NULL;
2711         const char *home = NULL, *shell = NULL;
2712         dev_t journal_stream_dev = 0;
2713         ino_t journal_stream_ino = 0;
2714         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2715                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2716                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2717                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2718 #if HAVE_SELINUX
2719         _cleanup_free_ char *mac_selinux_context_net = NULL;
2720         bool use_selinux = false;
2721 #endif
2722 #if ENABLE_SMACK
2723         bool use_smack = false;
2724 #endif
2725 #if HAVE_APPARMOR
2726         bool use_apparmor = false;
2727 #endif
2728         uid_t uid = UID_INVALID;
2729         gid_t gid = GID_INVALID;
2730         int r, ngids = 0;
2731         size_t n_fds;
2732         ExecDirectoryType dt;
2733         int secure_bits;
2734
2735         assert(unit);
2736         assert(command);
2737         assert(context);
2738         assert(params);
2739         assert(exit_status);
2740
2741         rename_process_from_path(command->path);
2742
2743         /* We reset exactly these signals, since they are the
2744          * only ones we set to SIG_IGN in the main daemon. All
2745          * others we leave untouched because we set them to
2746          * SIG_DFL or a valid handler initially, both of which
2747          * will be demoted to SIG_DFL. */
2748         (void) default_signals(SIGNALS_CRASH_HANDLER,
2749                                SIGNALS_IGNORE, -1);
2750
2751         if (context->ignore_sigpipe)
2752                 (void) ignore_signals(SIGPIPE, -1);
2753
2754         r = reset_signal_mask();
2755         if (r < 0) {
2756                 *exit_status = EXIT_SIGNAL_MASK;
2757                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2758         }
2759
2760         if (params->idle_pipe)
2761                 do_idle_pipe_dance(params->idle_pipe);
2762
2763         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2764          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2765          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2766          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2767
2768         log_forget_fds();
2769         log_set_open_when_needed(true);
2770
2771         /* In case anything used libc syslog(), close this here, too */
2772         closelog();
2773
2774         n_fds = n_storage_fds + n_socket_fds;
2775         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2776         if (r < 0) {
2777                 *exit_status = EXIT_FDS;
2778                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2779         }
2780
2781         if (!context->same_pgrp)
2782                 if (setsid() < 0) {
2783                         *exit_status = EXIT_SETSID;
2784                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2785                 }
2786
2787         exec_context_tty_reset(context, params);
2788
2789         if (unit_shall_confirm_spawn(unit)) {
2790                 const char *vc = params->confirm_spawn;
2791                 _cleanup_free_ char *cmdline = NULL;
2792
2793                 cmdline = exec_command_line(argv);
2794                 if (!cmdline) {
2795                         *exit_status = EXIT_MEMORY;
2796                         return log_oom();
2797                 }
2798
2799                 r = ask_for_confirmation(vc, unit, cmdline);
2800                 if (r != CONFIRM_EXECUTE) {
2801                         if (r == CONFIRM_PRETEND_SUCCESS) {
2802                                 *exit_status = EXIT_SUCCESS;
2803                                 return 0;
2804                         }
2805                         *exit_status = EXIT_CONFIRM;
2806                         log_unit_error(unit, "Execution cancelled by the user");
2807                         return -ECANCELED;
2808                 }
2809         }
2810
2811         if (context->dynamic_user && dcreds) {
2812                 _cleanup_strv_free_ char **suggested_paths = NULL;
2813
2814                 /* Make sure we bypass our own NSS module for any NSS checks */
2815                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2816                         *exit_status = EXIT_USER;
2817                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2818                 }
2819
2820                 r = compile_suggested_paths(context, params, &suggested_paths);
2821                 if (r < 0) {
2822                         *exit_status = EXIT_MEMORY;
2823                         return log_oom();
2824                 }
2825
2826                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2827                 if (r < 0) {
2828                         *exit_status = EXIT_USER;
2829                         if (r == -EILSEQ) {
2830                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2831                                 return -EOPNOTSUPP;
2832                         }
2833                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2834                 }
2835
2836                 if (!uid_is_valid(uid)) {
2837                         *exit_status = EXIT_USER;
2838                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2839                         return -ESRCH;
2840                 }
2841
2842                 if (!gid_is_valid(gid)) {
2843                         *exit_status = EXIT_USER;
2844                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2845                         return -ESRCH;
2846                 }
2847
2848                 if (dcreds->user)
2849                         username = dcreds->user->name;
2850
2851         } else {
2852                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2853                 if (r < 0) {
2854                         *exit_status = EXIT_USER;
2855                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2856                 }
2857
2858                 r = get_fixed_group(context, &groupname, &gid);
2859                 if (r < 0) {
2860                         *exit_status = EXIT_GROUP;
2861                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2862                 }
2863         }
2864
2865         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2866         r = get_supplementary_groups(context, username, groupname, gid,
2867                                      &supplementary_gids, &ngids);
2868         if (r < 0) {
2869                 *exit_status = EXIT_GROUP;
2870                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2871         }
2872
2873         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2874         if (r < 0) {
2875                 *exit_status = EXIT_USER;
2876                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2877         }
2878
2879         user_lookup_fd = safe_close(user_lookup_fd);
2880
2881         r = acquire_home(context, uid, &home, &home_buffer);
2882         if (r < 0) {
2883                 *exit_status = EXIT_CHDIR;
2884                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2885         }
2886
2887         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2888          * must be sure to drop O_NONBLOCK */
2889         if (socket_fd >= 0)
2890                 (void) fd_nonblock(socket_fd, false);
2891
2892         r = setup_input(context, params, socket_fd, named_iofds);
2893         if (r < 0) {
2894                 *exit_status = EXIT_STDIN;
2895                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2896         }
2897
2898         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2899         if (r < 0) {
2900                 *exit_status = EXIT_STDOUT;
2901                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2902         }
2903
2904         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2905         if (r < 0) {
2906                 *exit_status = EXIT_STDERR;
2907                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2908         }
2909
2910         if (params->cgroup_path) {
2911                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2912                 if (r < 0) {
2913                         *exit_status = EXIT_CGROUP;
2914                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2915                 }
2916         }
2917
2918         if (context->oom_score_adjust_set) {
2919                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
2920                  * prohibit write access to this file, and we shouldn't trip up over that. */
2921                 r = set_oom_score_adjust(context->oom_score_adjust);
2922                 if (IN_SET(r, -EPERM, -EACCES))
2923                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2924                 else if (r < 0) {
2925                         *exit_status = EXIT_OOM_ADJUST;
2926                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2927                 }
2928         }
2929
2930         if (context->nice_set)
2931                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2932                         *exit_status = EXIT_NICE;
2933                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2934                 }
2935
2936         if (context->cpu_sched_set) {
2937                 struct sched_param param = {
2938                         .sched_priority = context->cpu_sched_priority,
2939                 };
2940
2941                 r = sched_setscheduler(0,
2942                                        context->cpu_sched_policy |
2943                                        (context->cpu_sched_reset_on_fork ?
2944                                         SCHED_RESET_ON_FORK : 0),
2945                                        &param);
2946                 if (r < 0) {
2947                         *exit_status = EXIT_SETSCHEDULER;
2948                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2949                 }
2950         }
2951
2952         if (context->cpuset)
2953                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2954                         *exit_status = EXIT_CPUAFFINITY;
2955                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2956                 }
2957
2958         if (context->ioprio_set)
2959                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2960                         *exit_status = EXIT_IOPRIO;
2961                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2962                 }
2963
2964         if (context->timer_slack_nsec != NSEC_INFINITY)
2965                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2966                         *exit_status = EXIT_TIMERSLACK;
2967                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2968                 }
2969
2970         if (context->personality != PERSONALITY_INVALID) {
2971                 r = safe_personality(context->personality);
2972                 if (r < 0) {
2973                         *exit_status = EXIT_PERSONALITY;
2974                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2975                 }
2976         }
2977
2978         if (context->utmp_id)
2979                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2980                                       context->tty_path,
2981                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
2982                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2983                                       USER_PROCESS,
2984                                       username);
2985
2986         if (context->user) {
2987                 r = chown_terminal(STDIN_FILENO, uid);
2988                 if (r < 0) {
2989                         *exit_status = EXIT_STDIN;
2990                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
2991                 }
2992         }
2993
2994         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
2995          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
2996          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
2997          * touch a single hierarchy too. */
2998         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2999                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3000                 if (r < 0) {
3001                         *exit_status = EXIT_CGROUP;
3002                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3003                 }
3004         }
3005
3006         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3007                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3008                 if (r < 0)
3009                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3010         }
3011
3012         r = build_environment(
3013                         unit,
3014                         context,
3015                         params,
3016                         n_fds,
3017                         home,
3018                         username,
3019                         shell,
3020                         journal_stream_dev,
3021                         journal_stream_ino,
3022                         &our_env);
3023         if (r < 0) {
3024                 *exit_status = EXIT_MEMORY;
3025                 return log_oom();
3026         }
3027
3028         r = build_pass_environment(context, &pass_env);
3029         if (r < 0) {
3030                 *exit_status = EXIT_MEMORY;
3031                 return log_oom();
3032         }
3033
3034         accum_env = strv_env_merge(5,
3035                                    params->environment,
3036                                    our_env,
3037                                    pass_env,
3038                                    context->environment,
3039                                    files_env,
3040                                    NULL);
3041         if (!accum_env) {
3042                 *exit_status = EXIT_MEMORY;
3043                 return log_oom();
3044         }
3045         accum_env = strv_env_clean(accum_env);
3046
3047         (void) umask(context->umask);
3048
3049         r = setup_keyring(unit, context, params, uid, gid);
3050         if (r < 0) {
3051                 *exit_status = EXIT_KEYRING;
3052                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3053         }
3054
3055         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3056         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3057
3058         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3059         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3060
3061         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3062         if (needs_ambient_hack)
3063                 needs_setuid = false;
3064         else
3065                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3066
3067         if (needs_sandboxing) {
3068                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3069                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3070                  * impacting our own code paths. */
3071
3072 #if HAVE_SELINUX
3073                 use_selinux = mac_selinux_use();
3074 #endif
3075 #if ENABLE_SMACK
3076                 use_smack = mac_smack_use();
3077 #endif
3078 #if HAVE_APPARMOR
3079                 use_apparmor = mac_apparmor_use();
3080 #endif
3081         }
3082
3083         if (needs_setuid) {
3084                 if (context->pam_name && username) {
3085                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3086                         if (r < 0) {
3087                                 *exit_status = EXIT_PAM;
3088                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3089                         }
3090                 }
3091         }
3092
3093         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3094                 if (ns_type_supported(NAMESPACE_NET)) {
3095                         r = setup_netns(runtime->netns_storage_socket);
3096                         if (r < 0) {
3097                                 *exit_status = EXIT_NETWORK;
3098                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3099                         }
3100                 } else
3101                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3102         }
3103
3104         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3105         if (needs_mount_namespace) {
3106                 r = apply_mount_namespace(unit, command, context, params, runtime);
3107                 if (r < 0) {
3108                         *exit_status = EXIT_NAMESPACE;
3109                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3110                 }
3111         }
3112
3113         /* Apply just after mount namespace setup */
3114         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3115         if (r < 0)
3116                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3117
3118         /* Drop groups as early as possible */
3119         if (needs_setuid) {
3120                 r = enforce_groups(gid, supplementary_gids, ngids);
3121                 if (r < 0) {
3122                         *exit_status = EXIT_GROUP;
3123                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3124                 }
3125         }
3126
3127         if (needs_sandboxing) {
3128 #if HAVE_SELINUX
3129                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3130                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3131                         if (r < 0) {
3132                                 *exit_status = EXIT_SELINUX_CONTEXT;
3133                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3134                         }
3135                 }
3136 #endif
3137
3138                 if (context->private_users) {
3139                         r = setup_private_users(uid, gid);
3140                         if (r < 0) {
3141                                 *exit_status = EXIT_USER;
3142                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3143                         }
3144                 }
3145         }
3146
3147         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3148          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3149          * was needed to upload the policy and can now be closed as well. */
3150         r = close_all_fds(fds, n_fds);
3151         if (r >= 0)
3152                 r = shift_fds(fds, n_fds);
3153         if (r >= 0)
3154                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3155         if (r < 0) {
3156                 *exit_status = EXIT_FDS;
3157                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3158         }
3159
3160         secure_bits = context->secure_bits;
3161
3162         if (needs_sandboxing) {
3163                 uint64_t bset;
3164                 int which_failed;
3165
3166                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3167                 if (r < 0) {
3168                         *exit_status = EXIT_LIMITS;
3169                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3170                 }
3171
3172                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3173                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3174                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3175                                 *exit_status = EXIT_LIMITS;
3176                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3177                         }
3178                 }
3179
3180 #if ENABLE_SMACK
3181                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3182                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3183                 if (use_smack) {
3184                         r = setup_smack(context, command);
3185                         if (r < 0) {
3186                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3187                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3188                         }
3189                 }
3190 #endif
3191
3192                 bset = context->capability_bounding_set;
3193                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3194                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3195                  * instead of us doing that */
3196                 if (needs_ambient_hack)
3197                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3198                                 (UINT64_C(1) << CAP_SETUID) |
3199                                 (UINT64_C(1) << CAP_SETGID);
3200
3201                 if (!cap_test_all(bset)) {
3202                         r = capability_bounding_set_drop(bset, false);
3203                         if (r < 0) {
3204                                 *exit_status = EXIT_CAPABILITIES;
3205                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3206                         }
3207                 }
3208
3209                 /* This is done before enforce_user, but ambient set
3210                  * does not survive over setresuid() if keep_caps is not set. */
3211                 if (!needs_ambient_hack &&
3212                     context->capability_ambient_set != 0) {
3213                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3214                         if (r < 0) {
3215                                 *exit_status = EXIT_CAPABILITIES;
3216                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3217                         }
3218                 }
3219         }
3220
3221         if (needs_setuid) {
3222                 if (context->user) {
3223                         r = enforce_user(context, uid);
3224                         if (r < 0) {
3225                                 *exit_status = EXIT_USER;
3226                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3227                         }
3228
3229                         if (!needs_ambient_hack &&
3230                             context->capability_ambient_set != 0) {
3231
3232                                 /* Fix the ambient capabilities after user change. */
3233                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3234                                 if (r < 0) {
3235                                         *exit_status = EXIT_CAPABILITIES;
3236                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3237                                 }
3238
3239                                 /* If we were asked to change user and ambient capabilities
3240                                  * were requested, we had to add keep-caps to the securebits
3241                                  * so that we would maintain the inherited capability set
3242                                  * through the setresuid(). Make sure that the bit is added
3243                                  * also to the context secure_bits so that we don't try to
3244                                  * drop the bit away next. */
3245
3246                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3247                         }
3248                 }
3249         }
3250
3251         if (needs_sandboxing) {
3252                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3253                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3254                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3255                  * are restricted. */
3256
3257 #if HAVE_SELINUX
3258                 if (use_selinux) {
3259                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3260
3261                         if (exec_context) {
3262                                 r = setexeccon(exec_context);
3263                                 if (r < 0) {
3264                                         *exit_status = EXIT_SELINUX_CONTEXT;
3265                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3266                                 }
3267                         }
3268                 }
3269 #endif
3270
3271 #if HAVE_APPARMOR
3272                 if (use_apparmor && context->apparmor_profile) {
3273                         r = aa_change_onexec(context->apparmor_profile);
3274                         if (r < 0 && !context->apparmor_profile_ignore) {
3275                                 *exit_status = EXIT_APPARMOR_PROFILE;
3276                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3277                         }
3278                 }
3279 #endif
3280
3281                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3282                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3283                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3284                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3285                                 *exit_status = EXIT_SECUREBITS;
3286                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3287                         }
3288
3289                 if (context_has_no_new_privileges(context))
3290                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3291                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3292                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3293                         }
3294
3295 #if HAVE_SECCOMP
3296                 r = apply_address_families(unit, context);
3297                 if (r < 0) {
3298                         *exit_status = EXIT_ADDRESS_FAMILIES;
3299                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3300                 }
3301
3302                 r = apply_memory_deny_write_execute(unit, context);
3303                 if (r < 0) {
3304                         *exit_status = EXIT_SECCOMP;
3305                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3306                 }
3307
3308                 r = apply_restrict_realtime(unit, context);
3309                 if (r < 0) {
3310                         *exit_status = EXIT_SECCOMP;
3311                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3312                 }
3313
3314                 r = apply_restrict_namespaces(unit, context);
3315                 if (r < 0) {
3316                         *exit_status = EXIT_SECCOMP;
3317                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3318                 }
3319
3320                 r = apply_protect_sysctl(unit, context);
3321                 if (r < 0) {
3322                         *exit_status = EXIT_SECCOMP;
3323                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3324                 }
3325
3326                 r = apply_protect_kernel_modules(unit, context);
3327                 if (r < 0) {
3328                         *exit_status = EXIT_SECCOMP;
3329                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3330                 }
3331
3332                 r = apply_private_devices(unit, context);
3333                 if (r < 0) {
3334                         *exit_status = EXIT_SECCOMP;
3335                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3336                 }
3337
3338                 r = apply_syscall_archs(unit, context);
3339                 if (r < 0) {
3340                         *exit_status = EXIT_SECCOMP;
3341                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3342                 }
3343
3344                 r = apply_lock_personality(unit, context);
3345                 if (r < 0) {
3346                         *exit_status = EXIT_SECCOMP;
3347                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3348                 }
3349
3350                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3351                  * by the filter as little as possible. */
3352                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3353                 if (r < 0) {
3354                         *exit_status = EXIT_SECCOMP;
3355                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3356                 }
3357 #endif
3358         }
3359
3360         if (!strv_isempty(context->unset_environment)) {
3361                 char **ee = NULL;
3362
3363                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3364                 if (!ee) {
3365                         *exit_status = EXIT_MEMORY;
3366                         return log_oom();
3367                 }
3368
3369                 strv_free_and_replace(accum_env, ee);
3370         }
3371
3372         final_argv = replace_env_argv(argv, accum_env);
3373         if (!final_argv) {
3374                 *exit_status = EXIT_MEMORY;
3375                 return log_oom();
3376         }
3377
3378         if (DEBUG_LOGGING) {
3379                 _cleanup_free_ char *line;
3380
3381                 line = exec_command_line(final_argv);
3382                 if (line) {
3383                         log_struct(LOG_DEBUG,
3384                                    "EXECUTABLE=%s", command->path,
3385                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3386                                    LOG_UNIT_ID(unit),
3387                                    LOG_UNIT_INVOCATION_ID(unit),
3388                                    NULL);
3389                 }
3390         }
3391
3392         execve(command->path, final_argv, accum_env);
3393
3394         if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3395
3396                 log_struct_errno(LOG_INFO, errno,
3397                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3398                                  LOG_UNIT_ID(unit),
3399                                  LOG_UNIT_INVOCATION_ID(unit),
3400                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3401                                                   command->path),
3402                                  "EXECUTABLE=%s", command->path,
3403                                  NULL);
3404
3405                 return 0;
3406         }
3407
3408         *exit_status = EXIT_EXEC;
3409         return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3410 }
3411
3412 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3413 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3414
3415 int exec_spawn(Unit *unit,
3416                ExecCommand *command,
3417                const ExecContext *context,
3418                const ExecParameters *params,
3419                ExecRuntime *runtime,
3420                DynamicCreds *dcreds,
3421                pid_t *ret) {
3422
3423         _cleanup_strv_free_ char **files_env = NULL;
3424         int *fds = NULL;
3425         size_t n_storage_fds = 0, n_socket_fds = 0;
3426         _cleanup_free_ char *line = NULL;
3427         int socket_fd, r;
3428         int named_iofds[3] = { -1, -1, -1 };
3429         char **argv;
3430         pid_t pid;
3431
3432         assert(unit);
3433         assert(command);
3434         assert(context);
3435         assert(ret);
3436         assert(params);
3437         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3438
3439         if (context->std_input == EXEC_INPUT_SOCKET ||
3440             context->std_output == EXEC_OUTPUT_SOCKET ||
3441             context->std_error == EXEC_OUTPUT_SOCKET) {
3442
3443                 if (params->n_socket_fds > 1) {
3444                         log_unit_error(unit, "Got more than one socket.");
3445                         return -EINVAL;
3446                 }
3447
3448                 if (params->n_socket_fds == 0) {
3449                         log_unit_error(unit, "Got no socket.");
3450                         return -EINVAL;
3451                 }
3452
3453                 socket_fd = params->fds[0];
3454         } else {
3455                 socket_fd = -1;
3456                 fds = params->fds;
3457                 n_storage_fds = params->n_storage_fds;
3458                 n_socket_fds = params->n_socket_fds;
3459         }
3460
3461         r = exec_context_named_iofds(context, params, named_iofds);
3462         if (r < 0)
3463                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3464
3465         r = exec_context_load_environment(unit, context, &files_env);
3466         if (r < 0)
3467                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3468
3469         argv = params->argv ?: command->argv;
3470         line = exec_command_line(argv);
3471         if (!line)
3472                 return log_oom();
3473
3474         log_struct(LOG_DEBUG,
3475                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3476                    "EXECUTABLE=%s", command->path,
3477                    LOG_UNIT_ID(unit),
3478                    LOG_UNIT_INVOCATION_ID(unit),
3479                    NULL);
3480
3481         pid = fork();
3482         if (pid < 0)
3483                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3484
3485         if (pid == 0) {
3486                 int exit_status = EXIT_SUCCESS;
3487
3488                 r = exec_child(unit,
3489                                command,
3490                                context,
3491                                params,
3492                                runtime,
3493                                dcreds,
3494                                argv,
3495                                socket_fd,
3496                                named_iofds,
3497                                fds,
3498                                n_storage_fds,
3499                                n_socket_fds,
3500                                files_env,
3501                                unit->manager->user_lookup_fds[1],
3502                                &exit_status);
3503
3504                 if (r < 0) {
3505                         log_struct_errno(LOG_ERR, r,
3506                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3507                                          LOG_UNIT_ID(unit),
3508                                          LOG_UNIT_INVOCATION_ID(unit),
3509                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3510                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3511                                                           command->path),
3512                                          "EXECUTABLE=%s", command->path,
3513                                          NULL);
3514                 }
3515
3516                 _exit(exit_status);
3517         }
3518
3519         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3520
3521         /* We add the new process to the cgroup both in the child (so
3522          * that we can be sure that no user code is ever executed
3523          * outside of the cgroup) and in the parent (so that we can be
3524          * sure that when we kill the cgroup the process will be
3525          * killed too). */
3526         if (params->cgroup_path)
3527                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3528
3529         exec_status_start(&command->exec_status, pid);
3530
3531         *ret = pid;
3532         return 0;
3533 }
3534
3535 void exec_context_init(ExecContext *c) {
3536         ExecDirectoryType i;
3537
3538         assert(c);
3539
3540         c->umask = 0022;
3541         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3542         c->cpu_sched_policy = SCHED_OTHER;
3543         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3544         c->syslog_level_prefix = true;
3545         c->ignore_sigpipe = true;
3546         c->timer_slack_nsec = NSEC_INFINITY;
3547         c->personality = PERSONALITY_INVALID;
3548         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3549                 c->directories[i].mode = 0755;
3550         c->capability_bounding_set = CAP_ALL;
3551         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3552         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3553         c->log_level_max = -1;
3554 }
3555
3556 void exec_context_done(ExecContext *c) {
3557         ExecDirectoryType i;
3558         size_t l;
3559
3560         assert(c);
3561
3562         c->environment = strv_free(c->environment);
3563         c->environment_files = strv_free(c->environment_files);
3564         c->pass_environment = strv_free(c->pass_environment);
3565         c->unset_environment = strv_free(c->unset_environment);
3566
3567         rlimit_free_all(c->rlimit);
3568
3569         for (l = 0; l < 3; l++) {
3570                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3571                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3572         }
3573
3574         c->working_directory = mfree(c->working_directory);
3575         c->root_directory = mfree(c->root_directory);
3576         c->root_image = mfree(c->root_image);
3577         c->tty_path = mfree(c->tty_path);
3578         c->syslog_identifier = mfree(c->syslog_identifier);
3579         c->user = mfree(c->user);
3580         c->group = mfree(c->group);
3581
3582         c->supplementary_groups = strv_free(c->supplementary_groups);
3583
3584         c->pam_name = mfree(c->pam_name);
3585
3586         c->read_only_paths = strv_free(c->read_only_paths);
3587         c->read_write_paths = strv_free(c->read_write_paths);
3588         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3589
3590         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3591         c->bind_mounts = NULL;
3592         c->n_bind_mounts = 0;
3593         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3594         c->temporary_filesystems = NULL;
3595         c->n_temporary_filesystems = 0;
3596
3597         c->cpuset = cpu_set_mfree(c->cpuset);
3598
3599         c->utmp_id = mfree(c->utmp_id);
3600         c->selinux_context = mfree(c->selinux_context);
3601         c->apparmor_profile = mfree(c->apparmor_profile);
3602         c->smack_process_label = mfree(c->smack_process_label);
3603
3604         c->syscall_filter = hashmap_free(c->syscall_filter);
3605         c->syscall_archs = set_free(c->syscall_archs);
3606         c->address_families = set_free(c->address_families);
3607
3608         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3609                 c->directories[i].paths = strv_free(c->directories[i].paths);
3610
3611         c->log_level_max = -1;
3612
3613         exec_context_free_log_extra_fields(c);
3614
3615         c->stdin_data = mfree(c->stdin_data);
3616         c->stdin_data_size = 0;
3617 }
3618
3619 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3620         char **i;
3621
3622         assert(c);
3623
3624         if (!runtime_prefix)
3625                 return 0;
3626
3627         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3628                 _cleanup_free_ char *p;
3629
3630                 p = strjoin(runtime_prefix, "/", *i);
3631                 if (!p)
3632                         return -ENOMEM;
3633
3634                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3635                  * next. */
3636                 (void) rm_rf(p, REMOVE_ROOT);
3637         }
3638
3639         return 0;
3640 }
3641
3642 static void exec_command_done(ExecCommand *c) {
3643         assert(c);
3644
3645         c->path = mfree(c->path);
3646
3647         c->argv = strv_free(c->argv);
3648 }
3649
3650 void exec_command_done_array(ExecCommand *c, size_t n) {
3651         size_t i;
3652
3653         for (i = 0; i < n; i++)
3654                 exec_command_done(c+i);
3655 }
3656
3657 ExecCommand* exec_command_free_list(ExecCommand *c) {
3658         ExecCommand *i;
3659
3660         while ((i = c)) {
3661                 LIST_REMOVE(command, c, i);
3662                 exec_command_done(i);
3663                 free(i);
3664         }
3665
3666         return NULL;
3667 }
3668
3669 void exec_command_free_array(ExecCommand **c, size_t n) {
3670         size_t i;
3671
3672         for (i = 0; i < n; i++)
3673                 c[i] = exec_command_free_list(c[i]);
3674 }
3675
3676 typedef struct InvalidEnvInfo {
3677         const Unit *unit;
3678         const char *path;
3679 } InvalidEnvInfo;
3680
3681 static void invalid_env(const char *p, void *userdata) {
3682         InvalidEnvInfo *info = userdata;
3683
3684         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3685 }
3686
3687 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3688         assert(c);
3689
3690         switch (fd_index) {
3691
3692         case STDIN_FILENO:
3693                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3694                         return NULL;
3695
3696                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3697
3698         case STDOUT_FILENO:
3699                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3700                         return NULL;
3701
3702                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3703
3704         case STDERR_FILENO:
3705                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3706                         return NULL;
3707
3708                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3709
3710         default:
3711                 return NULL;
3712         }
3713 }
3714
3715 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3716         size_t i, targets;
3717         const char* stdio_fdname[3];
3718         size_t n_fds;
3719
3720         assert(c);
3721         assert(p);
3722
3723         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3724                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3725                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3726
3727         for (i = 0; i < 3; i++)
3728                 stdio_fdname[i] = exec_context_fdname(c, i);
3729
3730         n_fds = p->n_storage_fds + p->n_socket_fds;
3731
3732         for (i = 0; i < n_fds  && targets > 0; i++)
3733                 if (named_iofds[STDIN_FILENO] < 0 &&
3734                     c->std_input == EXEC_INPUT_NAMED_FD &&
3735                     stdio_fdname[STDIN_FILENO] &&
3736                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3737
3738                         named_iofds[STDIN_FILENO] = p->fds[i];
3739                         targets--;
3740
3741                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3742                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3743                            stdio_fdname[STDOUT_FILENO] &&
3744                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3745
3746                         named_iofds[STDOUT_FILENO] = p->fds[i];
3747                         targets--;
3748
3749                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3750                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3751                            stdio_fdname[STDERR_FILENO] &&
3752                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3753
3754                         named_iofds[STDERR_FILENO] = p->fds[i];
3755                         targets--;
3756                 }
3757
3758         return targets == 0 ? 0 : -ENOENT;
3759 }
3760
3761 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3762         char **i, **r = NULL;
3763
3764         assert(c);
3765         assert(l);
3766
3767         STRV_FOREACH(i, c->environment_files) {
3768                 char *fn;
3769                 int k;
3770                 unsigned n;
3771                 bool ignore = false;
3772                 char **p;
3773                 _cleanup_globfree_ glob_t pglob = {};
3774
3775                 fn = *i;
3776
3777                 if (fn[0] == '-') {
3778                         ignore = true;
3779                         fn++;
3780                 }
3781
3782                 if (!path_is_absolute(fn)) {
3783                         if (ignore)
3784                                 continue;
3785
3786                         strv_free(r);
3787                         return -EINVAL;
3788                 }
3789
3790                 /* Filename supports globbing, take all matching files */
3791                 k = safe_glob(fn, 0, &pglob);
3792                 if (k < 0) {
3793                         if (ignore)
3794                                 continue;
3795
3796                         strv_free(r);
3797                         return k;
3798                 }
3799
3800                 /* When we don't match anything, -ENOENT should be returned */
3801                 assert(pglob.gl_pathc > 0);
3802
3803                 for (n = 0; n < pglob.gl_pathc; n++) {
3804                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3805                         if (k < 0) {
3806                                 if (ignore)
3807                                         continue;
3808
3809                                 strv_free(r);
3810                                 return k;
3811                         }
3812                         /* Log invalid environment variables with filename */
3813                         if (p) {
3814                                 InvalidEnvInfo info = {
3815                                         .unit = unit,
3816                                         .path = pglob.gl_pathv[n]
3817                                 };
3818
3819                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3820                         }
3821
3822                         if (!r)
3823                                 r = p;
3824                         else {
3825                                 char **m;
3826
3827                                 m = strv_env_merge(2, r, p);
3828                                 strv_free(r);
3829                                 strv_free(p);
3830                                 if (!m)
3831                                         return -ENOMEM;
3832
3833                                 r = m;
3834                         }
3835                 }
3836         }
3837
3838         *l = r;
3839
3840         return 0;
3841 }
3842
3843 static bool tty_may_match_dev_console(const char *tty) {
3844         _cleanup_free_ char *resolved = NULL;
3845
3846         if (!tty)
3847                 return true;
3848
3849         tty = skip_dev_prefix(tty);
3850
3851         /* trivial identity? */
3852         if (streq(tty, "console"))
3853                 return true;
3854
3855         if (resolve_dev_console(&resolved) < 0)
3856                 return true; /* if we could not resolve, assume it may */
3857
3858         /* "tty0" means the active VC, so it may be the same sometimes */
3859         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3860 }
3861
3862 bool exec_context_may_touch_console(const ExecContext *ec) {
3863
3864         return (ec->tty_reset ||
3865                 ec->tty_vhangup ||
3866                 ec->tty_vt_disallocate ||
3867                 is_terminal_input(ec->std_input) ||
3868                 is_terminal_output(ec->std_output) ||
3869                 is_terminal_output(ec->std_error)) &&
3870                tty_may_match_dev_console(exec_context_tty_path(ec));
3871 }
3872
3873 static void strv_fprintf(FILE *f, char **l) {
3874         char **g;
3875
3876         assert(f);
3877
3878         STRV_FOREACH(g, l)
3879                 fprintf(f, " %s", *g);
3880 }
3881
3882 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3883         ExecDirectoryType dt;
3884         char **e, **d;
3885         unsigned i;
3886         int r;
3887
3888         assert(c);
3889         assert(f);
3890
3891         prefix = strempty(prefix);
3892
3893         fprintf(f,
3894                 "%sUMask: %04o\n"
3895                 "%sWorkingDirectory: %s\n"
3896                 "%sRootDirectory: %s\n"
3897                 "%sNonBlocking: %s\n"
3898                 "%sPrivateTmp: %s\n"
3899                 "%sPrivateDevices: %s\n"
3900                 "%sProtectKernelTunables: %s\n"
3901                 "%sProtectKernelModules: %s\n"
3902                 "%sProtectControlGroups: %s\n"
3903                 "%sPrivateNetwork: %s\n"
3904                 "%sPrivateUsers: %s\n"
3905                 "%sProtectHome: %s\n"
3906                 "%sProtectSystem: %s\n"
3907                 "%sMountAPIVFS: %s\n"
3908                 "%sIgnoreSIGPIPE: %s\n"
3909                 "%sMemoryDenyWriteExecute: %s\n"
3910                 "%sRestrictRealtime: %s\n"
3911                 "%sKeyringMode: %s\n",
3912                 prefix, c->umask,
3913                 prefix, c->working_directory ? c->working_directory : "/",
3914                 prefix, c->root_directory ? c->root_directory : "/",
3915                 prefix, yes_no(c->non_blocking),
3916                 prefix, yes_no(c->private_tmp),
3917                 prefix, yes_no(c->private_devices),
3918                 prefix, yes_no(c->protect_kernel_tunables),
3919                 prefix, yes_no(c->protect_kernel_modules),
3920                 prefix, yes_no(c->protect_control_groups),
3921                 prefix, yes_no(c->private_network),
3922                 prefix, yes_no(c->private_users),
3923                 prefix, protect_home_to_string(c->protect_home),
3924                 prefix, protect_system_to_string(c->protect_system),
3925                 prefix, yes_no(c->mount_apivfs),
3926                 prefix, yes_no(c->ignore_sigpipe),
3927                 prefix, yes_no(c->memory_deny_write_execute),
3928                 prefix, yes_no(c->restrict_realtime),
3929                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3930
3931         if (c->root_image)
3932                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3933
3934         STRV_FOREACH(e, c->environment)
3935                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3936
3937         STRV_FOREACH(e, c->environment_files)
3938                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3939
3940         STRV_FOREACH(e, c->pass_environment)
3941                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3942
3943         STRV_FOREACH(e, c->unset_environment)
3944                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3945
3946         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3947
3948         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3949                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3950
3951                 STRV_FOREACH(d, c->directories[dt].paths)
3952                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3953         }
3954
3955         if (c->nice_set)
3956                 fprintf(f,
3957                         "%sNice: %i\n",
3958                         prefix, c->nice);
3959
3960         if (c->oom_score_adjust_set)
3961                 fprintf(f,
3962                         "%sOOMScoreAdjust: %i\n",
3963                         prefix, c->oom_score_adjust);
3964
3965         for (i = 0; i < RLIM_NLIMITS; i++)
3966                 if (c->rlimit[i]) {
3967                         fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
3968                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3969                         fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
3970                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3971                 }
3972
3973         if (c->ioprio_set) {
3974                 _cleanup_free_ char *class_str = NULL;
3975
3976                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3977                 if (r >= 0)
3978                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3979
3980                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3981         }
3982
3983         if (c->cpu_sched_set) {
3984                 _cleanup_free_ char *policy_str = NULL;
3985
3986                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3987                 if (r >= 0)
3988                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3989
3990                 fprintf(f,
3991                         "%sCPUSchedulingPriority: %i\n"
3992                         "%sCPUSchedulingResetOnFork: %s\n",
3993                         prefix, c->cpu_sched_priority,
3994                         prefix, yes_no(c->cpu_sched_reset_on_fork));
3995         }
3996
3997         if (c->cpuset) {
3998                 fprintf(f, "%sCPUAffinity:", prefix);
3999                 for (i = 0; i < c->cpuset_ncpus; i++)
4000                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4001                                 fprintf(f, " %u", i);
4002                 fputs("\n", f);
4003         }
4004
4005         if (c->timer_slack_nsec != NSEC_INFINITY)
4006                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4007
4008         fprintf(f,
4009                 "%sStandardInput: %s\n"
4010                 "%sStandardOutput: %s\n"
4011                 "%sStandardError: %s\n",
4012                 prefix, exec_input_to_string(c->std_input),
4013                 prefix, exec_output_to_string(c->std_output),
4014                 prefix, exec_output_to_string(c->std_error));
4015
4016         if (c->std_input == EXEC_INPUT_NAMED_FD)
4017                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4018         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4019                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4020         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4021                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4022
4023         if (c->std_input == EXEC_INPUT_FILE)
4024                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4025         if (c->std_output == EXEC_OUTPUT_FILE)
4026                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4027         if (c->std_error == EXEC_OUTPUT_FILE)
4028                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4029
4030         if (c->tty_path)
4031                 fprintf(f,
4032                         "%sTTYPath: %s\n"
4033                         "%sTTYReset: %s\n"
4034                         "%sTTYVHangup: %s\n"
4035                         "%sTTYVTDisallocate: %s\n",
4036                         prefix, c->tty_path,
4037                         prefix, yes_no(c->tty_reset),
4038                         prefix, yes_no(c->tty_vhangup),
4039                         prefix, yes_no(c->tty_vt_disallocate));
4040
4041         if (IN_SET(c->std_output,
4042                    EXEC_OUTPUT_SYSLOG,
4043                    EXEC_OUTPUT_KMSG,
4044                    EXEC_OUTPUT_JOURNAL,
4045                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4046                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4047                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4048             IN_SET(c->std_error,
4049                    EXEC_OUTPUT_SYSLOG,
4050                    EXEC_OUTPUT_KMSG,
4051                    EXEC_OUTPUT_JOURNAL,
4052                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4053                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4054                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4055
4056                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4057
4058                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4059                 if (r >= 0)
4060                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4061
4062                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4063                 if (r >= 0)
4064                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4065         }
4066
4067         if (c->log_level_max >= 0) {
4068                 _cleanup_free_ char *t = NULL;
4069
4070                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4071
4072                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4073         }
4074
4075         if (c->n_log_extra_fields > 0) {
4076                 size_t j;
4077
4078                 for (j = 0; j < c->n_log_extra_fields; j++) {
4079                         fprintf(f, "%sLogExtraFields: ", prefix);
4080                         fwrite(c->log_extra_fields[j].iov_base,
4081                                1, c->log_extra_fields[j].iov_len,
4082                                f);
4083                         fputc('\n', f);
4084                 }
4085         }
4086
4087         if (c->secure_bits) {
4088                 _cleanup_free_ char *str = NULL;
4089
4090                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4091                 if (r >= 0)
4092                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4093         }
4094
4095         if (c->capability_bounding_set != CAP_ALL) {
4096                 _cleanup_free_ char *str = NULL;
4097
4098                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4099                 if (r >= 0)
4100                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4101         }
4102
4103         if (c->capability_ambient_set != 0) {
4104                 _cleanup_free_ char *str = NULL;
4105
4106                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4107                 if (r >= 0)
4108                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4109         }
4110
4111         if (c->user)
4112                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4113         if (c->group)
4114                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4115
4116         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4117
4118         if (!strv_isempty(c->supplementary_groups)) {
4119                 fprintf(f, "%sSupplementaryGroups:", prefix);
4120                 strv_fprintf(f, c->supplementary_groups);
4121                 fputs("\n", f);
4122         }
4123
4124         if (c->pam_name)
4125                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4126
4127         if (!strv_isempty(c->read_write_paths)) {
4128                 fprintf(f, "%sReadWritePaths:", prefix);
4129                 strv_fprintf(f, c->read_write_paths);
4130                 fputs("\n", f);
4131         }
4132
4133         if (!strv_isempty(c->read_only_paths)) {
4134                 fprintf(f, "%sReadOnlyPaths:", prefix);
4135                 strv_fprintf(f, c->read_only_paths);
4136                 fputs("\n", f);
4137         }
4138
4139         if (!strv_isempty(c->inaccessible_paths)) {
4140                 fprintf(f, "%sInaccessiblePaths:", prefix);
4141                 strv_fprintf(f, c->inaccessible_paths);
4142                 fputs("\n", f);
4143         }
4144
4145         if (c->n_bind_mounts > 0)
4146                 for (i = 0; i < c->n_bind_mounts; i++)
4147                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4148                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4149                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4150                                 c->bind_mounts[i].source,
4151                                 c->bind_mounts[i].destination,
4152                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4153
4154         if (c->n_temporary_filesystems > 0)
4155                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4156                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4157
4158                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4159                                 t->path,
4160                                 isempty(t->options) ? "" : ":",
4161                                 strempty(t->options));
4162                 }
4163
4164         if (c->utmp_id)
4165                 fprintf(f,
4166                         "%sUtmpIdentifier: %s\n",
4167                         prefix, c->utmp_id);
4168
4169         if (c->selinux_context)
4170                 fprintf(f,
4171                         "%sSELinuxContext: %s%s\n",
4172                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4173
4174         if (c->apparmor_profile)
4175                 fprintf(f,
4176                         "%sAppArmorProfile: %s%s\n",
4177                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4178
4179         if (c->smack_process_label)
4180                 fprintf(f,
4181                         "%sSmackProcessLabel: %s%s\n",
4182                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4183
4184         if (c->personality != PERSONALITY_INVALID)
4185                 fprintf(f,
4186                         "%sPersonality: %s\n",
4187                         prefix, strna(personality_to_string(c->personality)));
4188
4189         fprintf(f,
4190                 "%sLockPersonality: %s\n",
4191                 prefix, yes_no(c->lock_personality));
4192
4193         if (c->syscall_filter) {
4194 #if HAVE_SECCOMP
4195                 Iterator j;
4196                 void *id, *val;
4197                 bool first = true;
4198 #endif
4199
4200                 fprintf(f,
4201                         "%sSystemCallFilter: ",
4202                         prefix);
4203
4204                 if (!c->syscall_whitelist)
4205                         fputc('~', f);
4206
4207 #if HAVE_SECCOMP
4208                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4209                         _cleanup_free_ char *name = NULL;
4210                         const char *errno_name = NULL;
4211                         int num = PTR_TO_INT(val);
4212
4213                         if (first)
4214                                 first = false;
4215                         else
4216                                 fputc(' ', f);
4217
4218                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4219                         fputs(strna(name), f);
4220
4221                         if (num >= 0) {
4222                                 errno_name = errno_to_name(num);
4223                                 if (errno_name)
4224                                         fprintf(f, ":%s", errno_name);
4225                                 else
4226                                         fprintf(f, ":%d", num);
4227                         }
4228                 }
4229 #endif
4230
4231                 fputc('\n', f);
4232         }
4233
4234         if (c->syscall_archs) {
4235 #if HAVE_SECCOMP
4236                 Iterator j;
4237                 void *id;
4238 #endif
4239
4240                 fprintf(f,
4241                         "%sSystemCallArchitectures:",
4242                         prefix);
4243
4244 #if HAVE_SECCOMP
4245                 SET_FOREACH(id, c->syscall_archs, j)
4246                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4247 #endif
4248                 fputc('\n', f);
4249         }
4250
4251         if (exec_context_restrict_namespaces_set(c)) {
4252                 _cleanup_free_ char *s = NULL;
4253
4254                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4255                 if (r >= 0)
4256                         fprintf(f, "%sRestrictNamespaces: %s\n",
4257                                 prefix, s);
4258         }
4259
4260         if (c->syscall_errno > 0) {
4261                 const char *errno_name;
4262
4263                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4264
4265                 errno_name = errno_to_name(c->syscall_errno);
4266                 if (errno_name)
4267                         fprintf(f, "%s\n", errno_name);
4268                 else
4269                         fprintf(f, "%d\n", c->syscall_errno);
4270         }
4271
4272         if (c->apparmor_profile)
4273                 fprintf(f,
4274                         "%sAppArmorProfile: %s%s\n",
4275                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4276 }
4277
4278 bool exec_context_maintains_privileges(const ExecContext *c) {
4279         assert(c);
4280
4281         /* Returns true if the process forked off would run under
4282          * an unchanged UID or as root. */
4283
4284         if (!c->user)
4285                 return true;
4286
4287         if (streq(c->user, "root") || streq(c->user, "0"))
4288                 return true;
4289
4290         return false;
4291 }
4292
4293 int exec_context_get_effective_ioprio(const ExecContext *c) {
4294         int p;
4295
4296         assert(c);
4297
4298         if (c->ioprio_set)
4299                 return c->ioprio;
4300
4301         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4302         if (p < 0)
4303                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4304
4305         return p;
4306 }
4307
4308 void exec_context_free_log_extra_fields(ExecContext *c) {
4309         size_t l;
4310
4311         assert(c);
4312
4313         for (l = 0; l < c->n_log_extra_fields; l++)
4314                 free(c->log_extra_fields[l].iov_base);
4315         c->log_extra_fields = mfree(c->log_extra_fields);
4316         c->n_log_extra_fields = 0;
4317 }
4318
4319 void exec_status_start(ExecStatus *s, pid_t pid) {
4320         assert(s);
4321
4322         zero(*s);
4323         s->pid = pid;
4324         dual_timestamp_get(&s->start_timestamp);
4325 }
4326
4327 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4328         assert(s);
4329
4330         if (s->pid && s->pid != pid)
4331                 zero(*s);
4332
4333         s->pid = pid;
4334         dual_timestamp_get(&s->exit_timestamp);
4335
4336         s->code = code;
4337         s->status = status;
4338
4339         if (context) {
4340                 if (context->utmp_id)
4341                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4342
4343                 exec_context_tty_reset(context, NULL);
4344         }
4345 }
4346
4347 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4348         char buf[FORMAT_TIMESTAMP_MAX];
4349
4350         assert(s);
4351         assert(f);
4352
4353         if (s->pid <= 0)
4354                 return;
4355
4356         prefix = strempty(prefix);
4357
4358         fprintf(f,
4359                 "%sPID: "PID_FMT"\n",
4360                 prefix, s->pid);
4361
4362         if (dual_timestamp_is_set(&s->start_timestamp))
4363                 fprintf(f,
4364                         "%sStart Timestamp: %s\n",
4365                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4366
4367         if (dual_timestamp_is_set(&s->exit_timestamp))
4368                 fprintf(f,
4369                         "%sExit Timestamp: %s\n"
4370                         "%sExit Code: %s\n"
4371                         "%sExit Status: %i\n",
4372                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4373                         prefix, sigchld_code_to_string(s->code),
4374                         prefix, s->status);
4375 }
4376
4377 static char *exec_command_line(char **argv) {
4378         size_t k;
4379         char *n, *p, **a;
4380         bool first = true;
4381
4382         assert(argv);
4383
4384         k = 1;
4385         STRV_FOREACH(a, argv)
4386                 k += strlen(*a)+3;
4387
4388         n = new(char, k);
4389         if (!n)
4390                 return NULL;
4391
4392         p = n;
4393         STRV_FOREACH(a, argv) {
4394
4395                 if (!first)
4396                         *(p++) = ' ';
4397                 else
4398                         first = false;
4399
4400                 if (strpbrk(*a, WHITESPACE)) {
4401                         *(p++) = '\'';
4402                         p = stpcpy(p, *a);
4403                         *(p++) = '\'';
4404                 } else
4405                         p = stpcpy(p, *a);
4406
4407         }
4408
4409         *p = 0;
4410
4411         /* FIXME: this doesn't really handle arguments that have
4412          * spaces and ticks in them */
4413
4414         return n;
4415 }
4416
4417 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4418         _cleanup_free_ char *cmd = NULL;
4419         const char *prefix2;
4420
4421         assert(c);
4422         assert(f);
4423
4424         prefix = strempty(prefix);
4425         prefix2 = strjoina(prefix, "\t");
4426
4427         cmd = exec_command_line(c->argv);
4428         fprintf(f,
4429                 "%sCommand Line: %s\n",
4430                 prefix, cmd ? cmd : strerror(ENOMEM));
4431
4432         exec_status_dump(&c->exec_status, f, prefix2);
4433 }
4434
4435 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4436         assert(f);
4437
4438         prefix = strempty(prefix);
4439
4440         LIST_FOREACH(command, c, c)
4441                 exec_command_dump(c, f, prefix);
4442 }
4443
4444 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4445         ExecCommand *end;
4446
4447         assert(l);
4448         assert(e);
4449
4450         if (*l) {
4451                 /* It's kind of important, that we keep the order here */
4452                 LIST_FIND_TAIL(command, *l, end);
4453                 LIST_INSERT_AFTER(command, *l, end, e);
4454         } else
4455               *l = e;
4456 }
4457
4458 int exec_command_set(ExecCommand *c, const char *path, ...) {
4459         va_list ap;
4460         char **l, *p;
4461
4462         assert(c);
4463         assert(path);
4464
4465         va_start(ap, path);
4466         l = strv_new_ap(path, ap);
4467         va_end(ap);
4468
4469         if (!l)
4470                 return -ENOMEM;
4471
4472         p = strdup(path);
4473         if (!p) {
4474                 strv_free(l);
4475                 return -ENOMEM;
4476         }
4477
4478         free(c->path);
4479         c->path = p;
4480
4481         return strv_free_and_replace(c->argv, l);
4482 }
4483
4484 int exec_command_append(ExecCommand *c, const char *path, ...) {
4485         _cleanup_strv_free_ char **l = NULL;
4486         va_list ap;
4487         int r;
4488
4489         assert(c);
4490         assert(path);
4491
4492         va_start(ap, path);
4493         l = strv_new_ap(path, ap);
4494         va_end(ap);
4495
4496         if (!l)
4497                 return -ENOMEM;
4498
4499         r = strv_extend_strv(&c->argv, l, false);
4500         if (r < 0)
4501                 return r;
4502
4503         return 0;
4504 }
4505
4506 static void *remove_tmpdir_thread(void *p) {
4507         _cleanup_free_ char *path = p;
4508
4509         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4510         return NULL;
4511 }
4512
4513 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4514         int r;
4515
4516         if (!rt)
4517                 return NULL;
4518
4519         if (rt->manager)
4520                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4521
4522         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4523         if (destroy && rt->tmp_dir) {
4524                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4525
4526                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4527                 if (r < 0) {
4528                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4529                         free(rt->tmp_dir);
4530                 }
4531
4532                 rt->tmp_dir = NULL;
4533         }
4534
4535         if (destroy && rt->var_tmp_dir) {
4536                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4537
4538                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4539                 if (r < 0) {
4540                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4541                         free(rt->var_tmp_dir);
4542                 }
4543
4544                 rt->var_tmp_dir = NULL;
4545         }
4546
4547         rt->id = mfree(rt->id);
4548         rt->tmp_dir = mfree(rt->tmp_dir);
4549         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4550         safe_close_pair(rt->netns_storage_socket);
4551         return mfree(rt);
4552 }
4553
4554 static void exec_runtime_freep(ExecRuntime **rt) {
4555         if (*rt)
4556                 (void) exec_runtime_free(*rt, false);
4557 }
4558
4559 static int exec_runtime_allocate(ExecRuntime **rt) {
4560         assert(rt);
4561
4562         *rt = new0(ExecRuntime, 1);
4563         if (!*rt)
4564                 return -ENOMEM;
4565
4566         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4567         return 0;
4568 }
4569
4570 static int exec_runtime_add(
4571                 Manager *m,
4572                 const char *id,
4573                 const char *tmp_dir,
4574                 const char *var_tmp_dir,
4575                 const int netns_storage_socket[2],
4576                 ExecRuntime **ret) {
4577
4578         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4579         int r;
4580
4581         assert(m);
4582         assert(id);
4583
4584         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4585         if (r < 0)
4586                 return r;
4587
4588         r = exec_runtime_allocate(&rt);
4589         if (r < 0)
4590                 return r;
4591
4592         rt->id = strdup(id);
4593         if (!rt->id)
4594                 return -ENOMEM;
4595
4596         if (tmp_dir) {
4597                 rt->tmp_dir = strdup(tmp_dir);
4598                 if (!rt->tmp_dir)
4599                         return -ENOMEM;
4600
4601                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4602                 assert(var_tmp_dir);
4603                 rt->var_tmp_dir = strdup(var_tmp_dir);
4604                 if (!rt->var_tmp_dir)
4605                         return -ENOMEM;
4606         }
4607
4608         if (netns_storage_socket) {
4609                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4610                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4611         }
4612
4613         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4614         if (r < 0)
4615                 return r;
4616
4617         rt->manager = m;
4618
4619         if (ret)
4620                 *ret = rt;
4621
4622         /* do not remove created ExecRuntime object when the operation succeeds. */
4623         rt = NULL;
4624         return 0;
4625 }
4626
4627 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4628         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4629         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4630         int r;
4631
4632         assert(m);
4633         assert(c);
4634         assert(id);
4635
4636         /* It is not necessary to create ExecRuntime object. */
4637         if (!c->private_network && !c->private_tmp)
4638                 return 0;
4639
4640         if (c->private_tmp) {
4641                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4642                 if (r < 0)
4643                         return r;
4644         }
4645
4646         if (c->private_network) {
4647                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4648                         return -errno;
4649         }
4650
4651         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4652         if (r < 0)
4653                 return r;
4654
4655         /* Avoid cleanup */
4656         netns_storage_socket[0] = -1;
4657         netns_storage_socket[1] = -1;
4658         return 1;
4659 }
4660
4661 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4662         ExecRuntime *rt;
4663         int r;
4664
4665         assert(m);
4666         assert(id);
4667         assert(ret);
4668
4669         rt = hashmap_get(m->exec_runtime_by_id, id);
4670         if (rt)
4671                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4672                 goto ref;
4673
4674         if (!create)
4675                 return 0;
4676
4677         /* If not found, then create a new object. */
4678         r = exec_runtime_make(m, c, id, &rt);
4679         if (r <= 0)
4680                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4681                 return r;
4682
4683 ref:
4684         /* increment reference counter. */
4685         rt->n_ref++;
4686         *ret = rt;
4687         return 1;
4688 }
4689
4690 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4691         if (!rt)
4692                 return NULL;
4693
4694         assert(rt->n_ref > 0);
4695
4696         rt->n_ref--;
4697         if (rt->n_ref > 0)
4698                 return NULL;
4699
4700         return exec_runtime_free(rt, destroy);
4701 }
4702
4703 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4704         ExecRuntime *rt;
4705         Iterator i;
4706
4707         assert(m);
4708         assert(f);
4709         assert(fds);
4710
4711         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4712                 fprintf(f, "exec-runtime=%s", rt->id);
4713
4714                 if (rt->tmp_dir)
4715                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4716
4717                 if (rt->var_tmp_dir)
4718                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4719
4720                 if (rt->netns_storage_socket[0] >= 0) {
4721                         int copy;
4722
4723                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4724                         if (copy < 0)
4725                                 return copy;
4726
4727                         fprintf(f, " netns-socket-0=%i", copy);
4728                 }
4729
4730                 if (rt->netns_storage_socket[1] >= 0) {
4731                         int copy;
4732
4733                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4734                         if (copy < 0)
4735                                 return copy;
4736
4737                         fprintf(f, " netns-socket-1=%i", copy);
4738                 }
4739
4740                 fputc('\n', f);
4741         }
4742
4743         return 0;
4744 }
4745
4746 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4747         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4748         ExecRuntime *rt;
4749         int r;
4750
4751         /* This is for the migration from old (v237 or earlier) deserialization text.
4752          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4753          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4754          * so or not from the serialized text, then we always creates a new object owned by this. */
4755
4756         assert(u);
4757         assert(key);
4758         assert(value);
4759
4760         /* Manager manages ExecRuntime objects by the unit id.
4761          * So, we omit the serialized text when the unit does not have id (yet?)... */
4762         if (isempty(u->id)) {
4763                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4764                 return 0;
4765         }
4766
4767         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4768         if (r < 0) {
4769                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4770                 return 0;
4771         }
4772
4773         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4774         if (!rt) {
4775                 r = exec_runtime_allocate(&rt_create);
4776                 if (r < 0)
4777                         return log_oom();
4778
4779                 rt_create->id = strdup(u->id);
4780                 if (!rt_create->id)
4781                         return log_oom();
4782
4783                 rt = rt_create;
4784         }
4785
4786         if (streq(key, "tmp-dir")) {
4787                 char *copy;
4788
4789                 copy = strdup(value);
4790                 if (!copy)
4791                         return log_oom();
4792
4793                 free_and_replace(rt->tmp_dir, copy);
4794
4795         } else if (streq(key, "var-tmp-dir")) {
4796                 char *copy;
4797
4798                 copy = strdup(value);
4799                 if (!copy)
4800                         return log_oom();
4801
4802                 free_and_replace(rt->var_tmp_dir, copy);
4803
4804         } else if (streq(key, "netns-socket-0")) {
4805                 int fd;
4806
4807                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4808                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4809                         return 0;
4810                 }
4811
4812                 safe_close(rt->netns_storage_socket[0]);
4813                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4814
4815         } else if (streq(key, "netns-socket-1")) {
4816                 int fd;
4817
4818                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4819                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4820                         return 0;
4821                 }
4822
4823                 safe_close(rt->netns_storage_socket[1]);
4824                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4825         } else
4826                 return 0;
4827
4828         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4829         if (rt_create) {
4830                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4831                 if (r < 0) {
4832                         log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4833                         return 0;
4834                 }
4835
4836                 rt_create->manager = u->manager;
4837
4838                 /* Avoid cleanup */
4839                 rt_create = NULL;
4840         }
4841
4842         return 1;
4843 }
4844
4845 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4846         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4847         int r, fd0 = -1, fd1 = -1;
4848         const char *p, *v = value;
4849         size_t n;
4850
4851         assert(m);
4852         assert(value);
4853         assert(fds);
4854
4855         n = strcspn(v, " ");
4856         id = strndupa(v, n);
4857         if (v[n] != ' ')
4858                 goto finalize;
4859         p = v + n + 1;
4860
4861         v = startswith(p, "tmp-dir=");
4862         if (v) {
4863                 n = strcspn(v, " ");
4864                 tmp_dir = strndupa(v, n);
4865                 if (v[n] != ' ')
4866                         goto finalize;
4867                 p = v + n + 1;
4868         }
4869
4870         v = startswith(p, "var-tmp-dir=");
4871         if (v) {
4872                 n = strcspn(v, " ");
4873                 var_tmp_dir = strndupa(v, n);
4874                 if (v[n] != ' ')
4875                         goto finalize;
4876                 p = v + n + 1;
4877         }
4878
4879         v = startswith(p, "netns-socket-0=");
4880         if (v) {
4881                 char *buf;
4882
4883                 n = strcspn(v, " ");
4884                 buf = strndupa(v, n);
4885                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4886                         log_debug("Unable to process exec-runtime netns fd specification.");
4887                         return;
4888                 }
4889                 fd0 = fdset_remove(fds, fd0);
4890                 if (v[n] != ' ')
4891                         goto finalize;
4892                 p = v + n + 1;
4893         }
4894
4895         v = startswith(p, "netns-socket-1=");
4896         if (v) {
4897                 char *buf;
4898
4899                 n = strcspn(v, " ");
4900                 buf = strndupa(v, n);
4901                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4902                         log_debug("Unable to process exec-runtime netns fd specification.");
4903                         return;
4904                 }
4905                 fd1 = fdset_remove(fds, fd1);
4906         }
4907
4908 finalize:
4909
4910         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4911         if (r < 0) {
4912                 log_debug_errno(r, "Failed to add exec-runtime: %m");
4913                 return;
4914         }
4915 }
4916
4917 void exec_runtime_vacuum(Manager *m) {
4918         ExecRuntime *rt;
4919         Iterator i;
4920
4921         assert(m);
4922
4923         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4924
4925         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4926                 if (rt->n_ref > 0)
4927                         continue;
4928
4929                 (void) exec_runtime_free(rt, false);
4930         }
4931 }
4932
/* String names for the ExecInput enumerators, indexed by enumerator value. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

/* NOTE(review): presumably expands to the exec_input to-string/from-string helpers backed by the table
 * above; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4945
/* String names for the ExecOutput enumerators, indexed by enumerator value. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
};

/* NOTE(review): presumably expands to the exec_output to-string/from-string helpers backed by the table
 * above; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4962
/* String names for the ExecUtmpMode enumerators, indexed by enumerator value. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

/* NOTE(review): presumably expands to the exec_utmp_mode to-string/from-string helpers backed by the
 * table above; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4970
/* String names for the ExecPreserveMode enumerators, indexed by enumerator value. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* NOTE(review): unlike the plain lookup macro used for the neighbouring tables, this variant takes an
 * extra enumerator (EXEC_PRESERVE_YES). Presumably that is the value generic boolean "true" strings are
 * mapped to when parsing; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4978
/* String names for the ExecDirectoryType enumerators, indexed by enumerator value.
 * NOTE(review): the strings look like the matching unit file setting names rather than lower-case
 * identifiers as in the other tables here; verify against the users of this table before changing them. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* NOTE(review): presumably expands to the exec_directory_type to-string/from-string helpers backed by
 * the table above; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4988
/* String names for the ExecKeyringMode enumerators, indexed by enumerator value. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

/* NOTE(review): presumably expands to the exec_keyring_mode to-string/from-string helpers backed by the
 * table above; the macro is defined outside this file, confirm there. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);