src/core/execute.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <errno.h>
   4 #include <fcntl.h>
   5 #include <glob.h>
   6 #include <grp.h>
   7 #include <poll.h>
   8 #include <signal.h>
   9 #include <string.h>
  10 #include <sys/capability.h>
  11 #include <sys/eventfd.h>
  12 #include <sys/mman.h>
  13 #include <sys/personality.h>
  14 #include <sys/prctl.h>
  15 #include <sys/shm.h>
  16 #include <sys/socket.h>
  17 #include <sys/stat.h>
  18 #include <sys/types.h>
  19 #include <sys/un.h>
  20 #include <unistd.h>
  21 #include <utmpx.h>
  22
  23 #if HAVE_PAM
  24 #include <security/pam_appl.h>
  25 #endif
  26
  27 #if HAVE_SELINUX
  28 #include <selinux/selinux.h>
  29 #endif
  30
  31 #if HAVE_SECCOMP
  32 #include <seccomp.h>
  33 #endif
  34
  35 #if HAVE_APPARMOR
  36 #include <sys/apparmor.h>
  37 #endif
  38
  39 #include "sd-messages.h"
  40
  41 #include "af-list.h"
  42 #include "alloc-util.h"
  43 #if HAVE_APPARMOR
  44 #include "apparmor-util.h"
  45 #endif
  46 #include "async.h"
  47 #include "barrier.h"
  48 #include "cap-list.h"
  49 #include "capability-util.h"
  50 #include "chown-recursive.h"
  51 #include "cpu-set-util.h"
  52 #include "def.h"
  53 #include "env-util.h"
  54 #include "errno-list.h"
  55 #include "execute.h"
  56 #include "exit-status.h"
  57 #include "fd-util.h"
  58 #include "fileio.h"
  59 #include "format-util.h"
  60 #include "fs-util.h"
  61 #include "glob-util.h"
  62 #include "io-util.h"
  63 #include "ioprio.h"
  64 #include "label.h"
  65 #include "log.h"
  66 #include "macro.h"
  67 #include "manager.h"
  68 #include "missing.h"
  69 #include "mkdir.h"
  70 #include "namespace.h"
  71 #include "parse-util.h"
  72 #include "path-util.h"
  73 #include "process-util.h"
  74 #include "rlimit-util.h"
  75 #include "rm-rf.h"
  76 #if HAVE_SECCOMP
  77 #include "seccomp-util.h"
  78 #endif
  79 #include "securebits.h"
  80 #include "securebits-util.h"
  81 #include "selinux-util.h"
  82 #include "signal-util.h"
  83 #include "smack-util.h"
  84 #include "socket-util.h"
  85 #include "special.h"
  86 #include "stat-util.h"
  87 #include "string-table.h"
  88 #include "string-util.h"
  89 #include "strv.h"
  90 #include "syslog-util.h"
  91 #include "terminal-util.h"
  92 #include "umask-util.h"
  93 #include "unit.h"
  94 #include "user-util.h"
  95 #include "util.h"
  96 #include "utmp-wtmp.h"
  97
  98 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
  99 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
 100
 101 /* This assumes there is a 'tty' group */
 102 #define TTY_MODE 0620
 103
 104 #define SNDBUF_SIZE (8*1024*1024)
 105
 106 static int shift_fds(int fds[], size_t n_fds) {
 107         int start, restart_from;
 108
 109         if (n_fds <= 0)
 110                 return 0;
 111
 112         /* Modifies the fds array! (sorts it) */
 113
 114         assert(fds);
 115
 116         start = 0;
 117         for (;;) {
 118                 int i;
 119
 120                 restart_from = -1;
 121
 122                 for (i = start; i < (int) n_fds; i++) {
 123                         int nfd;
 124
 125                         /* Already at right index? */
 126                         if (fds[i] == i+3)
 127                                 continue;
 128
 129                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 130                         if (nfd < 0)
 131                                 return -errno;
 132
 133                         safe_close(fds[i]);
 134                         fds[i] = nfd;
 135
 136                         /* Hmm, the fd we wanted isn't free? Then
 137                          * let's remember that and try again from here */
 138                         if (nfd != i+3 && restart_from < 0)
 139                                 restart_from = i;
 140                 }
 141
 142                 if (restart_from < 0)
 143                         break;
 144
 145                 start = restart_from;
 146         }
 147
 148         return 0;
 149 }
 150
 151 static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
 152         size_t i, n_fds;
 153         int r;
 154
 155         n_fds = n_socket_fds + n_storage_fds;
 156         if (n_fds <= 0)
 157                 return 0;
 158
 159         assert(fds);
 160
 161         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 162          * O_NONBLOCK only applies to socket activation though. */
 163
 164         for (i = 0; i < n_fds; i++) {
 165
 166                 if (i < n_socket_fds) {
 167                         r = fd_nonblock(fds[i], nonblock);
 168                         if (r < 0)
 169                                 return r;
 170                 }
 171
 172                 /* We unconditionally drop FD_CLOEXEC from the fds,
 173                  * since after all we want to pass these fds to our
 174                  * children */
 175
 176                 r = fd_cloexec(fds[i], false);
 177                 if (r < 0)
 178                         return r;
 179         }
 180
 181         return 0;
 182 }
 183
 184 static const char *exec_context_tty_path(const ExecContext *context) {
 185         assert(context);
 186
 187         if (context->stdio_as_fds)
 188                 return NULL;
 189
 190         if (context->tty_path)
 191                 return context->tty_path;
 192
 193         return "/dev/console";
 194 }
 195
 196 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 197         const char *path;
 198
 199         assert(context);
 200
 201         path = exec_context_tty_path(context);
 202
 203         if (context->tty_vhangup) {
 204                 if (p && p->stdin_fd >= 0)
 205                         (void) terminal_vhangup_fd(p->stdin_fd);
 206                 else if (path)
 207                         (void) terminal_vhangup(path);
 208         }
 209
 210         if (context->tty_reset) {
 211                 if (p && p->stdin_fd >= 0)
 212                         (void) reset_terminal_fd(p->stdin_fd, true);
 213                 else if (path)
 214                         (void) reset_terminal(path);
 215         }
 216
 217         if (context->tty_vt_disallocate && path)
 218                 (void) vt_disallocate(path);
 219 }
 220
 221 static bool is_terminal_input(ExecInput i) {
 222         return IN_SET(i,
 223                       EXEC_INPUT_TTY,
 224                       EXEC_INPUT_TTY_FORCE,
 225                       EXEC_INPUT_TTY_FAIL);
 226 }
 227
 228 static bool is_terminal_output(ExecOutput o) {
 229         return IN_SET(o,
 230                       EXEC_OUTPUT_TTY,
 231                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 232                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 233                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 234 }
 235
 236 static bool is_syslog_output(ExecOutput o) {
 237         return IN_SET(o,
 238                       EXEC_OUTPUT_SYSLOG,
 239                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 240 }
 241
 242 static bool is_kmsg_output(ExecOutput o) {
 243         return IN_SET(o,
 244                       EXEC_OUTPUT_KMSG,
 245                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 246 }
 247
 248 static bool exec_context_needs_term(const ExecContext *c) {
 249         assert(c);
 250
 251         /* Return true if the execution context suggests we should set $TERM to something useful. */
 252
 253         if (is_terminal_input(c->std_input))
 254                 return true;
 255
 256         if (is_terminal_output(c->std_output))
 257                 return true;
 258
 259         if (is_terminal_output(c->std_error))
 260                 return true;
 261
 262         return !!c->tty_path;
 263 }
 264
 265 static int open_null_as(int flags, int nfd) {
 266         int fd;
 267
 268         assert(nfd >= 0);
 269
 270         fd = open("/dev/null", flags|O_NOCTTY);
 271         if (fd < 0)
 272                 return -errno;
 273
 274         return move_fd(fd, nfd, false);
 275 }
 276
 277 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 278         static const union sockaddr_union sa = {
 279                 .un.sun_family = AF_UNIX,
 280                 .un.sun_path = "/run/systemd/journal/stdout",
 281         };
 282         uid_t olduid = UID_INVALID;
 283         gid_t oldgid = GID_INVALID;
 284         int r;
 285
 286         if (gid_is_valid(gid)) {
 287                 oldgid = getgid();
 288
 289                 if (setegid(gid) < 0)
 290                         return -errno;
 291         }
 292
 293         if (uid_is_valid(uid)) {
 294                 olduid = getuid();
 295
 296                 if (seteuid(uid) < 0) {
 297                         r = -errno;
 298                         goto restore_gid;
 299                 }
 300         }
 301
 302         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 303
 304         /* If we fail to restore the uid or gid, things will likely
 305            fail later on. This should only happen if an LSM interferes. */
 306
 307         if (uid_is_valid(uid))
 308                 (void) seteuid(olduid);
 309
 310  restore_gid:
 311         if (gid_is_valid(gid))
 312                 (void) setegid(oldgid);
 313
 314         return r;
 315 }
 316
 317 static int connect_logger_as(
 318                 const Unit *unit,
 319                 const ExecContext *context,
 320                 const ExecParameters *params,
 321                 ExecOutput output,
 322                 const char *ident,
 323                 int nfd,
 324                 uid_t uid,
 325                 gid_t gid) {
 326
 327         int fd, r;
 328
 329         assert(context);
 330         assert(params);
 331         assert(output < _EXEC_OUTPUT_MAX);
 332         assert(ident);
 333         assert(nfd >= 0);
 334
 335         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = connect_journal_socket(fd, uid, gid);
 340         if (r < 0)
 341                 return r;
 342
 343         if (shutdown(fd, SHUT_RD) < 0) {
 344                 safe_close(fd);
 345                 return -errno;
 346         }
 347
 348         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 349
 350         dprintf(fd,
 351                 "%s\n"
 352                 "%s\n"
 353                 "%i\n"
 354                 "%i\n"
 355                 "%i\n"
 356                 "%i\n"
 357                 "%i\n",
 358                 context->syslog_identifier ?: ident,
 359                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 360                 context->syslog_priority,
 361                 !!context->syslog_level_prefix,
 362                 is_syslog_output(output),
 363                 is_kmsg_output(output),
 364                 is_terminal_output(output));
 365
 366         return move_fd(fd, nfd, false);
 367 }
 368 static int open_terminal_as(const char *path, int flags, int nfd) {
 369         int fd;
 370
 371         assert(path);
 372         assert(nfd >= 0);
 373
 374         fd = open_terminal(path, flags | O_NOCTTY);
 375         if (fd < 0)
 376                 return fd;
 377
 378         return move_fd(fd, nfd, false);
 379 }
 380
 381 static int acquire_path(const char *path, int flags, mode_t mode) {
 382         union sockaddr_union sa = {
 383                 .sa.sa_family = AF_UNIX,
 384         };
 385         int fd, r;
 386
 387         assert(path);
 388
 389         if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
 390                 flags |= O_CREAT;
 391
 392         fd = open(path, flags|O_NOCTTY, mode);
 393         if (fd >= 0)
 394                 return fd;
 395
 396         if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
 397                 return -errno;
 398         if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
 399                 return -ENXIO;
 400
 401         /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
 402
 403         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 404         if (fd < 0)
 405                 return -errno;
 406
 407         strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
 408         if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
 409                 safe_close(fd);
 410                 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
 411                                                            * indication that his wasn't an AF_UNIX socket after all */
 412         }
 413
 414         if ((flags & O_ACCMODE) == O_RDONLY)
 415                 r = shutdown(fd, SHUT_WR);
 416         else if ((flags & O_ACCMODE) == O_WRONLY)
 417                 r = shutdown(fd, SHUT_RD);
 418         else
 419                 return fd;
 420         if (r < 0) {
 421                 safe_close(fd);
 422                 return -errno;
 423         }
 424
 425         return fd;
 426 }
 427
 428 static int fixup_input(
 429                 const ExecContext *context,
 430                 int socket_fd,
 431                 bool apply_tty_stdin) {
 432
 433         ExecInput std_input;
 434
 435         assert(context);
 436
 437         std_input = context->std_input;
 438
 439         if (is_terminal_input(std_input) && !apply_tty_stdin)
 440                 return EXEC_INPUT_NULL;
 441
 442         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 443                 return EXEC_INPUT_NULL;
 444
 445         if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
 446                 return EXEC_INPUT_NULL;
 447
 448         return std_input;
 449 }
 450
 451 static int fixup_output(ExecOutput std_output, int socket_fd) {
 452
 453         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 454                 return EXEC_OUTPUT_INHERIT;
 455
 456         return std_output;
 457 }
 458
 459 static int setup_input(
 460                 const ExecContext *context,
 461                 const ExecParameters *params,
 462                 int socket_fd,
 463                 int named_iofds[3]) {
 464
 465         ExecInput i;
 466
 467         assert(context);
 468         assert(params);
 469
 470         if (params->stdin_fd >= 0) {
 471                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 472                         return -errno;
 473
 474                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 475                 if (isatty(STDIN_FILENO)) {
 476                         (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 477                         (void) reset_terminal_fd(STDIN_FILENO, true);
 478                 }
 479
 480                 return STDIN_FILENO;
 481         }
 482
 483         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 484
 485         switch (i) {
 486
 487         case EXEC_INPUT_NULL:
 488                 return open_null_as(O_RDONLY, STDIN_FILENO);
 489
 490         case EXEC_INPUT_TTY:
 491         case EXEC_INPUT_TTY_FORCE:
 492         case EXEC_INPUT_TTY_FAIL: {
 493                 int fd;
 494
 495                 fd = acquire_terminal(exec_context_tty_path(context),
 496                                       i == EXEC_INPUT_TTY_FAIL  ? ACQUIRE_TERMINAL_TRY :
 497                                       i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
 498                                                                   ACQUIRE_TERMINAL_WAIT,
 499                                       USEC_INFINITY);
 500                 if (fd < 0)
 501                         return fd;
 502
 503                 return move_fd(fd, STDIN_FILENO, false);
 504         }
 505
 506         case EXEC_INPUT_SOCKET:
 507                 assert(socket_fd >= 0);
 508
 509                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 510
 511         case EXEC_INPUT_NAMED_FD:
 512                 assert(named_iofds[STDIN_FILENO] >= 0);
 513
 514                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 515                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 516
 517         case EXEC_INPUT_DATA: {
 518                 int fd;
 519
 520                 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
 521                 if (fd < 0)
 522                         return fd;
 523
 524                 return move_fd(fd, STDIN_FILENO, false);
 525         }
 526
 527         case EXEC_INPUT_FILE: {
 528                 bool rw;
 529                 int fd;
 530
 531                 assert(context->stdio_file[STDIN_FILENO]);
 532
 533                 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
 534                         (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
 535
 536                 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
 537                 if (fd < 0)
 538                         return fd;
 539
 540                 return move_fd(fd, STDIN_FILENO, false);
 541         }
 542
 543         default:
 544                 assert_not_reached("Unknown input type");
 545         }
 546 }
 547
 548 static int setup_output(
 549                 const Unit *unit,
 550                 const ExecContext *context,
 551                 const ExecParameters *params,
 552                 int fileno,
 553                 int socket_fd,
 554                 int named_iofds[3],
 555                 const char *ident,
 556                 uid_t uid,
 557                 gid_t gid,
 558                 dev_t *journal_stream_dev,
 559                 ino_t *journal_stream_ino) {
 560
 561         ExecOutput o;
 562         ExecInput i;
 563         int r;
 564
 565         assert(unit);
 566         assert(context);
 567         assert(params);
 568         assert(ident);
 569         assert(journal_stream_dev);
 570         assert(journal_stream_ino);
 571
 572         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 573
 574                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 575                         return -errno;
 576
 577                 return STDOUT_FILENO;
 578         }
 579
 580         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 581                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 582                         return -errno;
 583
 584                 return STDERR_FILENO;
 585         }
 586
 587         i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 588         o = fixup_output(context->std_output, socket_fd);
 589
 590         if (fileno == STDERR_FILENO) {
 591                 ExecOutput e;
 592                 e = fixup_output(context->std_error, socket_fd);
 593
 594                 /* This expects the input and output are already set up */
 595
 596                 /* Don't change the stderr file descriptor if we inherit all
 597                  * the way and are not on a tty */
 598                 if (e == EXEC_OUTPUT_INHERIT &&
 599                     o == EXEC_OUTPUT_INHERIT &&
 600                     i == EXEC_INPUT_NULL &&
 601                     !is_terminal_input(context->std_input) &&
 602                     getppid () != 1)
 603                         return fileno;
 604
 605                 /* Duplicate from stdout if possible */
 606                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 607                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 608
 609                 o = e;
 610
 611         } else if (o == EXEC_OUTPUT_INHERIT) {
 612                 /* If input got downgraded, inherit the original value */
 613                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 614                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 615
 616                 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
 617                 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
 618                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 619
 620                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 621                 if (getppid() != 1)
 622                         return fileno;
 623
 624                 /* We need to open /dev/null here anew, to get the right access mode. */
 625                 return open_null_as(O_WRONLY, fileno);
 626         }
 627
 628         switch (o) {
 629
 630         case EXEC_OUTPUT_NULL:
 631                 return open_null_as(O_WRONLY, fileno);
 632
 633         case EXEC_OUTPUT_TTY:
 634                 if (is_terminal_input(i))
 635                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 636
 637                 /* We don't reset the terminal if this is just about output */
 638                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 639
 640         case EXEC_OUTPUT_SYSLOG:
 641         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 642         case EXEC_OUTPUT_KMSG:
 643         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 644         case EXEC_OUTPUT_JOURNAL:
 645         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 646                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 647                 if (r < 0) {
 648                         log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 649                         r = open_null_as(O_WRONLY, fileno);
 650                 } else {
 651                         struct stat st;
 652
 653                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 654                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 655                          * services to detect whether they are connected to the journal or not.
 656                          *
 657                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 658                          * about STDERR as that's usually the best way to do logging. */
 659
 660                         if (fstat(fileno, &st) >= 0 &&
 661                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 662                                 *journal_stream_dev = st.st_dev;
 663                                 *journal_stream_ino = st.st_ino;
 664                         }
 665                 }
 666                 return r;
 667
 668         case EXEC_OUTPUT_SOCKET:
 669                 assert(socket_fd >= 0);
 670
 671                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 672
 673         case EXEC_OUTPUT_NAMED_FD:
 674                 assert(named_iofds[fileno] >= 0);
 675
 676                 (void) fd_nonblock(named_iofds[fileno], false);
 677                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 678
 679         case EXEC_OUTPUT_FILE:
 680         case EXEC_OUTPUT_FILE_APPEND: {
 681                 bool rw;
 682                 int fd, flags;
 683
 684                 assert(context->stdio_file[fileno]);
 685
 686                 rw = context->std_input == EXEC_INPUT_FILE &&
 687                         streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
 688
 689                 if (rw)
 690                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 691
 692                 flags = O_WRONLY;
 693                 if (o == EXEC_OUTPUT_FILE_APPEND)
 694                         flags |= O_APPEND;
 695
 696                 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
 697
 698                 if (fd < 0)
 699                         return fd;
 700
 701                 return move_fd(fd, fileno, 0);
 702         }
 703
 704         default:
 705                 assert_not_reached("Unknown error type");
 706         }
 707 }
 708
 709 static int chown_terminal(int fd, uid_t uid) {
 710         struct stat st;
 711
 712         assert(fd >= 0);
 713
 714         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 715         if (isatty(fd) < 1)
 716                 return 0;
 717
 718         /* This might fail. What matters are the results. */
 719         (void) fchown(fd, uid, -1);
 720         (void) fchmod(fd, TTY_MODE);
 721
 722         if (fstat(fd, &st) < 0)
 723                 return -errno;
 724
 725         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 726                 return -EPERM;
 727
 728         return 0;
 729 }
 730
 731 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 732         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 733         int r;
 734
 735         assert(_saved_stdin);
 736         assert(_saved_stdout);
 737
 738         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 739         if (saved_stdin < 0)
 740                 return -errno;
 741
 742         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 743         if (saved_stdout < 0)
 744                 return -errno;
 745
 746         fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
 747         if (fd < 0)
 748                 return fd;
 749
 750         r = chown_terminal(fd, getuid());
 751         if (r < 0)
 752                 return r;
 753
 754         r = reset_terminal_fd(fd, true);
 755         if (r < 0)
 756                 return r;
 757
 758         r = rearrange_stdio(fd, fd, STDERR_FILENO);
 759         fd = -1;
 760         if (r < 0)
 761                 return r;
 762
 763         *_saved_stdin = saved_stdin;
 764         *_saved_stdout = saved_stdout;
 765
 766         saved_stdin = saved_stdout = -1;
 767
 768         return 0;
 769 }
 770
 771 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 772         assert(err < 0);
 773
 774         if (err == -ETIMEDOUT)
 775                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 776         else {
 777                 errno = -err;
 778                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 779         }
 780 }
 781
 782 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 783         _cleanup_close_ int fd = -1;
 784
 785         assert(vc);
 786
 787         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 788         if (fd < 0)
 789                 return;
 790
 791         write_confirm_error_fd(err, fd, u);
 792 }
 793
 794 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 795         int r = 0;
 796
 797         assert(saved_stdin);
 798         assert(saved_stdout);
 799
 800         release_terminal();
 801
 802         if (*saved_stdin >= 0)
 803                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 804                         r = -errno;
 805
 806         if (*saved_stdout >= 0)
 807                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 808                         r = -errno;
 809
 810         *saved_stdin = safe_close(*saved_stdin);
 811         *saved_stdout = safe_close(*saved_stdout);
 812
 813         return r;
 814 }
 815
 816 enum {
 817         CONFIRM_PRETEND_FAILURE = -1,
 818         CONFIRM_PRETEND_SUCCESS =  0,
 819         CONFIRM_EXECUTE = 1,
 820 };
 821
 822 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 823         int saved_stdout = -1, saved_stdin = -1, r;
 824         _cleanup_free_ char *e = NULL;
 825         char c;
 826
 827         /* For any internal errors, assume a positive response. */
 828         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 829         if (r < 0) {
 830                 write_confirm_error(r, vc, u);
 831                 return CONFIRM_EXECUTE;
 832         }
 833
 834         /* confirm_spawn might have been disabled while we were sleeping. */
 835         if (manager_is_confirm_spawn_disabled(u->manager)) {
 836                 r = 1;
 837                 goto restore_stdio;
 838         }
 839
 840         e = ellipsize(cmdline, 60, 100);
 841         if (!e) {
 842                 log_oom();
 843                 r = CONFIRM_EXECUTE;
 844                 goto restore_stdio;
 845         }
 846
 847         for (;;) {
 848                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 849                 if (r < 0) {
 850                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 851                         r = CONFIRM_EXECUTE;
 852                         goto restore_stdio;
 853                 }
 854
 855                 switch (c) {
 856                 case 'c':
 857                         printf("Resuming normal execution.\n");
 858                         manager_disable_confirm_spawn();
 859                         r = 1;
 860                         break;
 861                 case 'D':
 862                         unit_dump(u, stdout, "  ");
 863                         continue; /* ask again */
 864                 case 'f':
 865                         printf("Failing execution.\n");
 866                         r = CONFIRM_PRETEND_FAILURE;
 867                         break;
 868                 case 'h':
 869                         printf("  c - continue, proceed without asking anymore\n"
 870                                "  D - dump, show the state of the unit\n"
 871                                "  f - fail, don't execute the command and pretend it failed\n"
 872                                "  h - help\n"
 873                                "  i - info, show a short summary of the unit\n"
 874                                "  j - jobs, show jobs that are in progress\n"
 875                                "  s - skip, don't execute the command and pretend it succeeded\n"
 876                                "  y - yes, execute the command\n");
 877                         continue; /* ask again */
 878                 case 'i':
 879                         printf("  Description: %s\n"
 880                                "  Unit:        %s\n"
 881                                "  Command:     %s\n",
 882                                u->id, u->description, cmdline);
 883                         continue; /* ask again */
 884                 case 'j':
 885                         manager_dump_jobs(u->manager, stdout, "  ");
 886                         continue; /* ask again */
 887                 case 'n':
 888                         /* 'n' was removed in favor of 'f'. */
 889                         printf("Didn't understand 'n', did you mean 'f'?\n");
 890                         continue; /* ask again */
 891                 case 's':
 892                         printf("Skipping execution.\n");
 893                         r = CONFIRM_PRETEND_SUCCESS;
 894                         break;
 895                 case 'y':
 896                         r = CONFIRM_EXECUTE;
 897                         break;
 898                 default:
 899                         assert_not_reached("Unhandled choice");
 900                 }
 901                 break;
 902         }
 903
 904 restore_stdio:
 905         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 906         return r;
 907 }
 908
 909 static int get_fixed_user(const ExecContext *c, const char **user,
 910                           uid_t *uid, gid_t *gid,
 911                           const char **home, const char **shell) {
 912         int r;
 913         const char *name;
 914
 915         assert(c);
 916
 917         if (!c->user)
 918                 return 0;
 919
 920         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 921          * (i.e. are "/" or "/bin/nologin"). */
 922
 923         name = c->user;
 924         r = get_user_creds_clean(&name, uid, gid, home, shell);
 925         if (r < 0)
 926                 return r;
 927
 928         *user = name;
 929         return 0;
 930 }
 931
 932 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 933         int r;
 934         const char *name;
 935
 936         assert(c);
 937
 938         if (!c->group)
 939                 return 0;
 940
 941         name = c->group;
 942         r = get_group_creds(&name, gid);
 943         if (r < 0)
 944                 return r;
 945
 946         *group = name;
 947         return 0;
 948 }
 949
 950 static int get_supplementary_groups(const ExecContext *c, const char *user,
 951                                     const char *group, gid_t gid,
 952                                     gid_t **supplementary_gids, int *ngids) {
 953         char **i;
 954         int r, k = 0;
 955         int ngroups_max;
 956         bool keep_groups = false;
 957         gid_t *groups = NULL;
 958         _cleanup_free_ gid_t *l_gids = NULL;
 959
 960         assert(c);
 961
 962         /*
 963          * If user is given, then lookup GID and supplementary groups list.
 964          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 965          * here and as early as possible so we keep the list of supplementary
 966          * groups of the caller.
 967          */
 968         if (user && gid_is_valid(gid) && gid != 0) {
 969                 /* First step, initialize groups from /etc/groups */
 970                 if (initgroups(user, gid) < 0)
 971                         return -errno;
 972
 973                 keep_groups = true;
 974         }
 975
 976         if (strv_isempty(c->supplementary_groups))
 977                 return 0;
 978
 979         /*
 980          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 981          * be positive, otherwise fail.
 982          */
 983         errno = 0;
 984         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 985         if (ngroups_max <= 0) {
 986                 if (errno > 0)
 987                         return -errno;
 988                 else
 989                         return -EOPNOTSUPP; /* For all other values */
 990         }
 991
 992         l_gids = new(gid_t, ngroups_max);
 993         if (!l_gids)
 994                 return -ENOMEM;
 995
 996         if (keep_groups) {
 997                 /*
 998                  * Lookup the list of groups that the user belongs to, we
 999                  * avoid NSS lookups here too for gid=0.
1000                  */
1001                 k = ngroups_max;
1002                 if (getgrouplist(user, gid, l_gids, &k) < 0)
1003                         return -EINVAL;
1004         } else
1005                 k = 0;
1006
1007         STRV_FOREACH(i, c->supplementary_groups) {
1008                 const char *g;
1009
1010                 if (k >= ngroups_max)
1011                         return -E2BIG;
1012
1013                 g = *i;
1014                 r = get_group_creds(&g, l_gids+k);
1015                 if (r < 0)
1016                         return r;
1017
1018                 k++;
1019         }
1020
1021         /*
1022          * Sets ngids to zero to drop all supplementary groups, happens
1023          * when we are under root and SupplementaryGroups= is empty.
1024          */
1025         if (k == 0) {
1026                 *ngids = 0;
1027                 return 0;
1028         }
1029
1030         /* Otherwise get the final list of supplementary groups */
1031         groups = memdup(l_gids, sizeof(gid_t) * k);
1032         if (!groups)
1033                 return -ENOMEM;
1034
1035         *supplementary_gids = groups;
1036         *ngids = k;
1037
1038         groups = NULL;
1039
1040         return 0;
1041 }
1042
/* Applies the group identity to the calling process.
 *
 * If ngids is positive, the supplementary group list is installed through maybe_setgroups(). Afterwards, if gid is
 * valid, the real, effective and saved GID are all set to it. Returns 0 on success, the error of
 * maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if there is a non-empty list to install */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* No primary group to switch to? Then we are done. */
        if (!gid_is_valid(gid))
                return 0;

        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1061
1062 static int enforce_user(const ExecContext *context, uid_t uid) {
1063         assert(context);
1064
1065         if (!uid_is_valid(uid))
1066                 return 0;
1067
1068         /* Sets (but doesn't look up) the uid and make sure we keep the
1069          * capabilities while doing so. */
1070
1071         if (context->capability_ambient_set != 0) {
1072
1073                 /* First step: If we need to keep capabilities but
1074                  * drop privileges we need to make sure we keep our
1075                  * caps, while we drop privileges. */
1076                 if (uid != 0) {
1077                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1078
1079                         if (prctl(PR_GET_SECUREBITS) != sb)
1080                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1081                                         return -errno;
1082                 }
1083         }
1084
1085         /* Second step: actually set the uids */
1086         if (setresuid(uid, uid, uid) < 0)
1087                 return -errno;
1088
1089         /* At this point we should have all necessary capabilities but
1090            are otherwise a normal user. However, the caps might got
1091            corrupted due to the setresuid() so we need clean them up
1092            later. This is done outside of this call. */
1093
1094         return 0;
1095 }
1096
1097 #if HAVE_PAM
1098
1099 static int null_conv(
1100                 int num_msg,
1101                 const struct pam_message **msg,
1102                 struct pam_response **resp,
1103                 void *appdata_ptr) {
1104
1105         /* We don't support conversations */
1106
1107         return PAM_CONV_ERR;
1108 }
1109
1110 #endif
1111
/* Opens a PAM session for the service about to be executed and forks off a helper process that closes it again.
 *
 *   name    PAM service name, passed to pam_start()
 *   user    user name, passed to pam_start()
 *   uid/gid credentials the forked-off helper process drops privileges to
 *   tty     if non-NULL, set as PAM_TTY item
 *   env     in/out: every entry is exported into the PAM environment; on success the list is replaced by the
 *           environment reported by pam_getenvlist()
 *   fds     file descriptors (n_fds entries) that are closed in the helper process
 *
 * On success returns the result of strv_free_and_replace() (presumably 0, confirm against its definition). PAM
 * failures are logged and reported as -EPERM, other failures as the negative errno-style value of the failing
 * call. When built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we have collected so far to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* Note that sigwait() does not follow the usual "-1 plus errno" convention:
                                 * it returns 0 on success and a positive error number on failure. Hence
                                 * check the return value directly, so that we never look at 'sig' unless
                                 * it was actually filled in. */
                                k = sigwait(&ss, &sig);
                                if (k == 0)
                                        break;
                                if (k != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* We get here either with a PAM error in pam_code, or with a negative errno-style error in r */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1317
/* Renames the current process after the file name of the specified path, enclosed in parentheses.
 *
 * Only the last 8 characters of the file name are used, so that the result is at most 10 characters long (i.e.
 * the length of "/sbin/init") and looks pretty in /bin/ps. If the path has no usable file name component the
 * process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *tail;
        size_t n;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        /* Prefer the end of the name over its beginning: the first bit is quite often just a common prefix
         * such as "systemd-", and hence less interesting. */
        n = strlen(tail);
        if (n > 8) {
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1348
1349 static bool context_has_address_families(const ExecContext *c) {
1350         assert(c);
1351
1352         return c->address_families_whitelist ||
1353                 !set_isempty(c->address_families);
1354 }
1355
1356 static bool context_has_syscall_filters(const ExecContext *c) {
1357         assert(c);
1358
1359         return c->syscall_whitelist ||
1360                 !hashmap_isempty(c->syscall_filter);
1361 }
1362
1363 static bool context_has_no_new_privileges(const ExecContext *c) {
1364         assert(c);
1365
1366         if (c->no_new_privileges)
1367                 return true;
1368
1369         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1370                 return false;
1371
1372         /* We need NNP if we have any form of seccomp and are unprivileged */
1373         return context_has_address_families(c) ||
1374                 c->memory_deny_write_execute ||
1375                 c->restrict_realtime ||
1376                 exec_context_restrict_namespaces_set(c) ||
1377                 c->protect_kernel_tunables ||
1378                 c->protect_kernel_modules ||
1379                 c->private_devices ||
1380                 context_has_syscall_filters(c) ||
1381                 !set_isempty(c->syscall_archs) ||
1382                 c->lock_personality;
1383 }
1384
1385 #if HAVE_SECCOMP
1386
1387 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1388
1389         if (is_seccomp_available())
1390                 return false;
1391
1392         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1393         return true;
1394 }
1395
1396 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1397         uint32_t negative_action, default_action, action;
1398         int r;
1399
1400         assert(u);
1401         assert(c);
1402
1403         if (!context_has_syscall_filters(c))
1404                 return 0;
1405
1406         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1407                 return 0;
1408
1409         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1410
1411         if (c->syscall_whitelist) {
1412                 default_action = negative_action;
1413                 action = SCMP_ACT_ALLOW;
1414         } else {
1415                 default_action = SCMP_ACT_ALLOW;
1416                 action = negative_action;
1417         }
1418
1419         if (needs_ambient_hack) {
1420                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1421                 if (r < 0)
1422                         return r;
1423         }
1424
1425         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1426 }
1427
1428 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1429         assert(u);
1430         assert(c);
1431
1432         if (set_isempty(c->syscall_archs))
1433                 return 0;
1434
1435         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1436                 return 0;
1437
1438         return seccomp_restrict_archs(c->syscall_archs);
1439 }
1440
1441 static int apply_address_families(const Unit* u, const ExecContext *c) {
1442         assert(u);
1443         assert(c);
1444
1445         if (!context_has_address_families(c))
1446                 return 0;
1447
1448         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1449                 return 0;
1450
1451         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1452 }
1453
1454 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1455         assert(u);
1456         assert(c);
1457
1458         if (!c->memory_deny_write_execute)
1459                 return 0;
1460
1461         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1462                 return 0;
1463
1464         return seccomp_memory_deny_write_execute();
1465 }
1466
1467 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1468         assert(u);
1469         assert(c);
1470
1471         if (!c->restrict_realtime)
1472                 return 0;
1473
1474         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1475                 return 0;
1476
1477         return seccomp_restrict_realtime();
1478 }
1479
1480 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1481         assert(u);
1482         assert(c);
1483
1484         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1485          * let's protect even those systems where this is left on in the kernel. */
1486
1487         if (!c->protect_kernel_tunables)
1488                 return 0;
1489
1490         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1491                 return 0;
1492
1493         return seccomp_protect_sysctl();
1494 }
1495
1496 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1497         assert(u);
1498         assert(c);
1499
1500         /* Turn off module syscalls on ProtectKernelModules=yes */
1501
1502         if (!c->protect_kernel_modules)
1503                 return 0;
1504
1505         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1506                 return 0;
1507
1508         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1509 }
1510
1511 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1512         assert(u);
1513         assert(c);
1514
1515         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1516
1517         if (!c->private_devices)
1518                 return 0;
1519
1520         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1521                 return 0;
1522
1523         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1524 }
1525
1526 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1527         assert(u);
1528         assert(c);
1529
1530         if (!exec_context_restrict_namespaces_set(c))
1531                 return 0;
1532
1533         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1534                 return 0;
1535
1536         return seccomp_restrict_namespaces(c->restrict_namespaces);
1537 }
1538
1539 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1540         unsigned long personality;
1541         int r;
1542
1543         assert(u);
1544         assert(c);
1545
1546         if (!c->lock_personality)
1547                 return 0;
1548
1549         if (skip_seccomp_unavailable(u, "LockPersonality="))
1550                 return 0;
1551
1552         personality = c->personality;
1553
1554         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1555         if (personality == PERSONALITY_INVALID) {
1556
1557                 r = opinionated_personality(&personality);
1558                 if (r < 0)
1559                         return r;
1560         }
1561
1562         return seccomp_lock_personality(personality);
1563 }
1564
1565 #endif
1566
1567 static void do_idle_pipe_dance(int idle_pipe[4]) {
1568         assert(idle_pipe);
1569
1570         idle_pipe[1] = safe_close(idle_pipe[1]);
1571         idle_pipe[2] = safe_close(idle_pipe[2]);
1572
1573         if (idle_pipe[0] >= 0) {
1574                 int r;
1575
1576                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1577
1578                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1579                         ssize_t n;
1580
1581                         /* Signal systemd that we are bored and want to continue. */
1582                         n = write(idle_pipe[3], "x", 1);
1583                         if (n > 0)
1584                                 /* Wait for systemd to react to the signal above. */
1585                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1586                 }
1587
1588                 idle_pipe[0] = safe_close(idle_pipe[0]);
1589
1590         }
1591
1592         idle_pipe[3] = safe_close(idle_pipe[3]);
1593 }
1594
1595 static int build_environment(
1596                 const Unit *u,
1597                 const ExecContext *c,
1598                 const ExecParameters *p,
1599                 size_t n_fds,
1600                 const char *home,
1601                 const char *username,
1602                 const char *shell,
1603                 dev_t journal_stream_dev,
1604                 ino_t journal_stream_ino,
1605                 char ***ret) {
1606
1607         _cleanup_strv_free_ char **our_env = NULL;
1608         size_t n_env = 0;
1609         char *x;
1610
1611         assert(u);
1612         assert(c);
1613         assert(ret);
1614
1615         our_env = new0(char*, 14);
1616         if (!our_env)
1617                 return -ENOMEM;
1618
1619         if (n_fds > 0) {
1620                 _cleanup_free_ char *joined = NULL;
1621
1622                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1623                         return -ENOMEM;
1624                 our_env[n_env++] = x;
1625
1626                 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1627                         return -ENOMEM;
1628                 our_env[n_env++] = x;
1629
1630                 joined = strv_join(p->fd_names, ":");
1631                 if (!joined)
1632                         return -ENOMEM;
1633
1634                 x = strjoin("LISTEN_FDNAMES=", joined);
1635                 if (!x)
1636                         return -ENOMEM;
1637                 our_env[n_env++] = x;
1638         }
1639
1640         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1641                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1642                         return -ENOMEM;
1643                 our_env[n_env++] = x;
1644
1645                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1646                         return -ENOMEM;
1647                 our_env[n_env++] = x;
1648         }
1649
1650         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1651          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1652          * check the database directly. */
1653         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1654                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1655                 if (!x)
1656                         return -ENOMEM;
1657                 our_env[n_env++] = x;
1658         }
1659
1660         if (home) {
1661                 x = strappend("HOME=", home);
1662                 if (!x)
1663                         return -ENOMEM;
1664                 our_env[n_env++] = x;
1665         }
1666
1667         if (username) {
1668                 x = strappend("LOGNAME=", username);
1669                 if (!x)
1670                         return -ENOMEM;
1671                 our_env[n_env++] = x;
1672
1673                 x = strappend("USER=", username);
1674                 if (!x)
1675                         return -ENOMEM;
1676                 our_env[n_env++] = x;
1677         }
1678
1679         if (shell) {
1680                 x = strappend("SHELL=", shell);
1681                 if (!x)
1682                         return -ENOMEM;
1683                 our_env[n_env++] = x;
1684         }
1685
1686         if (!sd_id128_is_null(u->invocation_id)) {
1687                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1688                         return -ENOMEM;
1689
1690                 our_env[n_env++] = x;
1691         }
1692
1693         if (exec_context_needs_term(c)) {
1694                 const char *tty_path, *term = NULL;
1695
1696                 tty_path = exec_context_tty_path(c);
1697
1698                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1699                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1700                  * passes to PID 1 ends up all the way in the console login shown. */
1701
1702                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1703                         term = getenv("TERM");
1704                 if (!term)
1705                         term = default_term_for_tty(tty_path);
1706
1707                 x = strappend("TERM=", term);
1708                 if (!x)
1709                         return -ENOMEM;
1710                 our_env[n_env++] = x;
1711         }
1712
1713         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1714                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1715                         return -ENOMEM;
1716
1717                 our_env[n_env++] = x;
1718         }
1719
1720         our_env[n_env++] = NULL;
1721         assert(n_env <= 12);
1722
1723         *ret = TAKE_PTR(our_env);
1724
1725         return 0;
1726 }
1727
1728 static int build_pass_environment(const ExecContext *c, char ***ret) {
1729         _cleanup_strv_free_ char **pass_env = NULL;
1730         size_t n_env = 0, n_bufsize = 0;
1731         char **i;
1732
1733         STRV_FOREACH(i, c->pass_environment) {
1734                 _cleanup_free_ char *x = NULL;
1735                 char *v;
1736
1737                 v = getenv(*i);
1738                 if (!v)
1739                         continue;
1740                 x = strjoin(*i, "=", v);
1741                 if (!x)
1742                         return -ENOMEM;
1743
1744                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1745                         return -ENOMEM;
1746
1747                 pass_env[n_env++] = TAKE_PTR(x);
1748                 pass_env[n_env] = NULL;
1749         }
1750
1751         *ret = TAKE_PTR(pass_env);
1752
1753         return 0;
1754 }
1755
1756 static bool exec_needs_mount_namespace(
1757                 const ExecContext *context,
1758                 const ExecParameters *params,
1759                 const ExecRuntime *runtime) {
1760
1761         assert(context);
1762         assert(params);
1763
1764         if (context->root_image)
1765                 return true;
1766
1767         if (!strv_isempty(context->read_write_paths) ||
1768             !strv_isempty(context->read_only_paths) ||
1769             !strv_isempty(context->inaccessible_paths))
1770                 return true;
1771
1772         if (context->n_bind_mounts > 0)
1773                 return true;
1774
1775         if (context->n_temporary_filesystems > 0)
1776                 return true;
1777
1778         if (context->mount_flags != 0)
1779                 return true;
1780
1781         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1782                 return true;
1783
1784         if (context->private_devices ||
1785             context->private_mounts ||
1786             context->protect_system != PROTECT_SYSTEM_NO ||
1787             context->protect_home != PROTECT_HOME_NO ||
1788             context->protect_kernel_tunables ||
1789             context->protect_kernel_modules ||
1790             context->protect_control_groups)
1791                 return true;
1792
1793         if (context->root_directory) {
1794                 ExecDirectoryType t;
1795
1796                 if (context->mount_apivfs)
1797                         return true;
1798
1799                 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1800                         if (!params->prefix[t])
1801                                 continue;
1802
1803                         if (!strv_isempty(context->directories[t].paths))
1804                                 return true;
1805                 }
1806         }
1807
1808         if (context->dynamic_user &&
1809             (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1810              !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1811              !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1812                 return true;
1813
1814         return false;
1815 }
1816
1817 static int setup_private_users(uid_t uid, gid_t gid) {
1818         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1819         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1820         _cleanup_close_ int unshare_ready_fd = -1;
1821         _cleanup_(sigkill_waitp) pid_t pid = 0;
1822         uint64_t c = 1;
1823         ssize_t n;
1824         int r;
1825
1826         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1827          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1828          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1829          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1830          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1831          * continues execution normally. */
1832
1833         if (uid != 0 && uid_is_valid(uid)) {
1834                 r = asprintf(&uid_map,
1835                              "0 0 1\n"                      /* Map root → root */
1836                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1837                              uid, uid);
1838                 if (r < 0)
1839                         return -ENOMEM;
1840         } else {
1841                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1842                 if (!uid_map)
1843                         return -ENOMEM;
1844         }
1845
1846         if (gid != 0 && gid_is_valid(gid)) {
1847                 r = asprintf(&gid_map,
1848                              "0 0 1\n"                      /* Map root → root */
1849                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1850                              gid, gid);
1851                 if (r < 0)
1852                         return -ENOMEM;
1853         } else {
1854                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1855                 if (!gid_map)
1856                         return -ENOMEM;
1857         }
1858
1859         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1860          * namespace. */
1861         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1862         if (unshare_ready_fd < 0)
1863                 return -errno;
1864
1865         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1866          * failed. */
1867         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1868                 return -errno;
1869
1870         r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1871         if (r < 0)
1872                 return r;
1873         if (r == 0) {
1874                 _cleanup_close_ int fd = -1;
1875                 const char *a;
1876                 pid_t ppid;
1877
1878                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1879                  * here, after the parent opened its own user namespace. */
1880
1881                 ppid = getppid();
1882                 errno_pipe[0] = safe_close(errno_pipe[0]);
1883
1884                 /* Wait until the parent unshared the user namespace */
1885                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1886                         r = -errno;
1887                         goto child_fail;
1888                 }
1889
1890                 /* Disable the setgroups() system call in the child user namespace, for good. */
1891                 a = procfs_file_alloca(ppid, "setgroups");
1892                 fd = open(a, O_WRONLY|O_CLOEXEC);
1893                 if (fd < 0) {
1894                         if (errno != ENOENT) {
1895                                 r = -errno;
1896                                 goto child_fail;
1897                         }
1898
1899                         /* If the file is missing the kernel is too old, let's continue anyway. */
1900                 } else {
1901                         if (write(fd, "deny\n", 5) < 0) {
1902                                 r = -errno;
1903                                 goto child_fail;
1904                         }
1905
1906                         fd = safe_close(fd);
1907                 }
1908
1909                 /* First write the GID map */
1910                 a = procfs_file_alloca(ppid, "gid_map");
1911                 fd = open(a, O_WRONLY|O_CLOEXEC);
1912                 if (fd < 0) {
1913                         r = -errno;
1914                         goto child_fail;
1915                 }
1916                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1917                         r = -errno;
1918                         goto child_fail;
1919                 }
1920                 fd = safe_close(fd);
1921
1922                 /* The write the UID map */
1923                 a = procfs_file_alloca(ppid, "uid_map");
1924                 fd = open(a, O_WRONLY|O_CLOEXEC);
1925                 if (fd < 0) {
1926                         r = -errno;
1927                         goto child_fail;
1928                 }
1929                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1930                         r = -errno;
1931                         goto child_fail;
1932                 }
1933
1934                 _exit(EXIT_SUCCESS);
1935
1936         child_fail:
1937                 (void) write(errno_pipe[1], &r, sizeof(r));
1938                 _exit(EXIT_FAILURE);
1939         }
1940
1941         errno_pipe[1] = safe_close(errno_pipe[1]);
1942
1943         if (unshare(CLONE_NEWUSER) < 0)
1944                 return -errno;
1945
1946         /* Let the child know that the namespace is ready now */
1947         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1948                 return -errno;
1949
1950         /* Try to read an error code from the child */
1951         n = read(errno_pipe[0], &r, sizeof(r));
1952         if (n < 0)
1953                 return -errno;
1954         if (n == sizeof(r)) { /* an error code was sent to us */
1955                 if (r < 0)
1956                         return r;
1957                 return -EIO;
1958         }
1959         if (n != 0) /* on success we should have read 0 bytes */
1960                 return -EIO;
1961
1962         r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1963         pid = 0;
1964         if (r < 0)
1965                 return r;
1966         if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
1967                 return -EIO;
1968
1969         return 0;
1970 }
1971
1972 static int setup_exec_directory(
1973                 const ExecContext *context,
1974                 const ExecParameters *params,
1975                 uid_t uid,
1976                 gid_t gid,
1977                 ExecDirectoryType type,
1978                 int *exit_status) {
1979
1980         static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1981                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1982                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1983                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1984                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1985                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1986         };
1987         char **rt;
1988         int r;
1989
1990         assert(context);
1991         assert(params);
1992         assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
1993         assert(exit_status);
1994
1995         if (!params->prefix[type])
1996                 return 0;
1997
1998         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1999                 if (!uid_is_valid(uid))
2000                         uid = 0;
2001                 if (!gid_is_valid(gid))
2002                         gid = 0;
2003         }
2004
2005         STRV_FOREACH(rt, context->directories[type].paths) {
2006                 _cleanup_free_ char *p = NULL, *pp = NULL;
2007
2008                 p = strjoin(params->prefix[type], "/", *rt);
2009                 if (!p) {
2010                         r = -ENOMEM;
2011                         goto fail;
2012                 }
2013
2014                 r = mkdir_parents_label(p, 0755);
2015                 if (r < 0)
2016                         goto fail;
2017
2018                 if (context->dynamic_user &&
2019                     !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2020                         _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2021
2022                         /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2023                          * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2024                          * whose UID is later on reused. To lock this down we use the same trick used by container
2025                          * managers to prohibit host users to get access to files of the same UID in containers: we
2026                          * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2027                          * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2028                          * to make this directory permeable for the service itself.
2029                          *
2030                          * Specifically: for a service which wants a special directory "foo/" we first create a
2031                          * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2032                          * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2033                          * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2034                          * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2035                          * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2036                          * disabling the access boundary for the service and making sure it only gets access to the
2037                          * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2038                          *
2039                          * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2040                          * owned by the service itself.
2041                          * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2042                          * files or sockets with other services. */
2043
2044                         private_root = strjoin(params->prefix[type], "/private");
2045                         if (!private_root) {
2046                                 r = -ENOMEM;
2047                                 goto fail;
2048                         }
2049
2050                         /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2051                         r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
2052                         if (r < 0)
2053                                 goto fail;
2054
2055                         pp = strjoin(private_root, "/", *rt);
2056                         if (!pp) {
2057                                 r = -ENOMEM;
2058                                 goto fail;
2059                         }
2060
2061                         /* Create all directories between the configured directory and this private root, and mark them 0755 */
2062                         r = mkdir_parents_label(pp, 0755);
2063                         if (r < 0)
2064                                 goto fail;
2065
2066                         if (is_dir(p, false) > 0 &&
2067                             (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2068
2069                                 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2070                                  * it over. Most likely the service has been upgraded from one that didn't use
2071                                  * DynamicUser=1, to one that does. */
2072
2073                                 if (rename(p, pp) < 0) {
2074                                         r = -errno;
2075                                         goto fail;
2076                                 }
2077                         } else {
2078                                 /* Otherwise, create the actual directory for the service */
2079
2080                                 r = mkdir_label(pp, context->directories[type].mode);
2081                                 if (r < 0 && r != -EEXIST)
2082                                         goto fail;
2083                         }
2084
2085                         parent = dirname_malloc(p);
2086                         if (!parent) {
2087                                 r = -ENOMEM;
2088                                 goto fail;
2089                         }
2090
2091                         r = path_make_relative(parent, pp, &relative);
2092                         if (r < 0)
2093                                 goto fail;
2094
2095                         /* And link it up from the original place */
2096                         r = symlink_idempotent(relative, p);
2097                         if (r < 0)
2098                                 goto fail;
2099
2100                         /* Lock down the access mode */
2101                         if (chmod(pp, context->directories[type].mode) < 0) {
2102                                 r = -errno;
2103                                 goto fail;
2104                         }
2105                 } else {
2106                         r = mkdir_label(p, context->directories[type].mode);
2107                         if (r < 0 && r != -EEXIST)
2108                                 goto fail;
2109                         if (r == -EEXIST && !context->dynamic_user)
2110                                 continue;
2111                 }
2112
2113                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2114                  * a service, and shall not be writable. */
2115                 if (type == EXEC_DIRECTORY_CONFIGURATION)
2116                         continue;
2117
2118                 /* Then, change the ownership of the whole tree, if necessary */
2119                 r = path_chown_recursive(pp ?: p, uid, gid);
2120                 if (r < 0)
2121                         goto fail;
2122         }
2123
2124         return 0;
2125
2126 fail:
2127         *exit_status = exit_status_table[type];
2128         return r;
2129 }
2130
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()). If the context
 * carries an explicit process label, that one is applied. Otherwise, and only if the build defines
 * SMACK_DEFAULT_PROCESS_LABEL, the exec label read from the command's binary is applied, falling back to
 * that default. Returns 0 on success, a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* -ENODATA and -EOPNOTSUPP are tolerated here; exec_label then stays NULL */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2163
2164 static int compile_bind_mounts(
2165                 const ExecContext *context,
2166                 const ExecParameters *params,
2167                 BindMount **ret_bind_mounts,
2168                 size_t *ret_n_bind_mounts,
2169                 char ***ret_empty_directories) {
2170
2171         _cleanup_strv_free_ char **empty_directories = NULL;
2172         BindMount *bind_mounts;
2173         size_t n, h = 0, i;
2174         ExecDirectoryType t;
2175         int r;
2176
2177         assert(context);
2178         assert(params);
2179         assert(ret_bind_mounts);
2180         assert(ret_n_bind_mounts);
2181         assert(ret_empty_directories);
2182
2183         n = context->n_bind_mounts;
2184         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2185                 if (!params->prefix[t])
2186                         continue;
2187
2188                 n += strv_length(context->directories[t].paths);
2189         }
2190
2191         if (n <= 0) {
2192                 *ret_bind_mounts = NULL;
2193                 *ret_n_bind_mounts = 0;
2194                 *ret_empty_directories = NULL;
2195                 return 0;
2196         }
2197
2198         bind_mounts = new(BindMount, n);
2199         if (!bind_mounts)
2200                 return -ENOMEM;
2201
2202         for (i = 0; i < context->n_bind_mounts; i++) {
2203                 BindMount *item = context->bind_mounts + i;
2204                 char *s, *d;
2205
2206                 s = strdup(item->source);
2207                 if (!s) {
2208                         r = -ENOMEM;
2209                         goto finish;
2210                 }
2211
2212                 d = strdup(item->destination);
2213                 if (!d) {
2214                         free(s);
2215                         r = -ENOMEM;
2216                         goto finish;
2217                 }
2218
2219                 bind_mounts[h++] = (BindMount) {
2220                         .source = s,
2221                         .destination = d,
2222                         .read_only = item->read_only,
2223                         .recursive = item->recursive,
2224                         .ignore_enoent = item->ignore_enoent,
2225                 };
2226         }
2227
2228         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2229                 char **suffix;
2230
2231                 if (!params->prefix[t])
2232                         continue;
2233
2234                 if (strv_isempty(context->directories[t].paths))
2235                         continue;
2236
2237                 if (context->dynamic_user &&
2238                     !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2239                     !(context->root_directory || context->root_image)) {
2240                         char *private_root;
2241
2242                         /* So this is for a dynamic user, and we need to make sure the process can access its own
2243                          * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2244                          * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2245
2246                         private_root = strjoin(params->prefix[t], "/private");
2247                         if (!private_root) {
2248                                 r = -ENOMEM;
2249                                 goto finish;
2250                         }
2251
2252                         r = strv_consume(&empty_directories, private_root);
2253                         if (r < 0)
2254                                 goto finish;
2255                 }
2256
2257                 STRV_FOREACH(suffix, context->directories[t].paths) {
2258                         char *s, *d;
2259
2260                         if (context->dynamic_user &&
2261                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2262                                 s = strjoin(params->prefix[t], "/private/", *suffix);
2263                         else
2264                                 s = strjoin(params->prefix[t], "/", *suffix);
2265                         if (!s) {
2266                                 r = -ENOMEM;
2267                                 goto finish;
2268                         }
2269
2270                         if (context->dynamic_user &&
2271                             !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION) &&
2272                             (context->root_directory || context->root_image))
2273                                 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2274                                  * directory is not created on the root directory. So, let's bind-mount the directory
2275                                  * on the 'non-private' place. */
2276                                 d = strjoin(params->prefix[t], "/", *suffix);
2277                         else
2278                                 d = strdup(s);
2279                         if (!d) {
2280                                 free(s);
2281                                 r = -ENOMEM;
2282                                 goto finish;
2283                         }
2284
2285                         bind_mounts[h++] = (BindMount) {
2286                                 .source = s,
2287                                 .destination = d,
2288                                 .read_only = false,
2289                                 .recursive = true,
2290                                 .ignore_enoent = false,
2291                         };
2292                 }
2293         }
2294
2295         assert(h == n);
2296
2297         *ret_bind_mounts = bind_mounts;
2298         *ret_n_bind_mounts = n;
2299         *ret_empty_directories = TAKE_PTR(empty_directories);
2300
2301         return (int) n;
2302
2303 finish:
2304         bind_mount_free_many(bind_mounts, h);
2305         return r;
2306 }
2307
2308 static int apply_mount_namespace(
2309                 const Unit *u,
2310                 const ExecCommand *command,
2311                 const ExecContext *context,
2312                 const ExecParameters *params,
2313                 const ExecRuntime *runtime) {
2314
2315         _cleanup_strv_free_ char **empty_directories = NULL;
2316         char *tmp = NULL, *var = NULL;
2317         const char *root_dir = NULL, *root_image = NULL;
2318         NamespaceInfo ns_info;
2319         bool needs_sandboxing;
2320         BindMount *bind_mounts = NULL;
2321         size_t n_bind_mounts = 0;
2322         int r;
2323
2324         assert(context);
2325
2326         /* The runtime struct only contains the parent of the private /tmp,
2327          * which is non-accessible to world users. Inside of it there's a /tmp
2328          * that is sticky, and that's the one we want to use here. */
2329
2330         if (context->private_tmp && runtime) {
2331                 if (runtime->tmp_dir)
2332                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2333                 if (runtime->var_tmp_dir)
2334                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2335         }
2336
2337         if (params->flags & EXEC_APPLY_CHROOT) {
2338                 root_image = context->root_image;
2339
2340                 if (!root_image)
2341                         root_dir = context->root_directory;
2342         }
2343
2344         r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2345         if (r < 0)
2346                 return r;
2347
2348         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2349         if (needs_sandboxing)
2350                 ns_info = (NamespaceInfo) {
2351                         .ignore_protect_paths = false,
2352                         .private_dev = context->private_devices,
2353                         .protect_control_groups = context->protect_control_groups,
2354                         .protect_kernel_tunables = context->protect_kernel_tunables,
2355                         .protect_kernel_modules = context->protect_kernel_modules,
2356                         .mount_apivfs = context->mount_apivfs,
2357                         .private_mounts = context->private_mounts,
2358                 };
2359         else if (!context->dynamic_user && root_dir)
2360                 /*
2361                  * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2362                  * sandbox info, otherwise enforce it, don't ignore protected paths and
2363                  * fail if we are enable to apply the sandbox inside the mount namespace.
2364                  */
2365                 ns_info = (NamespaceInfo) {
2366                         .ignore_protect_paths = true,
2367                 };
2368         else
2369                 ns_info = (NamespaceInfo) {};
2370
2371         r = setup_namespace(root_dir, root_image,
2372                             &ns_info, context->read_write_paths,
2373                             needs_sandboxing ? context->read_only_paths : NULL,
2374                             needs_sandboxing ? context->inaccessible_paths : NULL,
2375                             empty_directories,
2376                             bind_mounts,
2377                             n_bind_mounts,
2378                             context->temporary_filesystems,
2379                             context->n_temporary_filesystems,
2380                             tmp,
2381                             var,
2382                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2383                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2384                             context->mount_flags,
2385                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2386
2387         bind_mount_free_many(bind_mounts, n_bind_mounts);
2388
2389         /* If we couldn't set up the namespace this is probably due to a
2390          * missing capability. In this case, silently proceeed. */
2391         if (IN_SET(r, -EPERM, -EACCES)) {
2392                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2393                 return 0;
2394         }
2395
2396         return r;
2397 }
2398
2399 static int apply_working_directory(
2400                 const ExecContext *context,
2401                 const ExecParameters *params,
2402                 const char *home,
2403                 const bool needs_mount_ns,
2404                 int *exit_status) {
2405
2406         const char *d, *wd;
2407
2408         assert(context);
2409         assert(exit_status);
2410
2411         if (context->working_directory_home) {
2412
2413                 if (!home) {
2414                         *exit_status = EXIT_CHDIR;
2415                         return -ENXIO;
2416                 }
2417
2418                 wd = home;
2419
2420         } else if (context->working_directory)
2421                 wd = context->working_directory;
2422         else
2423                 wd = "/";
2424
2425         if (params->flags & EXEC_APPLY_CHROOT) {
2426                 if (!needs_mount_ns && context->root_directory)
2427                         if (chroot(context->root_directory) < 0) {
2428                                 *exit_status = EXIT_CHROOT;
2429                                 return -errno;
2430                         }
2431
2432                 d = wd;
2433         } else
2434                 d = prefix_roota(context->root_directory, wd);
2435
2436         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2437                 *exit_status = EXIT_CHDIR;
2438                 return -errno;
2439         }
2440
2441         return 0;
2442 }
2443
2444 static int setup_keyring(
2445                 const Unit *u,
2446                 const ExecContext *context,
2447                 const ExecParameters *p,
2448                 uid_t uid, gid_t gid) {
2449
2450         key_serial_t keyring;
2451         int r = 0;
2452         uid_t saved_uid;
2453         gid_t saved_gid;
2454
2455         assert(u);
2456         assert(context);
2457         assert(p);
2458
2459         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2460          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2461          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2462          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2463          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2464          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2465
2466         if (!(p->flags & EXEC_NEW_KEYRING))
2467                 return 0;
2468
2469         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2470                 return 0;
2471
2472         /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2473          * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2474          * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2475          * & group is just as nasty as acquiring a reference to the user keyring. */
2476
2477         saved_uid = getuid();
2478         saved_gid = getgid();
2479
2480         if (gid_is_valid(gid) && gid != saved_gid) {
2481                 if (setregid(gid, -1) < 0)
2482                         return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2483         }
2484
2485         if (uid_is_valid(uid) && uid != saved_uid) {
2486                 if (setreuid(uid, -1) < 0) {
2487                         r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2488                         goto out;
2489                 }
2490         }
2491
2492         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2493         if (keyring == -1) {
2494                 if (errno == ENOSYS)
2495                         log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2496                 else if (IN_SET(errno, EACCES, EPERM))
2497                         log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2498                 else if (errno == EDQUOT)
2499                         log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2500                 else
2501                         r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2502
2503                 goto out;
2504         }
2505
2506         /* When requested link the user keyring into the session keyring. */
2507         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2508
2509                 if (keyctl(KEYCTL_LINK,
2510                            KEY_SPEC_USER_KEYRING,
2511                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2512                         r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2513                         goto out;
2514                 }
2515         }
2516
2517         /* Restore uid/gid back */
2518         if (uid_is_valid(uid) && uid != saved_uid) {
2519                 if (setreuid(saved_uid, -1) < 0) {
2520                         r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2521                         goto out;
2522                 }
2523         }
2524
2525         if (gid_is_valid(gid) && gid != saved_gid) {
2526                 if (setregid(saved_gid, -1) < 0)
2527                         return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2528         }
2529
2530         /* Populate the keyring with the invocation ID by default, as original saved_uid. */
2531         if (!sd_id128_is_null(u->invocation_id)) {
2532                 key_serial_t key;
2533
2534                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2535                 if (key == -1)
2536                         log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2537                 else {
2538                         if (keyctl(KEYCTL_SETPERM, key,
2539                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2540                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2541                                 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2542                 }
2543         }
2544
2545 out:
2546         /* Revert back uid & gid for the last time, and exit */
2547         /* no extra logging, as only the first already reported error matters */
2548         if (getuid() != saved_uid)
2549                 (void) setreuid(saved_uid, -1);
2550
2551         if (getgid() != saved_gid)
2552                 (void) setregid(saved_gid, -1);
2553
2554         return r;
2555 }
2556
/* Appends the valid (i.e. non-negative) ends of a socket pair to 'array', bumping the element counter '*n' once for
 * each fd stored. A NULL 'pair' is silently ignored. The caller has to make sure 'array' has room for two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[2]) {
        size_t k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                /* Unset ends are marked with a negative value, skip those */
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2569
2570 static int close_remaining_fds(
2571                 const ExecParameters *params,
2572                 const ExecRuntime *runtime,
2573                 const DynamicCreds *dcreds,
2574                 int user_lookup_fd,
2575                 int socket_fd,
2576                 int exec_fd,
2577                 int *fds, size_t n_fds) {
2578
2579         size_t n_dont_close = 0;
2580         int dont_close[n_fds + 12];
2581
2582         assert(params);
2583
2584         if (params->stdin_fd >= 0)
2585                 dont_close[n_dont_close++] = params->stdin_fd;
2586         if (params->stdout_fd >= 0)
2587                 dont_close[n_dont_close++] = params->stdout_fd;
2588         if (params->stderr_fd >= 0)
2589                 dont_close[n_dont_close++] = params->stderr_fd;
2590
2591         if (socket_fd >= 0)
2592                 dont_close[n_dont_close++] = socket_fd;
2593         if (exec_fd >= 0)
2594                 dont_close[n_dont_close++] = exec_fd;
2595         if (n_fds > 0) {
2596                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2597                 n_dont_close += n_fds;
2598         }
2599
2600         if (runtime)
2601                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2602
2603         if (dcreds) {
2604                 if (dcreds->user)
2605                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2606                 if (dcreds->group)
2607                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2608         }
2609
2610         if (user_lookup_fd >= 0)
2611                 dont_close[n_dont_close++] = user_lookup_fd;
2612
2613         return close_all_fds(dont_close, n_dont_close);
2614 }
2615
2616 static int send_user_lookup(
2617                 Unit *unit,
2618                 int user_lookup_fd,
2619                 uid_t uid,
2620                 gid_t gid) {
2621
2622         assert(unit);
2623
2624         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2625          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2626          * specified. */
2627
2628         if (user_lookup_fd < 0)
2629                 return 0;
2630
2631         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2632                 return 0;
2633
2634         if (writev(user_lookup_fd,
2635                (struct iovec[]) {
2636                            IOVEC_INIT(&uid, sizeof(uid)),
2637                            IOVEC_INIT(&gid, sizeof(gid)),
2638                            IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2639                 return -errno;
2640
2641         return 0;
2642 }
2643
2644 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2645         int r;
2646
2647         assert(c);
2648         assert(home);
2649         assert(buf);
2650
2651         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2652
2653         if (*home)
2654                 return 0;
2655
2656         if (!c->working_directory_home)
2657                 return 0;
2658
2659         if (uid == 0) {
2660                 /* Hardcode /root as home directory for UID 0 */
2661                 *home = "/root";
2662                 return 1;
2663         }
2664
2665         r = get_home_dir(buf);
2666         if (r < 0)
2667                 return r;
2668
2669         *home = *buf;
2670         return 1;
2671 }
2672
2673 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2674         _cleanup_strv_free_ char ** list = NULL;
2675         ExecDirectoryType t;
2676         int r;
2677
2678         assert(c);
2679         assert(p);
2680         assert(ret);
2681
2682         assert(c->dynamic_user);
2683
2684         /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2685          * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2686          * directories. */
2687
2688         for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2689                 char **i;
2690
2691                 if (t == EXEC_DIRECTORY_CONFIGURATION)
2692                         continue;
2693
2694                 if (!p->prefix[t])
2695                         continue;
2696
2697                 STRV_FOREACH(i, c->directories[t].paths) {
2698                         char *e;
2699
2700                         if (t == EXEC_DIRECTORY_RUNTIME)
2701                                 e = strjoin(p->prefix[t], "/", *i);
2702                         else
2703                                 e = strjoin(p->prefix[t], "/private/", *i);
2704                         if (!e)
2705                                 return -ENOMEM;
2706
2707                         r = strv_consume(&list, e);
2708                         if (r < 0)
2709                                 return r;
2710                 }
2711         }
2712
2713         *ret = TAKE_PTR(list);
2714
2715         return 0;
2716 }
2717
2718 static char *exec_command_line(char **argv);
2719
2720 static int exec_child(
2721                 Unit *unit,
2722                 const ExecCommand *command,
2723                 const ExecContext *context,
2724                 const ExecParameters *params,
2725                 ExecRuntime *runtime,
2726                 DynamicCreds *dcreds,
2727                 int socket_fd,
2728                 int named_iofds[3],
2729                 int *fds,
2730                 size_t n_socket_fds,
2731                 size_t n_storage_fds,
2732                 char **files_env,
2733                 int user_lookup_fd,
2734                 int *exit_status) {
2735
2736         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2737         int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
2738         _cleanup_free_ gid_t *supplementary_gids = NULL;
2739         const char *username = NULL, *groupname = NULL;
2740         _cleanup_free_ char *home_buffer = NULL;
2741         const char *home = NULL, *shell = NULL;
2742         dev_t journal_stream_dev = 0;
2743         ino_t journal_stream_ino = 0;
2744         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2745                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2746                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2747                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2748 #if HAVE_SELINUX
2749         _cleanup_free_ char *mac_selinux_context_net = NULL;
2750         bool use_selinux = false;
2751 #endif
2752 #if ENABLE_SMACK
2753         bool use_smack = false;
2754 #endif
2755 #if HAVE_APPARMOR
2756         bool use_apparmor = false;
2757 #endif
2758         uid_t uid = UID_INVALID;
2759         gid_t gid = GID_INVALID;
2760         size_t n_fds;
2761         ExecDirectoryType dt;
2762         int secure_bits;
2763
2764         assert(unit);
2765         assert(command);
2766         assert(context);
2767         assert(params);
2768         assert(exit_status);
2769
2770         rename_process_from_path(command->path);
2771
2772         /* We reset exactly these signals, since they are the
2773          * only ones we set to SIG_IGN in the main daemon. All
2774          * others we leave untouched because we set them to
2775          * SIG_DFL or a valid handler initially, both of which
2776          * will be demoted to SIG_DFL. */
2777         (void) default_signals(SIGNALS_CRASH_HANDLER,
2778                                SIGNALS_IGNORE, -1);
2779
2780         if (context->ignore_sigpipe)
2781                 (void) ignore_signals(SIGPIPE, -1);
2782
2783         r = reset_signal_mask();
2784         if (r < 0) {
2785                 *exit_status = EXIT_SIGNAL_MASK;
2786                 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2787         }
2788
2789         if (params->idle_pipe)
2790                 do_idle_pipe_dance(params->idle_pipe);
2791
2792         /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2793          * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2794          * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2795          * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2796
2797         log_forget_fds();
2798         log_set_open_when_needed(true);
2799
2800         /* In case anything used libc syslog(), close this here, too */
2801         closelog();
2802
2803         n_fds = n_socket_fds + n_storage_fds;
2804         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
2805         if (r < 0) {
2806                 *exit_status = EXIT_FDS;
2807                 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2808         }
2809
2810         if (!context->same_pgrp)
2811                 if (setsid() < 0) {
2812                         *exit_status = EXIT_SETSID;
2813                         return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2814                 }
2815
2816         exec_context_tty_reset(context, params);
2817
2818         if (unit_shall_confirm_spawn(unit)) {
2819                 const char *vc = params->confirm_spawn;
2820                 _cleanup_free_ char *cmdline = NULL;
2821
2822                 cmdline = exec_command_line(command->argv);
2823                 if (!cmdline) {
2824                         *exit_status = EXIT_MEMORY;
2825                         return log_oom();
2826                 }
2827
2828                 r = ask_for_confirmation(vc, unit, cmdline);
2829                 if (r != CONFIRM_EXECUTE) {
2830                         if (r == CONFIRM_PRETEND_SUCCESS) {
2831                                 *exit_status = EXIT_SUCCESS;
2832                                 return 0;
2833                         }
2834                         *exit_status = EXIT_CONFIRM;
2835                         log_unit_error(unit, "Execution cancelled by the user");
2836                         return -ECANCELED;
2837                 }
2838         }
2839
2840         /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
2841          * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
2842          * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
2843          * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
2844          * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
2845         if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
2846             setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
2847                 *exit_status = EXIT_MEMORY;
2848                 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2849         }
2850
2851         if (context->dynamic_user && dcreds) {
2852                 _cleanup_strv_free_ char **suggested_paths = NULL;
2853
2854                 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
2855                  * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
2856                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2857                         *exit_status = EXIT_USER;
2858                         return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2859                 }
2860
2861                 r = compile_suggested_paths(context, params, &suggested_paths);
2862                 if (r < 0) {
2863                         *exit_status = EXIT_MEMORY;
2864                         return log_oom();
2865                 }
2866
2867                 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2868                 if (r < 0) {
2869                         *exit_status = EXIT_USER;
2870                         if (r == -EILSEQ) {
2871                                 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2872                                 return -EOPNOTSUPP;
2873                         }
2874                         return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2875                 }
2876
2877                 if (!uid_is_valid(uid)) {
2878                         *exit_status = EXIT_USER;
2879                         log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2880                         return -ESRCH;
2881                 }
2882
2883                 if (!gid_is_valid(gid)) {
2884                         *exit_status = EXIT_USER;
2885                         log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2886                         return -ESRCH;
2887                 }
2888
2889                 if (dcreds->user)
2890                         username = dcreds->user->name;
2891
2892         } else {
2893                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2894                 if (r < 0) {
2895                         *exit_status = EXIT_USER;
2896                         return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2897                 }
2898
2899                 r = get_fixed_group(context, &groupname, &gid);
2900                 if (r < 0) {
2901                         *exit_status = EXIT_GROUP;
2902                         return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2903                 }
2904         }
2905
2906         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2907         r = get_supplementary_groups(context, username, groupname, gid,
2908                                      &supplementary_gids, &ngids);
2909         if (r < 0) {
2910                 *exit_status = EXIT_GROUP;
2911                 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2912         }
2913
2914         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2915         if (r < 0) {
2916                 *exit_status = EXIT_USER;
2917                 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2918         }
2919
2920         user_lookup_fd = safe_close(user_lookup_fd);
2921
2922         r = acquire_home(context, uid, &home, &home_buffer);
2923         if (r < 0) {
2924                 *exit_status = EXIT_CHDIR;
2925                 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2926         }
2927
2928         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2929          * must be sure to drop O_NONBLOCK */
2930         if (socket_fd >= 0)
2931                 (void) fd_nonblock(socket_fd, false);
2932
2933         r = setup_input(context, params, socket_fd, named_iofds);
2934         if (r < 0) {
2935                 *exit_status = EXIT_STDIN;
2936                 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2937         }
2938
2939         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2940         if (r < 0) {
2941                 *exit_status = EXIT_STDOUT;
2942                 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2943         }
2944
2945         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2946         if (r < 0) {
2947                 *exit_status = EXIT_STDERR;
2948                 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2949         }
2950
2951         if (params->cgroup_path) {
2952                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2953                 if (r < 0) {
2954                         *exit_status = EXIT_CGROUP;
2955                         return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2956                 }
2957         }
2958
2959         if (context->oom_score_adjust_set) {
2960                 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
2961                  * prohibit write access to this file, and we shouldn't trip up over that. */
2962                 r = set_oom_score_adjust(context->oom_score_adjust);
2963                 if (IN_SET(r, -EPERM, -EACCES))
2964                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2965                 else if (r < 0) {
2966                         *exit_status = EXIT_OOM_ADJUST;
2967                         return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2968                 }
2969         }
2970
2971         if (context->nice_set)
2972                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2973                         *exit_status = EXIT_NICE;
2974                         return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2975                 }
2976
2977         if (context->cpu_sched_set) {
2978                 struct sched_param param = {
2979                         .sched_priority = context->cpu_sched_priority,
2980                 };
2981
2982                 r = sched_setscheduler(0,
2983                                        context->cpu_sched_policy |
2984                                        (context->cpu_sched_reset_on_fork ?
2985                                         SCHED_RESET_ON_FORK : 0),
2986                                        &param);
2987                 if (r < 0) {
2988                         *exit_status = EXIT_SETSCHEDULER;
2989                         return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2990                 }
2991         }
2992
2993         if (context->cpuset)
2994                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2995                         *exit_status = EXIT_CPUAFFINITY;
2996                         return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2997                 }
2998
2999         if (context->ioprio_set)
3000                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
3001                         *exit_status = EXIT_IOPRIO;
3002                         return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
3003                 }
3004
3005         if (context->timer_slack_nsec != NSEC_INFINITY)
3006                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
3007                         *exit_status = EXIT_TIMERSLACK;
3008                         return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
3009                 }
3010
3011         if (context->personality != PERSONALITY_INVALID) {
3012                 r = safe_personality(context->personality);
3013                 if (r < 0) {
3014                         *exit_status = EXIT_PERSONALITY;
3015                         return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
3016                 }
3017         }
3018
3019         if (context->utmp_id)
3020                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
3021                                       context->tty_path,
3022                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
3023                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3024                                       USER_PROCESS,
3025                                       username);
3026
3027         if (context->user) {
3028                 r = chown_terminal(STDIN_FILENO, uid);
3029                 if (r < 0) {
3030                         *exit_status = EXIT_STDIN;
3031                         return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3032                 }
3033         }
3034
3035         /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3036          * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3037          * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3038          * touch a single hierarchy too. */
3039         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3040                 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3041                 if (r < 0) {
3042                         *exit_status = EXIT_CGROUP;
3043                         return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3044                 }
3045         }
3046
3047         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3048                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3049                 if (r < 0)
3050                         return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3051         }
3052
3053         r = build_environment(
3054                         unit,
3055                         context,
3056                         params,
3057                         n_fds,
3058                         home,
3059                         username,
3060                         shell,
3061                         journal_stream_dev,
3062                         journal_stream_ino,
3063                         &our_env);
3064         if (r < 0) {
3065                 *exit_status = EXIT_MEMORY;
3066                 return log_oom();
3067         }
3068
3069         r = build_pass_environment(context, &pass_env);
3070         if (r < 0) {
3071                 *exit_status = EXIT_MEMORY;
3072                 return log_oom();
3073         }
3074
3075         accum_env = strv_env_merge(5,
3076                                    params->environment,
3077                                    our_env,
3078                                    pass_env,
3079                                    context->environment,
3080                                    files_env,
3081                                    NULL);
3082         if (!accum_env) {
3083                 *exit_status = EXIT_MEMORY;
3084                 return log_oom();
3085         }
3086         accum_env = strv_env_clean(accum_env);
3087
3088         (void) umask(context->umask);
3089
3090         r = setup_keyring(unit, context, params, uid, gid);
3091         if (r < 0) {
3092                 *exit_status = EXIT_KEYRING;
3093                 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3094         }
3095
3096         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3097         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3098
3099         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3100         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3101
3102         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3103         if (needs_ambient_hack)
3104                 needs_setuid = false;
3105         else
3106                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3107
3108         if (needs_sandboxing) {
3109                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3110                  * present. The actual MAC context application will happen later, as late as possible, to avoid
3111                  * impacting our own code paths. */
3112
3113 #if HAVE_SELINUX
3114                 use_selinux = mac_selinux_use();
3115 #endif
3116 #if ENABLE_SMACK
3117                 use_smack = mac_smack_use();
3118 #endif
3119 #if HAVE_APPARMOR
3120                 use_apparmor = mac_apparmor_use();
3121 #endif
3122         }
3123
3124         if (needs_setuid) {
3125                 if (context->pam_name && username) {
3126                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3127                         if (r < 0) {
3128                                 *exit_status = EXIT_PAM;
3129                                 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3130                         }
3131                 }
3132         }
3133
3134         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3135                 if (ns_type_supported(NAMESPACE_NET)) {
3136                         r = setup_netns(runtime->netns_storage_socket);
3137                         if (r < 0) {
3138                                 *exit_status = EXIT_NETWORK;
3139                                 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3140                         }
3141                 } else
3142                         log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3143         }
3144
3145         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3146         if (needs_mount_namespace) {
3147                 r = apply_mount_namespace(unit, command, context, params, runtime);
3148                 if (r < 0) {
3149                         *exit_status = EXIT_NAMESPACE;
3150                         return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3151                 }
3152         }
3153
3154         /* Apply just after mount namespace setup */
3155         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3156         if (r < 0)
3157                 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3158
3159         /* Drop groups as early as possible */
3160         if (needs_setuid) {
3161                 r = enforce_groups(gid, supplementary_gids, ngids);
3162                 if (r < 0) {
3163                         *exit_status = EXIT_GROUP;
3164                         return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3165                 }
3166         }
3167
3168         if (needs_sandboxing) {
3169 #if HAVE_SELINUX
3170                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3171                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3172                         if (r < 0) {
3173                                 *exit_status = EXIT_SELINUX_CONTEXT;
3174                                 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3175                         }
3176                 }
3177 #endif
3178
3179                 if (context->private_users) {
3180                         r = setup_private_users(uid, gid);
3181                         if (r < 0) {
3182                                 *exit_status = EXIT_USER;
3183                                 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3184                         }
3185                 }
3186         }
3187
3188         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3189          * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3190          * however if we have it as we want to keep it open until the final execve(). */
3191
3192         if (params->exec_fd >= 0) {
3193                 exec_fd = params->exec_fd;
3194
3195                 if (exec_fd < 3 + (int) n_fds) {
3196                         int moved_fd;
3197
3198                         /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3199                          * process we are about to execute. */
3200
3201                         moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3202                         if (moved_fd < 0) {
3203                                 *exit_status = EXIT_FDS;
3204                                 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3205                         }
3206
3207                         safe_close(exec_fd);
3208                         exec_fd = moved_fd;
3209                 } else {
3210                         /* This fd should be FD_CLOEXEC already, but let's make sure. */
3211                         r = fd_cloexec(exec_fd, true);
3212                         if (r < 0) {
3213                                 *exit_status = EXIT_FDS;
3214                                 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3215                         }
3216                 }
3217
3218                 fds_with_exec_fd = newa(int, n_fds + 1);
3219                 memcpy(fds_with_exec_fd, fds, n_fds * sizeof(int));
3220                 fds_with_exec_fd[n_fds] = exec_fd;
3221                 n_fds_with_exec_fd = n_fds + 1;
3222         } else {
3223                 fds_with_exec_fd = fds;
3224                 n_fds_with_exec_fd = n_fds;
3225         }
3226
3227         r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
3228         if (r >= 0)
3229                 r = shift_fds(fds, n_fds);
3230         if (r >= 0)
3231                 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
3232         if (r < 0) {
3233                 *exit_status = EXIT_FDS;
3234                 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3235         }
3236
3237         /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3238          * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3239          * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3240          * came this far. */
3241
3242         secure_bits = context->secure_bits;
3243
3244         if (needs_sandboxing) {
3245                 uint64_t bset;
3246                 int which_failed;
3247
3248                 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3249                 if (r < 0) {
3250                         *exit_status = EXIT_LIMITS;
3251                         return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3252                 }
3253
3254                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3255                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3256                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3257                                 *exit_status = EXIT_LIMITS;
3258                                 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3259                         }
3260                 }
3261
3262 #if ENABLE_SMACK
3263                 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3264                  * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3265                 if (use_smack) {
3266                         r = setup_smack(context, command);
3267                         if (r < 0) {
3268                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3269                                 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3270                         }
3271                 }
3272 #endif
3273
3274                 bset = context->capability_bounding_set;
3275                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3276                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3277                  * instead of us doing that */
3278                 if (needs_ambient_hack)
3279                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
3280                                 (UINT64_C(1) << CAP_SETUID) |
3281                                 (UINT64_C(1) << CAP_SETGID);
3282
3283                 if (!cap_test_all(bset)) {
3284                         r = capability_bounding_set_drop(bset, false);
3285                         if (r < 0) {
3286                                 *exit_status = EXIT_CAPABILITIES;
3287                                 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3288                         }
3289                 }
3290
3291                 /* This is done before enforce_user, but ambient set
3292                  * does not survive over setresuid() if keep_caps is not set. */
3293                 if (!needs_ambient_hack &&
3294                     context->capability_ambient_set != 0) {
3295                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
3296                         if (r < 0) {
3297                                 *exit_status = EXIT_CAPABILITIES;
3298                                 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3299                         }
3300                 }
3301         }
3302
3303         if (needs_setuid) {
3304                 if (context->user) {
3305                         r = enforce_user(context, uid);
3306                         if (r < 0) {
3307                                 *exit_status = EXIT_USER;
3308                                 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3309                         }
3310
3311                         if (!needs_ambient_hack &&
3312                             context->capability_ambient_set != 0) {
3313
3314                                 /* Fix the ambient capabilities after user change. */
3315                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3316                                 if (r < 0) {
3317                                         *exit_status = EXIT_CAPABILITIES;
3318                                         return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3319                                 }
3320
3321                                 /* If we were asked to change user and ambient capabilities
3322                                  * were requested, we had to add keep-caps to the securebits
3323                                  * so that we would maintain the inherited capability set
3324                                  * through the setresuid(). Make sure that the bit is added
3325                                  * also to the context secure_bits so that we don't try to
3326                                  * drop the bit away next. */
3327
3328                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
3329                         }
3330                 }
3331         }
3332
3333         if (needs_sandboxing) {
3334                 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3335                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3336                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3337                  * are restricted. */
3338
3339 #if HAVE_SELINUX
3340                 if (use_selinux) {
3341                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3342
3343                         if (exec_context) {
3344                                 r = setexeccon(exec_context);
3345                                 if (r < 0) {
3346                                         *exit_status = EXIT_SELINUX_CONTEXT;
3347                                         return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3348                                 }
3349                         }
3350                 }
3351 #endif
3352
3353 #if HAVE_APPARMOR
3354                 if (use_apparmor && context->apparmor_profile) {
3355                         r = aa_change_onexec(context->apparmor_profile);
3356                         if (r < 0 && !context->apparmor_profile_ignore) {
3357                                 *exit_status = EXIT_APPARMOR_PROFILE;
3358                                 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3359                         }
3360                 }
3361 #endif
3362
3363                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3364                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3365                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3366                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3367                                 *exit_status = EXIT_SECUREBITS;
3368                                 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3369                         }
3370
3371                 if (context_has_no_new_privileges(context))
3372                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3373                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3374                                 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3375                         }
3376
3377 #if HAVE_SECCOMP
3378                 r = apply_address_families(unit, context);
3379                 if (r < 0) {
3380                         *exit_status = EXIT_ADDRESS_FAMILIES;
3381                         return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3382                 }
3383
3384                 r = apply_memory_deny_write_execute(unit, context);
3385                 if (r < 0) {
3386                         *exit_status = EXIT_SECCOMP;
3387                         return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3388                 }
3389
3390                 r = apply_restrict_realtime(unit, context);
3391                 if (r < 0) {
3392                         *exit_status = EXIT_SECCOMP;
3393                         return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3394                 }
3395
3396                 r = apply_restrict_namespaces(unit, context);
3397                 if (r < 0) {
3398                         *exit_status = EXIT_SECCOMP;
3399                         return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3400                 }
3401
3402                 r = apply_protect_sysctl(unit, context);
3403                 if (r < 0) {
3404                         *exit_status = EXIT_SECCOMP;
3405                         return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3406                 }
3407
3408                 r = apply_protect_kernel_modules(unit, context);
3409                 if (r < 0) {
3410                         *exit_status = EXIT_SECCOMP;
3411                         return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3412                 }
3413
3414                 r = apply_private_devices(unit, context);
3415                 if (r < 0) {
3416                         *exit_status = EXIT_SECCOMP;
3417                         return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3418                 }
3419
3420                 r = apply_syscall_archs(unit, context);
3421                 if (r < 0) {
3422                         *exit_status = EXIT_SECCOMP;
3423                         return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3424                 }
3425
3426                 r = apply_lock_personality(unit, context);
3427                 if (r < 0) {
3428                         *exit_status = EXIT_SECCOMP;
3429                         return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3430                 }
3431
3432                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3433                  * by the filter as little as possible. */
3434                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3435                 if (r < 0) {
3436                         *exit_status = EXIT_SECCOMP;
3437                         return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3438                 }
3439 #endif
3440         }
3441
3442         if (!strv_isempty(context->unset_environment)) {
3443                 char **ee = NULL;
3444
3445                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3446                 if (!ee) {
3447                         *exit_status = EXIT_MEMORY;
3448                         return log_oom();
3449                 }
3450
3451                 strv_free_and_replace(accum_env, ee);
3452         }
3453
3454         final_argv = replace_env_argv(command->argv, accum_env);
3455         if (!final_argv) {
3456                 *exit_status = EXIT_MEMORY;
3457                 return log_oom();
3458         }
3459
3460         if (DEBUG_LOGGING) {
3461                 _cleanup_free_ char *line;
3462
3463                 line = exec_command_line(final_argv);
3464                 if (line)
3465                         log_struct(LOG_DEBUG,
3466                                    "EXECUTABLE=%s", command->path,
3467                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3468                                    LOG_UNIT_ID(unit),
3469                                    LOG_UNIT_INVOCATION_ID(unit));
3470         }
3471
3472         if (exec_fd >= 0) {
3473                 uint8_t hot = 1;
3474
3475                 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3476                  * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3477
3478                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3479                         *exit_status = EXIT_EXEC;
3480                         return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3481                 }
3482         }
3483
3484         execve(command->path, final_argv, accum_env);
3485         r = -errno;
3486
3487         if (exec_fd >= 0) {
3488                 uint8_t hot = 0;
3489
3490                 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3491                  * that POLLHUP on it no longer means execve() succeeded. */
3492
3493                 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3494                         *exit_status = EXIT_EXEC;
3495                         return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3496                 }
3497         }
3498
3499         if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3500                 log_struct_errno(LOG_INFO, r,
3501                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3502                                  LOG_UNIT_ID(unit),
3503                                  LOG_UNIT_INVOCATION_ID(unit),
3504                                  LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3505                                                   command->path),
3506                                  "EXECUTABLE=%s", command->path);
3507                 return 0;
3508         }
3509
3510         *exit_status = EXIT_EXEC;
3511         return log_unit_error_errno(unit, r, "Failed to execute command: %m");
3512 }
3513
3514 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3515 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3516
3517 int exec_spawn(Unit *unit,
3518                ExecCommand *command,
3519                const ExecContext *context,
3520                const ExecParameters *params,
3521                ExecRuntime *runtime,
3522                DynamicCreds *dcreds,
3523                pid_t *ret) {
3524
3525         int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
3526         _cleanup_strv_free_ char **files_env = NULL;
3527         size_t n_storage_fds = 0, n_socket_fds = 0;
3528         _cleanup_free_ char *line = NULL;
3529         pid_t pid;
3530
3531         assert(unit);
3532         assert(command);
3533         assert(context);
3534         assert(ret);
3535         assert(params);
3536         assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
3537
3538         if (context->std_input == EXEC_INPUT_SOCKET ||
3539             context->std_output == EXEC_OUTPUT_SOCKET ||
3540             context->std_error == EXEC_OUTPUT_SOCKET) {
3541
3542                 if (params->n_socket_fds > 1) {
3543                         log_unit_error(unit, "Got more than one socket.");
3544                         return -EINVAL;
3545                 }
3546
3547                 if (params->n_socket_fds == 0) {
3548                         log_unit_error(unit, "Got no socket.");
3549                         return -EINVAL;
3550                 }
3551
3552                 socket_fd = params->fds[0];
3553         } else {
3554                 socket_fd = -1;
3555                 fds = params->fds;
3556                 n_socket_fds = params->n_socket_fds;
3557                 n_storage_fds = params->n_storage_fds;
3558         }
3559
3560         r = exec_context_named_iofds(context, params, named_iofds);
3561         if (r < 0)
3562                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3563
3564         r = exec_context_load_environment(unit, context, &files_env);
3565         if (r < 0)
3566                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3567
3568         line = exec_command_line(command->argv);
3569         if (!line)
3570                 return log_oom();
3571
3572         log_struct(LOG_DEBUG,
3573                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3574                    "EXECUTABLE=%s", command->path,
3575                    LOG_UNIT_ID(unit),
3576                    LOG_UNIT_INVOCATION_ID(unit));
3577
3578         pid = fork();
3579         if (pid < 0)
3580                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3581
3582         if (pid == 0) {
3583                 int exit_status = EXIT_SUCCESS;
3584
3585                 r = exec_child(unit,
3586                                command,
3587                                context,
3588                                params,
3589                                runtime,
3590                                dcreds,
3591                                socket_fd,
3592                                named_iofds,
3593                                fds,
3594                                n_socket_fds,
3595                                n_storage_fds,
3596                                files_env,
3597                                unit->manager->user_lookup_fds[1],
3598                                &exit_status);
3599
3600                 if (r < 0)
3601                         log_struct_errno(LOG_ERR, r,
3602                                          "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3603                                          LOG_UNIT_ID(unit),
3604                                          LOG_UNIT_INVOCATION_ID(unit),
3605                                          LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3606                                                           exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3607                                                           command->path),
3608                                          "EXECUTABLE=%s", command->path);
3609
3610                 _exit(exit_status);
3611         }
3612
3613         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3614
3615         /* We add the new process to the cgroup both in the child (so
3616          * that we can be sure that no user code is ever executed
3617          * outside of the cgroup) and in the parent (so that we can be
3618          * sure that when we kill the cgroup the process will be
3619          * killed too). */
3620         if (params->cgroup_path)
3621                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3622
3623         exec_status_start(&command->exec_status, pid);
3624
3625         *ret = pid;
3626         return 0;
3627 }
3628
3629 void exec_context_init(ExecContext *c) {
3630         ExecDirectoryType i;
3631
3632         assert(c);
3633
3634         c->umask = 0022;
3635         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3636         c->cpu_sched_policy = SCHED_OTHER;
3637         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3638         c->syslog_level_prefix = true;
3639         c->ignore_sigpipe = true;
3640         c->timer_slack_nsec = NSEC_INFINITY;
3641         c->personality = PERSONALITY_INVALID;
3642         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3643                 c->directories[i].mode = 0755;
3644         c->capability_bounding_set = CAP_ALL;
3645         assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3646         c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
3647         c->log_level_max = -1;
3648 }
3649
3650 void exec_context_done(ExecContext *c) {
3651         ExecDirectoryType i;
3652         size_t l;
3653
3654         assert(c);
3655
3656         c->environment = strv_free(c->environment);
3657         c->environment_files = strv_free(c->environment_files);
3658         c->pass_environment = strv_free(c->pass_environment);
3659         c->unset_environment = strv_free(c->unset_environment);
3660
3661         rlimit_free_all(c->rlimit);
3662
3663         for (l = 0; l < 3; l++) {
3664                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3665                 c->stdio_file[l] = mfree(c->stdio_file[l]);
3666         }
3667
3668         c->working_directory = mfree(c->working_directory);
3669         c->root_directory = mfree(c->root_directory);
3670         c->root_image = mfree(c->root_image);
3671         c->tty_path = mfree(c->tty_path);
3672         c->syslog_identifier = mfree(c->syslog_identifier);
3673         c->user = mfree(c->user);
3674         c->group = mfree(c->group);
3675
3676         c->supplementary_groups = strv_free(c->supplementary_groups);
3677
3678         c->pam_name = mfree(c->pam_name);
3679
3680         c->read_only_paths = strv_free(c->read_only_paths);
3681         c->read_write_paths = strv_free(c->read_write_paths);
3682         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3683
3684         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3685         c->bind_mounts = NULL;
3686         c->n_bind_mounts = 0;
3687         temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3688         c->temporary_filesystems = NULL;
3689         c->n_temporary_filesystems = 0;
3690
3691         c->cpuset = cpu_set_mfree(c->cpuset);
3692
3693         c->utmp_id = mfree(c->utmp_id);
3694         c->selinux_context = mfree(c->selinux_context);
3695         c->apparmor_profile = mfree(c->apparmor_profile);
3696         c->smack_process_label = mfree(c->smack_process_label);
3697
3698         c->syscall_filter = hashmap_free(c->syscall_filter);
3699         c->syscall_archs = set_free(c->syscall_archs);
3700         c->address_families = set_free(c->address_families);
3701
3702         for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3703                 c->directories[i].paths = strv_free(c->directories[i].paths);
3704
3705         c->log_level_max = -1;
3706
3707         exec_context_free_log_extra_fields(c);
3708
3709         c->stdin_data = mfree(c->stdin_data);
3710         c->stdin_data_size = 0;
3711 }
3712
3713 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
3714         char **i;
3715
3716         assert(c);
3717
3718         if (!runtime_prefix)
3719                 return 0;
3720
3721         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3722                 _cleanup_free_ char *p;
3723
3724                 p = strjoin(runtime_prefix, "/", *i);
3725                 if (!p)
3726                         return -ENOMEM;
3727
3728                 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3729                  * next. */
3730                 (void) rm_rf(p, REMOVE_ROOT);
3731         }
3732
3733         return 0;
3734 }
3735
3736 static void exec_command_done(ExecCommand *c) {
3737         assert(c);
3738
3739         c->path = mfree(c->path);
3740         c->argv = strv_free(c->argv);
3741 }
3742
3743 void exec_command_done_array(ExecCommand *c, size_t n) {
3744         size_t i;
3745
3746         for (i = 0; i < n; i++)
3747                 exec_command_done(c+i);
3748 }
3749
3750 ExecCommand* exec_command_free_list(ExecCommand *c) {
3751         ExecCommand *i;
3752
3753         while ((i = c)) {
3754                 LIST_REMOVE(command, c, i);
3755                 exec_command_done(i);
3756                 free(i);
3757         }
3758
3759         return NULL;
3760 }
3761
3762 void exec_command_free_array(ExecCommand **c, size_t n) {
3763         size_t i;
3764
3765         for (i = 0; i < n; i++)
3766                 c[i] = exec_command_free_list(c[i]);
3767 }
3768
3769 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
3770         size_t i;
3771
3772         for (i = 0; i < n; i++)
3773                 exec_status_reset(&c[i].exec_status);
3774 }
3775
3776 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
3777         size_t i;
3778
3779         for (i = 0; i < n; i++) {
3780                 ExecCommand *z;
3781
3782                 LIST_FOREACH(command, z, c[i])
3783                         exec_status_reset(&z->exec_status);
3784         }
3785 }
3786
3787 typedef struct InvalidEnvInfo {
3788         const Unit *unit;
3789         const char *path;
3790 } InvalidEnvInfo;
3791
3792 static void invalid_env(const char *p, void *userdata) {
3793         InvalidEnvInfo *info = userdata;
3794
3795         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3796 }
3797
3798 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3799         assert(c);
3800
3801         switch (fd_index) {
3802
3803         case STDIN_FILENO:
3804                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3805                         return NULL;
3806
3807                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3808
3809         case STDOUT_FILENO:
3810                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3811                         return NULL;
3812
3813                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3814
3815         case STDERR_FILENO:
3816                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3817                         return NULL;
3818
3819                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3820
3821         default:
3822                 return NULL;
3823         }
3824 }
3825
3826 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3827         size_t i, targets;
3828         const char* stdio_fdname[3];
3829         size_t n_fds;
3830
3831         assert(c);
3832         assert(p);
3833
3834         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3835                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3836                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3837
3838         for (i = 0; i < 3; i++)
3839                 stdio_fdname[i] = exec_context_fdname(c, i);
3840
3841         n_fds = p->n_storage_fds + p->n_socket_fds;
3842
3843         for (i = 0; i < n_fds  && targets > 0; i++)
3844                 if (named_iofds[STDIN_FILENO] < 0 &&
3845                     c->std_input == EXEC_INPUT_NAMED_FD &&
3846                     stdio_fdname[STDIN_FILENO] &&
3847                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3848
3849                         named_iofds[STDIN_FILENO] = p->fds[i];
3850                         targets--;
3851
3852                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3853                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3854                            stdio_fdname[STDOUT_FILENO] &&
3855                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3856
3857                         named_iofds[STDOUT_FILENO] = p->fds[i];
3858                         targets--;
3859
3860                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3861                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3862                            stdio_fdname[STDERR_FILENO] &&
3863                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3864
3865                         named_iofds[STDERR_FILENO] = p->fds[i];
3866                         targets--;
3867                 }
3868
3869         return targets == 0 ? 0 : -ENOENT;
3870 }
3871
3872 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
3873         char **i, **r = NULL;
3874
3875         assert(c);
3876         assert(l);
3877
3878         STRV_FOREACH(i, c->environment_files) {
3879                 char *fn;
3880                 int k;
3881                 unsigned n;
3882                 bool ignore = false;
3883                 char **p;
3884                 _cleanup_globfree_ glob_t pglob = {};
3885
3886                 fn = *i;
3887
3888                 if (fn[0] == '-') {
3889                         ignore = true;
3890                         fn++;
3891                 }
3892
3893                 if (!path_is_absolute(fn)) {
3894                         if (ignore)
3895                                 continue;
3896
3897                         strv_free(r);
3898                         return -EINVAL;
3899                 }
3900
3901                 /* Filename supports globbing, take all matching files */
3902                 k = safe_glob(fn, 0, &pglob);
3903                 if (k < 0) {
3904                         if (ignore)
3905                                 continue;
3906
3907                         strv_free(r);
3908                         return k;
3909                 }
3910
3911                 /* When we don't match anything, -ENOENT should be returned */
3912                 assert(pglob.gl_pathc > 0);
3913
3914                 for (n = 0; n < pglob.gl_pathc; n++) {
3915                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3916                         if (k < 0) {
3917                                 if (ignore)
3918                                         continue;
3919
3920                                 strv_free(r);
3921                                 return k;
3922                         }
3923                         /* Log invalid environment variables with filename */
3924                         if (p) {
3925                                 InvalidEnvInfo info = {
3926                                         .unit = unit,
3927                                         .path = pglob.gl_pathv[n]
3928                                 };
3929
3930                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3931                         }
3932
3933                         if (!r)
3934                                 r = p;
3935                         else {
3936                                 char **m;
3937
3938                                 m = strv_env_merge(2, r, p);
3939                                 strv_free(r);
3940                                 strv_free(p);
3941                                 if (!m)
3942                                         return -ENOMEM;
3943
3944                                 r = m;
3945                         }
3946                 }
3947         }
3948
3949         *l = r;
3950
3951         return 0;
3952 }
3953
3954 static bool tty_may_match_dev_console(const char *tty) {
3955         _cleanup_free_ char *resolved = NULL;
3956
3957         if (!tty)
3958                 return true;
3959
3960         tty = skip_dev_prefix(tty);
3961
3962         /* trivial identity? */
3963         if (streq(tty, "console"))
3964                 return true;
3965
3966         if (resolve_dev_console(&resolved) < 0)
3967                 return true; /* if we could not resolve, assume it may */
3968
3969         /* "tty0" means the active VC, so it may be the same sometimes */
3970         return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
3971 }
3972
3973 bool exec_context_may_touch_console(const ExecContext *ec) {
3974
3975         return (ec->tty_reset ||
3976                 ec->tty_vhangup ||
3977                 ec->tty_vt_disallocate ||
3978                 is_terminal_input(ec->std_input) ||
3979                 is_terminal_output(ec->std_output) ||
3980                 is_terminal_output(ec->std_error)) &&
3981                tty_may_match_dev_console(exec_context_tty_path(ec));
3982 }
3983
/* Writes every string of the NULL-terminated vector l to f, each preceded by a single space.
 * A NULL vector produces no output. */
static void strv_fprintf(FILE *f, char **l) {
        char **item;

        assert(f);

        for (item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
3992
3993 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
3994         ExecDirectoryType dt;
3995         char **e, **d;
3996         unsigned i;
3997         int r;
3998
3999         assert(c);
4000         assert(f);
4001
4002         prefix = strempty(prefix);
4003
4004         fprintf(f,
4005                 "%sUMask: %04o\n"
4006                 "%sWorkingDirectory: %s\n"
4007                 "%sRootDirectory: %s\n"
4008                 "%sNonBlocking: %s\n"
4009                 "%sPrivateTmp: %s\n"
4010                 "%sPrivateDevices: %s\n"
4011                 "%sProtectKernelTunables: %s\n"
4012                 "%sProtectKernelModules: %s\n"
4013                 "%sProtectControlGroups: %s\n"
4014                 "%sPrivateNetwork: %s\n"
4015                 "%sPrivateUsers: %s\n"
4016                 "%sProtectHome: %s\n"
4017                 "%sProtectSystem: %s\n"
4018                 "%sMountAPIVFS: %s\n"
4019                 "%sIgnoreSIGPIPE: %s\n"
4020                 "%sMemoryDenyWriteExecute: %s\n"
4021                 "%sRestrictRealtime: %s\n"
4022                 "%sKeyringMode: %s\n",
4023                 prefix, c->umask,
4024                 prefix, c->working_directory ? c->working_directory : "/",
4025                 prefix, c->root_directory ? c->root_directory : "/",
4026                 prefix, yes_no(c->non_blocking),
4027                 prefix, yes_no(c->private_tmp),
4028                 prefix, yes_no(c->private_devices),
4029                 prefix, yes_no(c->protect_kernel_tunables),
4030                 prefix, yes_no(c->protect_kernel_modules),
4031                 prefix, yes_no(c->protect_control_groups),
4032                 prefix, yes_no(c->private_network),
4033                 prefix, yes_no(c->private_users),
4034                 prefix, protect_home_to_string(c->protect_home),
4035                 prefix, protect_system_to_string(c->protect_system),
4036                 prefix, yes_no(c->mount_apivfs),
4037                 prefix, yes_no(c->ignore_sigpipe),
4038                 prefix, yes_no(c->memory_deny_write_execute),
4039                 prefix, yes_no(c->restrict_realtime),
4040                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
4041
4042         if (c->root_image)
4043                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4044
4045         STRV_FOREACH(e, c->environment)
4046                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4047
4048         STRV_FOREACH(e, c->environment_files)
4049                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
4050
4051         STRV_FOREACH(e, c->pass_environment)
4052                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4053
4054         STRV_FOREACH(e, c->unset_environment)
4055                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4056
4057         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4058
4059         for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4060                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4061
4062                 STRV_FOREACH(d, c->directories[dt].paths)
4063                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4064         }
4065
4066         if (c->nice_set)
4067                 fprintf(f,
4068                         "%sNice: %i\n",
4069                         prefix, c->nice);
4070
4071         if (c->oom_score_adjust_set)
4072                 fprintf(f,
4073                         "%sOOMScoreAdjust: %i\n",
4074                         prefix, c->oom_score_adjust);
4075
4076         for (i = 0; i < RLIM_NLIMITS; i++)
4077                 if (c->rlimit[i]) {
4078                         fprintf(f, "Limit%s%s: " RLIM_FMT "\n",
4079                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4080                         fprintf(f, "Limit%s%sSoft: " RLIM_FMT "\n",
4081                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4082                 }
4083
4084         if (c->ioprio_set) {
4085                 _cleanup_free_ char *class_str = NULL;
4086
4087                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4088                 if (r >= 0)
4089                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4090
4091                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4092         }
4093
4094         if (c->cpu_sched_set) {
4095                 _cleanup_free_ char *policy_str = NULL;
4096
4097                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4098                 if (r >= 0)
4099                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4100
4101                 fprintf(f,
4102                         "%sCPUSchedulingPriority: %i\n"
4103                         "%sCPUSchedulingResetOnFork: %s\n",
4104                         prefix, c->cpu_sched_priority,
4105                         prefix, yes_no(c->cpu_sched_reset_on_fork));
4106         }
4107
4108         if (c->cpuset) {
4109                 fprintf(f, "%sCPUAffinity:", prefix);
4110                 for (i = 0; i < c->cpuset_ncpus; i++)
4111                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4112                                 fprintf(f, " %u", i);
4113                 fputs("\n", f);
4114         }
4115
4116         if (c->timer_slack_nsec != NSEC_INFINITY)
4117                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4118
4119         fprintf(f,
4120                 "%sStandardInput: %s\n"
4121                 "%sStandardOutput: %s\n"
4122                 "%sStandardError: %s\n",
4123                 prefix, exec_input_to_string(c->std_input),
4124                 prefix, exec_output_to_string(c->std_output),
4125                 prefix, exec_output_to_string(c->std_error));
4126
4127         if (c->std_input == EXEC_INPUT_NAMED_FD)
4128                 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4129         if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4130                 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4131         if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4132                 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4133
4134         if (c->std_input == EXEC_INPUT_FILE)
4135                 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4136         if (c->std_output == EXEC_OUTPUT_FILE)
4137                 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4138         if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4139                 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4140         if (c->std_error == EXEC_OUTPUT_FILE)
4141                 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4142         if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4143                 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4144
4145         if (c->tty_path)
4146                 fprintf(f,
4147                         "%sTTYPath: %s\n"
4148                         "%sTTYReset: %s\n"
4149                         "%sTTYVHangup: %s\n"
4150                         "%sTTYVTDisallocate: %s\n",
4151                         prefix, c->tty_path,
4152                         prefix, yes_no(c->tty_reset),
4153                         prefix, yes_no(c->tty_vhangup),
4154                         prefix, yes_no(c->tty_vt_disallocate));
4155
4156         if (IN_SET(c->std_output,
4157                    EXEC_OUTPUT_SYSLOG,
4158                    EXEC_OUTPUT_KMSG,
4159                    EXEC_OUTPUT_JOURNAL,
4160                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4161                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4162                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4163             IN_SET(c->std_error,
4164                    EXEC_OUTPUT_SYSLOG,
4165                    EXEC_OUTPUT_KMSG,
4166                    EXEC_OUTPUT_JOURNAL,
4167                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4168                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
4169                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4170
4171                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4172
4173                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4174                 if (r >= 0)
4175                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4176
4177                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4178                 if (r >= 0)
4179                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4180         }
4181
4182         if (c->log_level_max >= 0) {
4183                 _cleanup_free_ char *t = NULL;
4184
4185                 (void) log_level_to_string_alloc(c->log_level_max, &t);
4186
4187                 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4188         }
4189
4190         if (c->n_log_extra_fields > 0) {
4191                 size_t j;
4192
4193                 for (j = 0; j < c->n_log_extra_fields; j++) {
4194                         fprintf(f, "%sLogExtraFields: ", prefix);
4195                         fwrite(c->log_extra_fields[j].iov_base,
4196                                1, c->log_extra_fields[j].iov_len,
4197                                f);
4198                         fputc('\n', f);
4199                 }
4200         }
4201
4202         if (c->secure_bits) {
4203                 _cleanup_free_ char *str = NULL;
4204
4205                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4206                 if (r >= 0)
4207                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4208         }
4209
4210         if (c->capability_bounding_set != CAP_ALL) {
4211                 _cleanup_free_ char *str = NULL;
4212
4213                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4214                 if (r >= 0)
4215                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4216         }
4217
4218         if (c->capability_ambient_set != 0) {
4219                 _cleanup_free_ char *str = NULL;
4220
4221                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4222                 if (r >= 0)
4223                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4224         }
4225
4226         if (c->user)
4227                 fprintf(f, "%sUser: %s\n", prefix, c->user);
4228         if (c->group)
4229                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4230
4231         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4232
4233         if (!strv_isempty(c->supplementary_groups)) {
4234                 fprintf(f, "%sSupplementaryGroups:", prefix);
4235                 strv_fprintf(f, c->supplementary_groups);
4236                 fputs("\n", f);
4237         }
4238
4239         if (c->pam_name)
4240                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4241
4242         if (!strv_isempty(c->read_write_paths)) {
4243                 fprintf(f, "%sReadWritePaths:", prefix);
4244                 strv_fprintf(f, c->read_write_paths);
4245                 fputs("\n", f);
4246         }
4247
4248         if (!strv_isempty(c->read_only_paths)) {
4249                 fprintf(f, "%sReadOnlyPaths:", prefix);
4250                 strv_fprintf(f, c->read_only_paths);
4251                 fputs("\n", f);
4252         }
4253
4254         if (!strv_isempty(c->inaccessible_paths)) {
4255                 fprintf(f, "%sInaccessiblePaths:", prefix);
4256                 strv_fprintf(f, c->inaccessible_paths);
4257                 fputs("\n", f);
4258         }
4259
4260         if (c->n_bind_mounts > 0)
4261                 for (i = 0; i < c->n_bind_mounts; i++)
4262                         fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
4263                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4264                                 c->bind_mounts[i].ignore_enoent ? "-": "",
4265                                 c->bind_mounts[i].source,
4266                                 c->bind_mounts[i].destination,
4267                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4268
4269         if (c->n_temporary_filesystems > 0)
4270                 for (i = 0; i < c->n_temporary_filesystems; i++) {
4271                         TemporaryFileSystem *t = c->temporary_filesystems + i;
4272
4273                         fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4274                                 t->path,
4275                                 isempty(t->options) ? "" : ":",
4276                                 strempty(t->options));
4277                 }
4278
4279         if (c->utmp_id)
4280                 fprintf(f,
4281                         "%sUtmpIdentifier: %s\n",
4282                         prefix, c->utmp_id);
4283
4284         if (c->selinux_context)
4285                 fprintf(f,
4286                         "%sSELinuxContext: %s%s\n",
4287                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4288
4289         if (c->apparmor_profile)
4290                 fprintf(f,
4291                         "%sAppArmorProfile: %s%s\n",
4292                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4293
4294         if (c->smack_process_label)
4295                 fprintf(f,
4296                         "%sSmackProcessLabel: %s%s\n",
4297                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4298
4299         if (c->personality != PERSONALITY_INVALID)
4300                 fprintf(f,
4301                         "%sPersonality: %s\n",
4302                         prefix, strna(personality_to_string(c->personality)));
4303
4304         fprintf(f,
4305                 "%sLockPersonality: %s\n",
4306                 prefix, yes_no(c->lock_personality));
4307
4308         if (c->syscall_filter) {
4309 #if HAVE_SECCOMP
4310                 Iterator j;
4311                 void *id, *val;
4312                 bool first = true;
4313 #endif
4314
4315                 fprintf(f,
4316                         "%sSystemCallFilter: ",
4317                         prefix);
4318
4319                 if (!c->syscall_whitelist)
4320                         fputc('~', f);
4321
4322 #if HAVE_SECCOMP
4323                 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4324                         _cleanup_free_ char *name = NULL;
4325                         const char *errno_name = NULL;
4326                         int num = PTR_TO_INT(val);
4327
4328                         if (first)
4329                                 first = false;
4330                         else
4331                                 fputc(' ', f);
4332
4333                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4334                         fputs(strna(name), f);
4335
4336                         if (num >= 0) {
4337                                 errno_name = errno_to_name(num);
4338                                 if (errno_name)
4339                                         fprintf(f, ":%s", errno_name);
4340                                 else
4341                                         fprintf(f, ":%d", num);
4342                         }
4343                 }
4344 #endif
4345
4346                 fputc('\n', f);
4347         }
4348
4349         if (c->syscall_archs) {
4350 #if HAVE_SECCOMP
4351                 Iterator j;
4352                 void *id;
4353 #endif
4354
4355                 fprintf(f,
4356                         "%sSystemCallArchitectures:",
4357                         prefix);
4358
4359 #if HAVE_SECCOMP
4360                 SET_FOREACH(id, c->syscall_archs, j)
4361                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4362 #endif
4363                 fputc('\n', f);
4364         }
4365
4366         if (exec_context_restrict_namespaces_set(c)) {
4367                 _cleanup_free_ char *s = NULL;
4368
4369                 r = namespace_flags_to_string(c->restrict_namespaces, &s);
4370                 if (r >= 0)
4371                         fprintf(f, "%sRestrictNamespaces: %s\n",
4372                                 prefix, s);
4373         }
4374
4375         if (c->syscall_errno > 0) {
4376                 const char *errno_name;
4377
4378                 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4379
4380                 errno_name = errno_to_name(c->syscall_errno);
4381                 if (errno_name)
4382                         fprintf(f, "%s\n", errno_name);
4383                 else
4384                         fprintf(f, "%d\n", c->syscall_errno);
4385         }
4386
4387         if (c->apparmor_profile)
4388                 fprintf(f,
4389                         "%sAppArmorProfile: %s%s\n",
4390                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4391 }
4392
4393 bool exec_context_maintains_privileges(const ExecContext *c) {
4394         assert(c);
4395
4396         /* Returns true if the process forked off would run under
4397          * an unchanged UID or as root. */
4398
4399         if (!c->user)
4400                 return true;
4401
4402         if (streq(c->user, "root") || streq(c->user, "0"))
4403                 return true;
4404
4405         return false;
4406 }
4407
4408 int exec_context_get_effective_ioprio(const ExecContext *c) {
4409         int p;
4410
4411         assert(c);
4412
4413         if (c->ioprio_set)
4414                 return c->ioprio;
4415
4416         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4417         if (p < 0)
4418                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4419
4420         return p;
4421 }
4422
4423 void exec_context_free_log_extra_fields(ExecContext *c) {
4424         size_t l;
4425
4426         assert(c);
4427
4428         for (l = 0; l < c->n_log_extra_fields; l++)
4429                 free(c->log_extra_fields[l].iov_base);
4430         c->log_extra_fields = mfree(c->log_extra_fields);
4431         c->n_log_extra_fields = 0;
4432 }
4433
4434 void exec_status_start(ExecStatus *s, pid_t pid) {
4435         assert(s);
4436
4437         *s = (ExecStatus) {
4438                 .pid = pid,
4439         };
4440
4441         dual_timestamp_get(&s->start_timestamp);
4442 }
4443
4444 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
4445         assert(s);
4446
4447         if (s->pid != pid) {
4448                 *s = (ExecStatus) {
4449                         .pid = pid,
4450                 };
4451         }
4452
4453         dual_timestamp_get(&s->exit_timestamp);
4454
4455         s->code = code;
4456         s->status = status;
4457
4458         if (context) {
4459                 if (context->utmp_id)
4460                         (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
4461
4462                 exec_context_tty_reset(context, NULL);
4463         }
4464 }
4465
4466 void exec_status_reset(ExecStatus *s) {
4467         assert(s);
4468
4469         *s = (ExecStatus) {};
4470 }
4471
4472 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
4473         char buf[FORMAT_TIMESTAMP_MAX];
4474
4475         assert(s);
4476         assert(f);
4477
4478         if (s->pid <= 0)
4479                 return;
4480
4481         prefix = strempty(prefix);
4482
4483         fprintf(f,
4484                 "%sPID: "PID_FMT"\n",
4485                 prefix, s->pid);
4486
4487         if (dual_timestamp_is_set(&s->start_timestamp))
4488                 fprintf(f,
4489                         "%sStart Timestamp: %s\n",
4490                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4491
4492         if (dual_timestamp_is_set(&s->exit_timestamp))
4493                 fprintf(f,
4494                         "%sExit Timestamp: %s\n"
4495                         "%sExit Code: %s\n"
4496                         "%sExit Status: %i\n",
4497                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4498                         prefix, sigchld_code_to_string(s->code),
4499                         prefix, s->status);
4500 }
4501
4502 static char *exec_command_line(char **argv) {
4503         size_t k;
4504         char *n, *p, **a;
4505         bool first = true;
4506
4507         assert(argv);
4508
4509         k = 1;
4510         STRV_FOREACH(a, argv)
4511                 k += strlen(*a)+3;
4512
4513         n = new(char, k);
4514         if (!n)
4515                 return NULL;
4516
4517         p = n;
4518         STRV_FOREACH(a, argv) {
4519
4520                 if (!first)
4521                         *(p++) = ' ';
4522                 else
4523                         first = false;
4524
4525                 if (strpbrk(*a, WHITESPACE)) {
4526                         *(p++) = '\'';
4527                         p = stpcpy(p, *a);
4528                         *(p++) = '\'';
4529                 } else
4530                         p = stpcpy(p, *a);
4531
4532         }
4533
4534         *p = 0;
4535
4536         /* FIXME: this doesn't really handle arguments that have
4537          * spaces and ticks in them */
4538
4539         return n;
4540 }
4541
4542 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4543         _cleanup_free_ char *cmd = NULL;
4544         const char *prefix2;
4545
4546         assert(c);
4547         assert(f);
4548
4549         prefix = strempty(prefix);
4550         prefix2 = strjoina(prefix, "\t");
4551
4552         cmd = exec_command_line(c->argv);
4553         fprintf(f,
4554                 "%sCommand Line: %s\n",
4555                 prefix, cmd ? cmd : strerror(ENOMEM));
4556
4557         exec_status_dump(&c->exec_status, f, prefix2);
4558 }
4559
4560 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4561         assert(f);
4562
4563         prefix = strempty(prefix);
4564
4565         LIST_FOREACH(command, c, c)
4566                 exec_command_dump(c, f, prefix);
4567 }
4568
4569 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4570         ExecCommand *end;
4571
4572         assert(l);
4573         assert(e);
4574
4575         if (*l) {
4576                 /* It's kind of important, that we keep the order here */
4577                 LIST_FIND_TAIL(command, *l, end);
4578                 LIST_INSERT_AFTER(command, *l, end, e);
4579         } else
4580               *l = e;
4581 }
4582
4583 int exec_command_set(ExecCommand *c, const char *path, ...) {
4584         va_list ap;
4585         char **l, *p;
4586
4587         assert(c);
4588         assert(path);
4589
4590         va_start(ap, path);
4591         l = strv_new_ap(path, ap);
4592         va_end(ap);
4593
4594         if (!l)
4595                 return -ENOMEM;
4596
4597         p = strdup(path);
4598         if (!p) {
4599                 strv_free(l);
4600                 return -ENOMEM;
4601         }
4602
4603         free(c->path);
4604         c->path = p;
4605
4606         return strv_free_and_replace(c->argv, l);
4607 }
4608
4609 int exec_command_append(ExecCommand *c, const char *path, ...) {
4610         _cleanup_strv_free_ char **l = NULL;
4611         va_list ap;
4612         int r;
4613
4614         assert(c);
4615         assert(path);
4616
4617         va_start(ap, path);
4618         l = strv_new_ap(path, ap);
4619         va_end(ap);
4620
4621         if (!l)
4622                 return -ENOMEM;
4623
4624         r = strv_extend_strv(&c->argv, l, false);
4625         if (r < 0)
4626                 return r;
4627
4628         return 0;
4629 }
4630
4631 static void *remove_tmpdir_thread(void *p) {
4632         _cleanup_free_ char *path = p;
4633
4634         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4635         return NULL;
4636 }
4637
4638 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4639         int r;
4640
4641         if (!rt)
4642                 return NULL;
4643
4644         if (rt->manager)
4645                 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4646
4647         /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4648         if (destroy && rt->tmp_dir) {
4649                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4650
4651                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4652                 if (r < 0) {
4653                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4654                         free(rt->tmp_dir);
4655                 }
4656
4657                 rt->tmp_dir = NULL;
4658         }
4659
4660         if (destroy && rt->var_tmp_dir) {
4661                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4662
4663                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4664                 if (r < 0) {
4665                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4666                         free(rt->var_tmp_dir);
4667                 }
4668
4669                 rt->var_tmp_dir = NULL;
4670         }
4671
4672         rt->id = mfree(rt->id);
4673         rt->tmp_dir = mfree(rt->tmp_dir);
4674         rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4675         safe_close_pair(rt->netns_storage_socket);
4676         return mfree(rt);
4677 }
4678
4679 static void exec_runtime_freep(ExecRuntime **rt) {
4680         if (*rt)
4681                 (void) exec_runtime_free(*rt, false);
4682 }
4683
4684 static int exec_runtime_allocate(ExecRuntime **rt) {
4685         assert(rt);
4686
4687         *rt = new0(ExecRuntime, 1);
4688         if (!*rt)
4689                 return -ENOMEM;
4690
4691         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4692         return 0;
4693 }
4694
4695 static int exec_runtime_add(
4696                 Manager *m,
4697                 const char *id,
4698                 const char *tmp_dir,
4699                 const char *var_tmp_dir,
4700                 const int netns_storage_socket[2],
4701                 ExecRuntime **ret) {
4702
4703         _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
4704         int r;
4705
4706         assert(m);
4707         assert(id);
4708
4709         r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4710         if (r < 0)
4711                 return r;
4712
4713         r = exec_runtime_allocate(&rt);
4714         if (r < 0)
4715                 return r;
4716
4717         rt->id = strdup(id);
4718         if (!rt->id)
4719                 return -ENOMEM;
4720
4721         if (tmp_dir) {
4722                 rt->tmp_dir = strdup(tmp_dir);
4723                 if (!rt->tmp_dir)
4724                         return -ENOMEM;
4725
4726                 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4727                 assert(var_tmp_dir);
4728                 rt->var_tmp_dir = strdup(var_tmp_dir);
4729                 if (!rt->var_tmp_dir)
4730                         return -ENOMEM;
4731         }
4732
4733         if (netns_storage_socket) {
4734                 rt->netns_storage_socket[0] = netns_storage_socket[0];
4735                 rt->netns_storage_socket[1] = netns_storage_socket[1];
4736         }
4737
4738         r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4739         if (r < 0)
4740                 return r;
4741
4742         rt->manager = m;
4743
4744         if (ret)
4745                 *ret = rt;
4746
4747         /* do not remove created ExecRuntime object when the operation succeeds. */
4748         rt = NULL;
4749         return 0;
4750 }
4751
4752 static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4753         _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4754         _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4755         int r;
4756
4757         assert(m);
4758         assert(c);
4759         assert(id);
4760
4761         /* It is not necessary to create ExecRuntime object. */
4762         if (!c->private_network && !c->private_tmp)
4763                 return 0;
4764
4765         if (c->private_tmp) {
4766                 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
4767                 if (r < 0)
4768                         return r;
4769         }
4770
4771         if (c->private_network) {
4772                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4773                         return -errno;
4774         }
4775
4776         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4777         if (r < 0)
4778                 return r;
4779
4780         /* Avoid cleanup */
4781         netns_storage_socket[0] = -1;
4782         netns_storage_socket[1] = -1;
4783         return 1;
4784 }
4785
4786 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4787         ExecRuntime *rt;
4788         int r;
4789
4790         assert(m);
4791         assert(id);
4792         assert(ret);
4793
4794         rt = hashmap_get(m->exec_runtime_by_id, id);
4795         if (rt)
4796                 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4797                 goto ref;
4798
4799         if (!create)
4800                 return 0;
4801
4802         /* If not found, then create a new object. */
4803         r = exec_runtime_make(m, c, id, &rt);
4804         if (r <= 0)
4805                 /* When r == 0, it is not necessary to create ExecRuntime object. */
4806                 return r;
4807
4808 ref:
4809         /* increment reference counter. */
4810         rt->n_ref++;
4811         *ret = rt;
4812         return 1;
4813 }
4814
4815 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4816         if (!rt)
4817                 return NULL;
4818
4819         assert(rt->n_ref > 0);
4820
4821         rt->n_ref--;
4822         if (rt->n_ref > 0)
4823                 return NULL;
4824
4825         return exec_runtime_free(rt, destroy);
4826 }
4827
4828 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4829         ExecRuntime *rt;
4830         Iterator i;
4831
4832         assert(m);
4833         assert(f);
4834         assert(fds);
4835
4836         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4837                 fprintf(f, "exec-runtime=%s", rt->id);
4838
4839                 if (rt->tmp_dir)
4840                         fprintf(f, " tmp-dir=%s", rt->tmp_dir);
4841
4842                 if (rt->var_tmp_dir)
4843                         fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
4844
4845                 if (rt->netns_storage_socket[0] >= 0) {
4846                         int copy;
4847
4848                         copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4849                         if (copy < 0)
4850                                 return copy;
4851
4852                         fprintf(f, " netns-socket-0=%i", copy);
4853                 }
4854
4855                 if (rt->netns_storage_socket[1] >= 0) {
4856                         int copy;
4857
4858                         copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4859                         if (copy < 0)
4860                                 return copy;
4861
4862                         fprintf(f, " netns-socket-1=%i", copy);
4863                 }
4864
4865                 fputc('\n', f);
4866         }
4867
4868         return 0;
4869 }
4870
4871 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4872         _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4873         ExecRuntime *rt;
4874         int r;
4875
4876         /* This is for the migration from old (v237 or earlier) deserialization text.
4877          * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4878          * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4879          * so or not from the serialized text, then we always creates a new object owned by this. */
4880
4881         assert(u);
4882         assert(key);
4883         assert(value);
4884
4885         /* Manager manages ExecRuntime objects by the unit id.
4886          * So, we omit the serialized text when the unit does not have id (yet?)... */
4887         if (isempty(u->id)) {
4888                 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4889                 return 0;
4890         }
4891
4892         r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4893         if (r < 0) {
4894                 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4895                 return 0;
4896         }
4897
4898         rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4899         if (!rt) {
4900                 r = exec_runtime_allocate(&rt_create);
4901                 if (r < 0)
4902                         return log_oom();
4903
4904                 rt_create->id = strdup(u->id);
4905                 if (!rt_create->id)
4906                         return log_oom();
4907
4908                 rt = rt_create;
4909         }
4910
4911         if (streq(key, "tmp-dir")) {
4912                 char *copy;
4913
4914                 copy = strdup(value);
4915                 if (!copy)
4916                         return log_oom();
4917
4918                 free_and_replace(rt->tmp_dir, copy);
4919
4920         } else if (streq(key, "var-tmp-dir")) {
4921                 char *copy;
4922
4923                 copy = strdup(value);
4924                 if (!copy)
4925                         return log_oom();
4926
4927                 free_and_replace(rt->var_tmp_dir, copy);
4928
4929         } else if (streq(key, "netns-socket-0")) {
4930                 int fd;
4931
4932                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4933                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4934                         return 0;
4935                 }
4936
4937                 safe_close(rt->netns_storage_socket[0]);
4938                 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4939
4940         } else if (streq(key, "netns-socket-1")) {
4941                 int fd;
4942
4943                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
4944                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4945                         return 0;
4946                 }
4947
4948                 safe_close(rt->netns_storage_socket[1]);
4949                 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
4950         } else
4951                 return 0;
4952
4953         /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4954         if (rt_create) {
4955                 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4956                 if (r < 0) {
4957                         log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
4958                         return 0;
4959                 }
4960
4961                 rt_create->manager = u->manager;
4962
4963                 /* Avoid cleanup */
4964                 rt_create = NULL;
4965         }
4966
4967         return 1;
4968 }
4969
4970 void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4971         char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4972         int r, fd0 = -1, fd1 = -1;
4973         const char *p, *v = value;
4974         size_t n;
4975
4976         assert(m);
4977         assert(value);
4978         assert(fds);
4979
4980         n = strcspn(v, " ");
4981         id = strndupa(v, n);
4982         if (v[n] != ' ')
4983                 goto finalize;
4984         p = v + n + 1;
4985
4986         v = startswith(p, "tmp-dir=");
4987         if (v) {
4988                 n = strcspn(v, " ");
4989                 tmp_dir = strndupa(v, n);
4990                 if (v[n] != ' ')
4991                         goto finalize;
4992                 p = v + n + 1;
4993         }
4994
4995         v = startswith(p, "var-tmp-dir=");
4996         if (v) {
4997                 n = strcspn(v, " ");
4998                 var_tmp_dir = strndupa(v, n);
4999                 if (v[n] != ' ')
5000                         goto finalize;
5001                 p = v + n + 1;
5002         }
5003
5004         v = startswith(p, "netns-socket-0=");
5005         if (v) {
5006                 char *buf;
5007
5008                 n = strcspn(v, " ");
5009                 buf = strndupa(v, n);
5010                 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5011                         log_debug("Unable to process exec-runtime netns fd specification.");
5012                         return;
5013                 }
5014                 fd0 = fdset_remove(fds, fd0);
5015                 if (v[n] != ' ')
5016                         goto finalize;
5017                 p = v + n + 1;
5018         }
5019
5020         v = startswith(p, "netns-socket-1=");
5021         if (v) {
5022                 char *buf;
5023
5024                 n = strcspn(v, " ");
5025                 buf = strndupa(v, n);
5026                 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5027                         log_debug("Unable to process exec-runtime netns fd specification.");
5028                         return;
5029                 }
5030                 fd1 = fdset_remove(fds, fd1);
5031         }
5032
5033 finalize:
5034
5035         r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
5036         if (r < 0) {
5037                 log_debug_errno(r, "Failed to add exec-runtime: %m");
5038                 return;
5039         }
5040 }
5041
5042 void exec_runtime_vacuum(Manager *m) {
5043         ExecRuntime *rt;
5044         Iterator i;
5045
5046         assert(m);
5047
5048         /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5049
5050         HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5051                 if (rt->n_ref > 0)
5052                         continue;
5053
5054                 (void) exec_runtime_free(rt, false);
5055         }
5056 }
5057
5058 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5059         [EXEC_INPUT_NULL] = "null",
5060         [EXEC_INPUT_TTY] = "tty",
5061         [EXEC_INPUT_TTY_FORCE] = "tty-force",
5062         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
5063         [EXEC_INPUT_SOCKET] = "socket",
5064         [EXEC_INPUT_NAMED_FD] = "fd",
5065         [EXEC_INPUT_DATA] = "data",
5066         [EXEC_INPUT_FILE] = "file",
5067 };
5068
5069 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5070
5071 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
5072         [EXEC_OUTPUT_INHERIT] = "inherit",
5073         [EXEC_OUTPUT_NULL] = "null",
5074         [EXEC_OUTPUT_TTY] = "tty",
5075         [EXEC_OUTPUT_SYSLOG] = "syslog",
5076         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
5077         [EXEC_OUTPUT_KMSG] = "kmsg",
5078         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
5079         [EXEC_OUTPUT_JOURNAL] = "journal",
5080         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
5081         [EXEC_OUTPUT_SOCKET] = "socket",
5082         [EXEC_OUTPUT_NAMED_FD] = "fd",
5083         [EXEC_OUTPUT_FILE] = "file",
5084         [EXEC_OUTPUT_FILE_APPEND] = "append",
5085 };
5086
5087 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
5088
5089 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5090         [EXEC_UTMP_INIT] = "init",
5091         [EXEC_UTMP_LOGIN] = "login",
5092         [EXEC_UTMP_USER] = "user",
5093 };
5094
5095 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
5096
5097 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5098         [EXEC_PRESERVE_NO] = "no",
5099         [EXEC_PRESERVE_YES] = "yes",
5100         [EXEC_PRESERVE_RESTART] = "restart",
5101 };
5102
5103 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
5104
5105 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5106         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5107         [EXEC_DIRECTORY_STATE] = "StateDirectory",
5108         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5109         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5110         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5111 };
5112
5113 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
5114
5115 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5116         [EXEC_KEYRING_INHERIT] = "inherit",
5117         [EXEC_KEYRING_PRIVATE] = "private",
5118         [EXEC_KEYRING_SHARED] = "shared",
5119 };
5120
5121 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);