src/core/execute.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2010 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <glob.h>
  23 #include <grp.h>
  24 #include <poll.h>
  25 #include <signal.h>
  26 #include <string.h>
  27 #include <sys/capability.h>
  28 #include <sys/eventfd.h>
  29 #include <sys/mman.h>
  30 #include <sys/personality.h>
  31 #include <sys/prctl.h>
  32 #include <sys/shm.h>
  33 #include <sys/socket.h>
  34 #include <sys/stat.h>
  35 #include <sys/types.h>
  36 #include <sys/un.h>
  37 #include <unistd.h>
  38 #include <utmpx.h>
  39
  40 #ifdef HAVE_PAM
  41 #include <security/pam_appl.h>
  42 #endif
  43
  44 #ifdef HAVE_SELINUX
  45 #include <selinux/selinux.h>
  46 #endif
  47
  48 #ifdef HAVE_SECCOMP
  49 #include <seccomp.h>
  50 #endif
  51
  52 #ifdef HAVE_APPARMOR
  53 #include <sys/apparmor.h>
  54 #endif
  55
  56 #include "sd-messages.h"
  57
  58 #include "af-list.h"
  59 #include "alloc-util.h"
  60 #ifdef HAVE_APPARMOR
  61 #include "apparmor-util.h"
  62 #endif
  63 #include "async.h"
  64 #include "barrier.h"
  65 #include "cap-list.h"
  66 #include "capability-util.h"
  67 #include "def.h"
  68 #include "env-util.h"
  69 #include "errno-list.h"
  70 #include "execute.h"
  71 #include "exit-status.h"
  72 #include "fd-util.h"
  73 #include "fileio.h"
  74 #include "format-util.h"
  75 #include "fs-util.h"
  76 #include "glob-util.h"
  77 #include "io-util.h"
  78 #include "ioprio.h"
  79 #include "log.h"
  80 #include "macro.h"
  81 #include "missing.h"
  82 #include "mkdir.h"
  83 #include "namespace.h"
  84 #include "parse-util.h"
  85 #include "path-util.h"
  86 #include "process-util.h"
  87 #include "rlimit-util.h"
  88 #include "rm-rf.h"
  89 #ifdef HAVE_SECCOMP
  90 #include "seccomp-util.h"
  91 #endif
  92 #include "securebits.h"
  93 #include "securebits-util.h"
  94 #include "selinux-util.h"
  95 #include "signal-util.h"
  96 #include "smack-util.h"
  97 #include "special.h"
  98 #include "string-table.h"
  99 #include "string-util.h"
 100 #include "strv.h"
 101 #include "syslog-util.h"
 102 #include "terminal-util.h"
 103 #include "unit.h"
 104 #include "user-util.h"
 105 #include "util.h"
 106 #include "utmp-wtmp.h"
 107
/* Two timeouts, expressed in microseconds (5s and 1s). They are not referenced in this part of
 * the file; NOTE(review): presumably used by the idle-wait logic further down — confirm. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to (and then verifies on) a TTY. */
/* This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket in
 * connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
 115
 116 static int shift_fds(int fds[], unsigned n_fds) {
 117         int start, restart_from;
 118
 119         if (n_fds <= 0)
 120                 return 0;
 121
 122         /* Modifies the fds array! (sorts it) */
 123
 124         assert(fds);
 125
 126         start = 0;
 127         for (;;) {
 128                 int i;
 129
 130                 restart_from = -1;
 131
 132                 for (i = start; i < (int) n_fds; i++) {
 133                         int nfd;
 134
 135                         /* Already at right index? */
 136                         if (fds[i] == i+3)
 137                                 continue;
 138
 139                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 140                         if (nfd < 0)
 141                                 return -errno;
 142
 143                         safe_close(fds[i]);
 144                         fds[i] = nfd;
 145
 146                         /* Hmm, the fd we wanted isn't free? Then
 147                          * let's remember that and try again from here */
 148                         if (nfd != i+3 && restart_from < 0)
 149                                 restart_from = i;
 150                 }
 151
 152                 if (restart_from < 0)
 153                         break;
 154
 155                 start = restart_from;
 156         }
 157
 158         return 0;
 159 }
 160
 161 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 162         unsigned i, n_fds;
 163         int r;
 164
 165         n_fds = n_storage_fds + n_socket_fds;
 166         if (n_fds <= 0)
 167                 return 0;
 168
 169         assert(fds);
 170
 171         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 172          * O_NONBLOCK only applies to socket activation though. */
 173
 174         for (i = 0; i < n_fds; i++) {
 175
 176                 if (i < n_socket_fds) {
 177                         r = fd_nonblock(fds[i], nonblock);
 178                         if (r < 0)
 179                                 return r;
 180                 }
 181
 182                 /* We unconditionally drop FD_CLOEXEC from the fds,
 183                  * since after all we want to pass these fds to our
 184                  * children */
 185
 186                 r = fd_cloexec(fds[i], false);
 187                 if (r < 0)
 188                         return r;
 189         }
 190
 191         return 0;
 192 }
 193
 194 static const char *exec_context_tty_path(const ExecContext *context) {
 195         assert(context);
 196
 197         if (context->stdio_as_fds)
 198                 return NULL;
 199
 200         if (context->tty_path)
 201                 return context->tty_path;
 202
 203         return "/dev/console";
 204 }
 205
 206 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 207         const char *path;
 208
 209         assert(context);
 210
 211         path = exec_context_tty_path(context);
 212
 213         if (context->tty_vhangup) {
 214                 if (p && p->stdin_fd >= 0)
 215                         (void) terminal_vhangup_fd(p->stdin_fd);
 216                 else if (path)
 217                         (void) terminal_vhangup(path);
 218         }
 219
 220         if (context->tty_reset) {
 221                 if (p && p->stdin_fd >= 0)
 222                         (void) reset_terminal_fd(p->stdin_fd, true);
 223                 else if (path)
 224                         (void) reset_terminal(path);
 225         }
 226
 227         if (context->tty_vt_disallocate && path)
 228                 (void) vt_disallocate(path);
 229 }
 230
 231 static bool is_terminal_input(ExecInput i) {
 232         return IN_SET(i,
 233                       EXEC_INPUT_TTY,
 234                       EXEC_INPUT_TTY_FORCE,
 235                       EXEC_INPUT_TTY_FAIL);
 236 }
 237
 238 static bool is_terminal_output(ExecOutput o) {
 239         return IN_SET(o,
 240                       EXEC_OUTPUT_TTY,
 241                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 242                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 243                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 244 }
 245
 246 static bool is_syslog_output(ExecOutput o) {
 247         return IN_SET(o,
 248                       EXEC_OUTPUT_SYSLOG,
 249                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 250 }
 251
 252 static bool is_kmsg_output(ExecOutput o) {
 253         return IN_SET(o,
 254                       EXEC_OUTPUT_KMSG,
 255                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 256 }
 257
 258 static bool exec_context_needs_term(const ExecContext *c) {
 259         assert(c);
 260
 261         /* Return true if the execution context suggests we should set $TERM to something useful. */
 262
 263         if (is_terminal_input(c->std_input))
 264                 return true;
 265
 266         if (is_terminal_output(c->std_output))
 267                 return true;
 268
 269         if (is_terminal_output(c->std_error))
 270                 return true;
 271
 272         return !!c->tty_path;
 273 }
 274
 275 static int open_null_as(int flags, int nfd) {
 276         int fd, r;
 277
 278         assert(nfd >= 0);
 279
 280         fd = open("/dev/null", flags|O_NOCTTY);
 281         if (fd < 0)
 282                 return -errno;
 283
 284         if (fd != nfd) {
 285                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 286                 safe_close(fd);
 287         } else
 288                 r = nfd;
 289
 290         return r;
 291 }
 292
 293 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 294         static const union sockaddr_union sa = {
 295                 .un.sun_family = AF_UNIX,
 296                 .un.sun_path = "/run/systemd/journal/stdout",
 297         };
 298         uid_t olduid = UID_INVALID;
 299         gid_t oldgid = GID_INVALID;
 300         int r;
 301
 302         if (gid_is_valid(gid)) {
 303                 oldgid = getgid();
 304
 305                 if (setegid(gid) < 0)
 306                         return -errno;
 307         }
 308
 309         if (uid_is_valid(uid)) {
 310                 olduid = getuid();
 311
 312                 if (seteuid(uid) < 0) {
 313                         r = -errno;
 314                         goto restore_gid;
 315                 }
 316         }
 317
 318         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 319
 320         /* If we fail to restore the uid or gid, things will likely
 321            fail later on. This should only happen if an LSM interferes. */
 322
 323         if (uid_is_valid(uid))
 324                 (void) seteuid(olduid);
 325
 326  restore_gid:
 327         if (gid_is_valid(gid))
 328                 (void) setegid(oldgid);
 329
 330         return r;
 331 }
 332
 333 static int connect_logger_as(
 334                 Unit *unit,
 335                 const ExecContext *context,
 336                 const ExecParameters *params,
 337                 ExecOutput output,
 338                 const char *ident,
 339                 int nfd,
 340                 uid_t uid,
 341                 gid_t gid) {
 342
 343         int fd, r;
 344
 345         assert(context);
 346         assert(params);
 347         assert(output < _EXEC_OUTPUT_MAX);
 348         assert(ident);
 349         assert(nfd >= 0);
 350
 351         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 352         if (fd < 0)
 353                 return -errno;
 354
 355         r = connect_journal_socket(fd, uid, gid);
 356         if (r < 0)
 357                 return r;
 358
 359         if (shutdown(fd, SHUT_RD) < 0) {
 360                 safe_close(fd);
 361                 return -errno;
 362         }
 363
 364         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 365
 366         dprintf(fd,
 367                 "%s\n"
 368                 "%s\n"
 369                 "%i\n"
 370                 "%i\n"
 371                 "%i\n"
 372                 "%i\n"
 373                 "%i\n",
 374                 context->syslog_identifier ?: ident,
 375                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 376                 context->syslog_priority,
 377                 !!context->syslog_level_prefix,
 378                 is_syslog_output(output),
 379                 is_kmsg_output(output),
 380                 is_terminal_output(output));
 381
 382         if (fd == nfd)
 383                 return nfd;
 384
 385         r = dup2(fd, nfd) < 0 ? -errno : nfd;
 386         safe_close(fd);
 387
 388         return r;
 389 }
 390 static int open_terminal_as(const char *path, mode_t mode, int nfd) {
 391         int fd, r;
 392
 393         assert(path);
 394         assert(nfd >= 0);
 395
 396         fd = open_terminal(path, mode | O_NOCTTY);
 397         if (fd < 0)
 398                 return fd;
 399
 400         if (fd != nfd) {
 401                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 402                 safe_close(fd);
 403         } else
 404                 r = nfd;
 405
 406         return r;
 407 }
 408
 409 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
 410
 411         if (is_terminal_input(std_input) && !apply_tty_stdin)
 412                 return EXEC_INPUT_NULL;
 413
 414         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 415                 return EXEC_INPUT_NULL;
 416
 417         return std_input;
 418 }
 419
 420 static int fixup_output(ExecOutput std_output, int socket_fd) {
 421
 422         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 423                 return EXEC_OUTPUT_INHERIT;
 424
 425         return std_output;
 426 }
 427
 428 static int setup_input(
 429                 const ExecContext *context,
 430                 const ExecParameters *params,
 431                 int socket_fd,
 432                 int named_iofds[3]) {
 433
 434         ExecInput i;
 435
 436         assert(context);
 437         assert(params);
 438
 439         if (params->stdin_fd >= 0) {
 440                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 441                         return -errno;
 442
 443                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 444                 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 445                 (void) reset_terminal_fd(STDIN_FILENO, true);
 446
 447                 return STDIN_FILENO;
 448         }
 449
 450         i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 451
 452         switch (i) {
 453
 454         case EXEC_INPUT_NULL:
 455                 return open_null_as(O_RDONLY, STDIN_FILENO);
 456
 457         case EXEC_INPUT_TTY:
 458         case EXEC_INPUT_TTY_FORCE:
 459         case EXEC_INPUT_TTY_FAIL: {
 460                 int fd, r;
 461
 462                 fd = acquire_terminal(exec_context_tty_path(context),
 463                                       i == EXEC_INPUT_TTY_FAIL,
 464                                       i == EXEC_INPUT_TTY_FORCE,
 465                                       false,
 466                                       USEC_INFINITY);
 467                 if (fd < 0)
 468                         return fd;
 469
 470                 if (fd != STDIN_FILENO) {
 471                         r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 472                         safe_close(fd);
 473                 } else
 474                         r = STDIN_FILENO;
 475
 476                 return r;
 477         }
 478
 479         case EXEC_INPUT_SOCKET:
 480                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 481
 482         case EXEC_INPUT_NAMED_FD:
 483                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 484                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 485
 486         default:
 487                 assert_not_reached("Unknown input type");
 488         }
 489 }
 490
 491 static int setup_output(
 492                 Unit *unit,
 493                 const ExecContext *context,
 494                 const ExecParameters *params,
 495                 int fileno,
 496                 int socket_fd,
 497                 int named_iofds[3],
 498                 const char *ident,
 499                 uid_t uid,
 500                 gid_t gid,
 501                 dev_t *journal_stream_dev,
 502                 ino_t *journal_stream_ino) {
 503
 504         ExecOutput o;
 505         ExecInput i;
 506         int r;
 507
 508         assert(unit);
 509         assert(context);
 510         assert(params);
 511         assert(ident);
 512         assert(journal_stream_dev);
 513         assert(journal_stream_ino);
 514
 515         if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
 516
 517                 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
 518                         return -errno;
 519
 520                 return STDOUT_FILENO;
 521         }
 522
 523         if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
 524                 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
 525                         return -errno;
 526
 527                 return STDERR_FILENO;
 528         }
 529
 530         i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 531         o = fixup_output(context->std_output, socket_fd);
 532
 533         if (fileno == STDERR_FILENO) {
 534                 ExecOutput e;
 535                 e = fixup_output(context->std_error, socket_fd);
 536
 537                 /* This expects the input and output are already set up */
 538
 539                 /* Don't change the stderr file descriptor if we inherit all
 540                  * the way and are not on a tty */
 541                 if (e == EXEC_OUTPUT_INHERIT &&
 542                     o == EXEC_OUTPUT_INHERIT &&
 543                     i == EXEC_INPUT_NULL &&
 544                     !is_terminal_input(context->std_input) &&
 545                     getppid () != 1)
 546                         return fileno;
 547
 548                 /* Duplicate from stdout if possible */
 549                 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
 550                         return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
 551
 552                 o = e;
 553
 554         } else if (o == EXEC_OUTPUT_INHERIT) {
 555                 /* If input got downgraded, inherit the original value */
 556                 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
 557                         return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 558
 559                 /* If the input is connected to anything that's not a /dev/null, inherit that... */
 560                 if (i != EXEC_INPUT_NULL)
 561                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 562
 563                 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
 564                 if (getppid() != 1)
 565                         return fileno;
 566
 567                 /* We need to open /dev/null here anew, to get the right access mode. */
 568                 return open_null_as(O_WRONLY, fileno);
 569         }
 570
 571         switch (o) {
 572
 573         case EXEC_OUTPUT_NULL:
 574                 return open_null_as(O_WRONLY, fileno);
 575
 576         case EXEC_OUTPUT_TTY:
 577                 if (is_terminal_input(i))
 578                         return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
 579
 580                 /* We don't reset the terminal if this is just about output */
 581                 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
 582
 583         case EXEC_OUTPUT_SYSLOG:
 584         case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
 585         case EXEC_OUTPUT_KMSG:
 586         case EXEC_OUTPUT_KMSG_AND_CONSOLE:
 587         case EXEC_OUTPUT_JOURNAL:
 588         case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
 589                 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
 590                 if (r < 0) {
 591                         log_unit_error_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
 592                         r = open_null_as(O_WRONLY, fileno);
 593                 } else {
 594                         struct stat st;
 595
 596                         /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
 597                          * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
 598                          * services to detect whether they are connected to the journal or not.
 599                          *
 600                          * If both stdout and stderr are connected to a stream then let's make sure to store the data
 601                          * about STDERR as that's usually the best way to do logging. */
 602
 603                         if (fstat(fileno, &st) >= 0 &&
 604                             (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
 605                                 *journal_stream_dev = st.st_dev;
 606                                 *journal_stream_ino = st.st_ino;
 607                         }
 608                 }
 609                 return r;
 610
 611         case EXEC_OUTPUT_SOCKET:
 612                 assert(socket_fd >= 0);
 613                 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
 614
 615         case EXEC_OUTPUT_NAMED_FD:
 616                 (void) fd_nonblock(named_iofds[fileno], false);
 617                 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
 618
 619         default:
 620                 assert_not_reached("Unknown error type");
 621         }
 622 }
 623
 624 static int chown_terminal(int fd, uid_t uid) {
 625         struct stat st;
 626
 627         assert(fd >= 0);
 628
 629         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 630         if (isatty(fd) < 1)
 631                 return 0;
 632
 633         /* This might fail. What matters are the results. */
 634         (void) fchown(fd, uid, -1);
 635         (void) fchmod(fd, TTY_MODE);
 636
 637         if (fstat(fd, &st) < 0)
 638                 return -errno;
 639
 640         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 641                 return -EPERM;
 642
 643         return 0;
 644 }
 645
 646 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 647         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 648         int r;
 649
 650         assert(_saved_stdin);
 651         assert(_saved_stdout);
 652
 653         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 654         if (saved_stdin < 0)
 655                 return -errno;
 656
 657         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 658         if (saved_stdout < 0)
 659                 return -errno;
 660
 661         fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
 662         if (fd < 0)
 663                 return fd;
 664
 665         r = chown_terminal(fd, getuid());
 666         if (r < 0)
 667                 return r;
 668
 669         r = reset_terminal_fd(fd, true);
 670         if (r < 0)
 671                 return r;
 672
 673         if (dup2(fd, STDIN_FILENO) < 0)
 674                 return -errno;
 675
 676         if (dup2(fd, STDOUT_FILENO) < 0)
 677                 return -errno;
 678
 679         if (fd >= 2)
 680                 safe_close(fd);
 681         fd = -1;
 682
 683         *_saved_stdin = saved_stdin;
 684         *_saved_stdout = saved_stdout;
 685
 686         saved_stdin = saved_stdout = -1;
 687
 688         return 0;
 689 }
 690
 691 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 692         assert(err < 0);
 693
 694         if (err == -ETIMEDOUT)
 695                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 696         else {
 697                 errno = -err;
 698                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 699         }
 700 }
 701
 702 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 703         _cleanup_close_ int fd = -1;
 704
 705         assert(vc);
 706
 707         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 708         if (fd < 0)
 709                 return;
 710
 711         write_confirm_error_fd(err, fd, u);
 712 }
 713
 714 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 715         int r = 0;
 716
 717         assert(saved_stdin);
 718         assert(saved_stdout);
 719
 720         release_terminal();
 721
 722         if (*saved_stdin >= 0)
 723                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 724                         r = -errno;
 725
 726         if (*saved_stdout >= 0)
 727                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 728                         r = -errno;
 729
 730         *saved_stdin = safe_close(*saved_stdin);
 731         *saved_stdout = safe_close(*saved_stdout);
 732
 733         return r;
 734 }
 735
 736 enum {
 737         CONFIRM_PRETEND_FAILURE = -1,
 738         CONFIRM_PRETEND_SUCCESS =  0,
 739         CONFIRM_EXECUTE = 1,
 740 };
 741
 742 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 743         int saved_stdout = -1, saved_stdin = -1, r;
 744         _cleanup_free_ char *e = NULL;
 745         char c;
 746
 747         /* For any internal errors, assume a positive response. */
 748         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 749         if (r < 0) {
 750                 write_confirm_error(r, vc, u);
 751                 return CONFIRM_EXECUTE;
 752         }
 753
 754         /* confirm_spawn might have been disabled while we were sleeping. */
 755         if (manager_is_confirm_spawn_disabled(u->manager)) {
 756                 r = 1;
 757                 goto restore_stdio;
 758         }
 759
 760         e = ellipsize(cmdline, 60, 100);
 761         if (!e) {
 762                 log_oom();
 763                 r = CONFIRM_EXECUTE;
 764                 goto restore_stdio;
 765         }
 766
 767         for (;;) {
 768                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 769                 if (r < 0) {
 770                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 771                         r = CONFIRM_EXECUTE;
 772                         goto restore_stdio;
 773                 }
 774
 775                 switch (c) {
 776                 case 'c':
 777                         printf("Resuming normal execution.\n");
 778                         manager_disable_confirm_spawn();
 779                         r = 1;
 780                         break;
 781                 case 'D':
 782                         unit_dump(u, stdout, "  ");
 783                         continue; /* ask again */
 784                 case 'f':
 785                         printf("Failing execution.\n");
 786                         r = CONFIRM_PRETEND_FAILURE;
 787                         break;
 788                 case 'h':
 789                         printf("  c - continue, proceed without asking anymore\n"
 790                                "  D - dump, show the state of the unit\n"
 791                                "  f - fail, don't execute the command and pretend it failed\n"
 792                                "  h - help\n"
 793                                "  i - info, show a short summary of the unit\n"
 794                                "  j - jobs, show jobs that are in progress\n"
 795                                "  s - skip, don't execute the command and pretend it succeeded\n"
 796                                "  y - yes, execute the command\n");
 797                         continue; /* ask again */
 798                 case 'i':
 799                         printf("  Description: %s\n"
 800                                "  Unit:        %s\n"
 801                                "  Command:     %s\n",
 802                                u->id, u->description, cmdline);
 803                         continue; /* ask again */
 804                 case 'j':
 805                         manager_dump_jobs(u->manager, stdout, "  ");
 806                         continue; /* ask again */
 807                 case 'n':
 808                         /* 'n' was removed in favor of 'f'. */
 809                         printf("Didn't understand 'n', did you mean 'f'?\n");
 810                         continue; /* ask again */
 811                 case 's':
 812                         printf("Skipping execution.\n");
 813                         r = CONFIRM_PRETEND_SUCCESS;
 814                         break;
 815                 case 'y':
 816                         r = CONFIRM_EXECUTE;
 817                         break;
 818                 default:
 819                         assert_not_reached("Unhandled choice");
 820                 }
 821                 break;
 822         }
 823
 824 restore_stdio:
 825         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 826         return r;
 827 }
 828
 829 static int get_fixed_user(const ExecContext *c, const char **user,
 830                           uid_t *uid, gid_t *gid,
 831                           const char **home, const char **shell) {
 832         int r;
 833         const char *name;
 834
 835         assert(c);
 836
 837         if (!c->user)
 838                 return 0;
 839
 840         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 841          * (i.e. are "/" or "/bin/nologin"). */
 842
 843         name = c->user;
 844         r = get_user_creds_clean(&name, uid, gid, home, shell);
 845         if (r < 0)
 846                 return r;
 847
 848         *user = name;
 849         return 0;
 850 }
 851
 852 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 853         int r;
 854         const char *name;
 855
 856         assert(c);
 857
 858         if (!c->group)
 859                 return 0;
 860
 861         name = c->group;
 862         r = get_group_creds(&name, gid);
 863         if (r < 0)
 864                 return r;
 865
 866         *group = name;
 867         return 0;
 868 }
 869
 870 static int get_supplementary_groups(const ExecContext *c, const char *user,
 871                                     const char *group, gid_t gid,
 872                                     gid_t **supplementary_gids, int *ngids) {
 873         char **i;
 874         int r, k = 0;
 875         int ngroups_max;
 876         bool keep_groups = false;
 877         gid_t *groups = NULL;
 878         _cleanup_free_ gid_t *l_gids = NULL;
 879
 880         assert(c);
 881
 882         /*
 883          * If user is given, then lookup GID and supplementary groups list.
 884          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 885          * here and as early as possible so we keep the list of supplementary
 886          * groups of the caller.
 887          */
 888         if (user && gid_is_valid(gid) && gid != 0) {
 889                 /* First step, initialize groups from /etc/groups */
 890                 if (initgroups(user, gid) < 0)
 891                         return -errno;
 892
 893                 keep_groups = true;
 894         }
 895
 896         if (!c->supplementary_groups)
 897                 return 0;
 898
 899         /*
 900          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 901          * be positive, otherwise fail.
 902          */
 903         errno = 0;
 904         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 905         if (ngroups_max <= 0) {
 906                 if (errno > 0)
 907                         return -errno;
 908                 else
 909                         return -EOPNOTSUPP; /* For all other values */
 910         }
 911
 912         l_gids = new(gid_t, ngroups_max);
 913         if (!l_gids)
 914                 return -ENOMEM;
 915
 916         if (keep_groups) {
 917                 /*
 918                  * Lookup the list of groups that the user belongs to, we
 919                  * avoid NSS lookups here too for gid=0.
 920                  */
 921                 k = ngroups_max;
 922                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 923                         return -EINVAL;
 924         } else
 925                 k = 0;
 926
 927         STRV_FOREACH(i, c->supplementary_groups) {
 928                 const char *g;
 929
 930                 if (k >= ngroups_max)
 931                         return -E2BIG;
 932
 933                 g = *i;
 934                 r = get_group_creds(&g, l_gids+k);
 935                 if (r < 0)
 936                         return r;
 937
 938                 k++;
 939         }
 940
 941         /*
 942          * Sets ngids to zero to drop all supplementary groups, happens
 943          * when we are under root and SupplementaryGroups= is empty.
 944          */
 945         if (k == 0) {
 946                 *ngids = 0;
 947                 return 0;
 948         }
 949
 950         /* Otherwise get the final list of supplementary groups */
 951         groups = memdup(l_gids, sizeof(gid_t) * k);
 952         if (!groups)
 953                 return -ENOMEM;
 954
 955         *supplementary_gids = groups;
 956         *ngids = k;
 957
 958         groups = NULL;
 959
 960         return 0;
 961 }
 962
 963 static int enforce_groups(const ExecContext *context, gid_t gid,
 964                           gid_t *supplementary_gids, int ngids) {
 965         int r;
 966
 967         assert(context);
 968
 969         /* Handle SupplementaryGroups= even if it is empty */
 970         if (context->supplementary_groups) {
 971                 r = maybe_setgroups(ngids, supplementary_gids);
 972                 if (r < 0)
 973                         return r;
 974         }
 975
 976         if (gid_is_valid(gid)) {
 977                 /* Then set our gids */
 978                 if (setresgid(gid, gid, gid) < 0)
 979                         return -errno;
 980         }
 981
 982         return 0;
 983 }
 984
 985 static int enforce_user(const ExecContext *context, uid_t uid) {
 986         assert(context);
 987
 988         if (!uid_is_valid(uid))
 989                 return 0;
 990
 991         /* Sets (but doesn't look up) the uid and make sure we keep the
 992          * capabilities while doing so. */
 993
 994         if (context->capability_ambient_set != 0) {
 995
 996                 /* First step: If we need to keep capabilities but
 997                  * drop privileges we need to make sure we keep our
 998                  * caps, while we drop privileges. */
 999                 if (uid != 0) {
1000                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1001
1002                         if (prctl(PR_GET_SECUREBITS) != sb)
1003                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1004                                         return -errno;
1005                 }
1006         }
1007
1008         /* Second step: actually set the uids */
1009         if (setresuid(uid, uid, uid) < 0)
1010                 return -errno;
1011
1012         /* At this point we should have all necessary capabilities but
1013            are otherwise a normal user. However, the caps might got
1014            corrupted due to the setresuid() so we need clean them up
1015            later. This is done outside of this call. */
1016
1017         return 0;
1018 }
1019
1020 #ifdef HAVE_PAM
1021
1022 static int null_conv(
1023                 int num_msg,
1024                 const struct pam_message **msg,
1025                 struct pam_response **resp,
1026                 void *appdata_ptr) {
1027
1028         /* We don't support conversations */
1029
1030         return PAM_CONV_ERR;
1031 }
1032
1033 #endif
1034
/* Opens a PAM session for the service and forks off a helper process that closes it again.
 *
 *   name   - PAM service name passed to pam_start()
 *   user   - user name passed to pam_start()
 *   uid    - UID the helper process switches to
 *   gid    - GID the helper process switches to
 *   tty    - optional TTY, set as PAM_TTY if non-NULL
 *   env    - in: environment exported to PAM; out (on success): replaced by the environment
 *            list returned by PAM, the previous list is freed
 *   fds    - file descriptors that are closed in the helper process
 *   n_fds  - number of entries in fds
 *
 * Returns 0 on success (and always if built without PAM support). On failure a negative
 * errno-style value is returned; PAM errors are reported as -EPERM. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#ifdef HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports failure by returning a
                                 * positive error number; it neither returns a negative
                                 * value nor sets errno. Hence check the return value
                                 * directly, so that we never look at an uninitialized
                                 * 'sig' below. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment PAM gave us back to the caller */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1251
/* Renames the current process to "(<name>)", where <name> is derived from the last component
 * of 'path', shortened to its final 8 characters if longer. If there is no usable last
 * component, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *tail;
        size_t n;

        /* The result must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: 8 characters of the name plus the two parentheses. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name rather than the beginning, it's usually the more
                 * interesting part, since the start might just be "systemd-". */
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1282
1283 static bool context_has_address_families(const ExecContext *c) {
1284         assert(c);
1285
1286         return c->address_families_whitelist ||
1287                 !set_isempty(c->address_families);
1288 }
1289
1290 static bool context_has_syscall_filters(const ExecContext *c) {
1291         assert(c);
1292
1293         return c->syscall_whitelist ||
1294                 !set_isempty(c->syscall_filter);
1295 }
1296
1297 static bool context_has_no_new_privileges(const ExecContext *c) {
1298         assert(c);
1299
1300         if (c->no_new_privileges)
1301                 return true;
1302
1303         if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1304                 return false;
1305
1306         /* We need NNP if we have any form of seccomp and are unprivileged */
1307         return context_has_address_families(c) ||
1308                 c->memory_deny_write_execute ||
1309                 c->restrict_realtime ||
1310                 exec_context_restrict_namespaces_set(c) ||
1311                 c->protect_kernel_tunables ||
1312                 c->protect_kernel_modules ||
1313                 c->private_devices ||
1314                 context_has_syscall_filters(c) ||
1315                 !set_isempty(c->syscall_archs) ||
1316                 c->lock_personality;
1317 }
1318
1319 #ifdef HAVE_SECCOMP
1320
1321 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1322
1323         if (is_seccomp_available())
1324                 return false;
1325
1326         log_open();
1327         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1328         log_close();
1329         return true;
1330 }
1331
1332 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1333         uint32_t negative_action, default_action, action;
1334         int r;
1335
1336         assert(u);
1337         assert(c);
1338
1339         if (!context_has_syscall_filters(c))
1340                 return 0;
1341
1342         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1343                 return 0;
1344
1345         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1346
1347         if (c->syscall_whitelist) {
1348                 default_action = negative_action;
1349                 action = SCMP_ACT_ALLOW;
1350         } else {
1351                 default_action = SCMP_ACT_ALLOW;
1352                 action = negative_action;
1353         }
1354
1355         if (needs_ambient_hack) {
1356                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1357                 if (r < 0)
1358                         return r;
1359         }
1360
1361         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1362 }
1363
1364 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1365         assert(u);
1366         assert(c);
1367
1368         if (set_isempty(c->syscall_archs))
1369                 return 0;
1370
1371         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1372                 return 0;
1373
1374         return seccomp_restrict_archs(c->syscall_archs);
1375 }
1376
1377 static int apply_address_families(const Unit* u, const ExecContext *c) {
1378         assert(u);
1379         assert(c);
1380
1381         if (!context_has_address_families(c))
1382                 return 0;
1383
1384         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1385                 return 0;
1386
1387         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1388 }
1389
1390 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1391         assert(u);
1392         assert(c);
1393
1394         if (!c->memory_deny_write_execute)
1395                 return 0;
1396
1397         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1398                 return 0;
1399
1400         return seccomp_memory_deny_write_execute();
1401 }
1402
1403 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1404         assert(u);
1405         assert(c);
1406
1407         if (!c->restrict_realtime)
1408                 return 0;
1409
1410         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1411                 return 0;
1412
1413         return seccomp_restrict_realtime();
1414 }
1415
1416 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1417         assert(u);
1418         assert(c);
1419
1420         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1421          * let's protect even those systems where this is left on in the kernel. */
1422
1423         if (!c->protect_kernel_tunables)
1424                 return 0;
1425
1426         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1427                 return 0;
1428
1429         return seccomp_protect_sysctl();
1430 }
1431
1432 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1433         assert(u);
1434         assert(c);
1435
1436         /* Turn off module syscalls on ProtectKernelModules=yes */
1437
1438         if (!c->protect_kernel_modules)
1439                 return 0;
1440
1441         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1442                 return 0;
1443
1444         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1445 }
1446
1447 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1448         assert(u);
1449         assert(c);
1450
1451         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1452
1453         if (!c->private_devices)
1454                 return 0;
1455
1456         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1457                 return 0;
1458
1459         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1460 }
1461
1462 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1463         assert(u);
1464         assert(c);
1465
1466         if (!exec_context_restrict_namespaces_set(c))
1467                 return 0;
1468
1469         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1470                 return 0;
1471
1472         return seccomp_restrict_namespaces(c->restrict_namespaces);
1473 }
1474
1475 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1476         unsigned long personality;
1477         int r;
1478
1479         assert(u);
1480         assert(c);
1481
1482         if (!c->lock_personality)
1483                 return 0;
1484
1485         if (skip_seccomp_unavailable(u, "LockPersonality="))
1486                 return 0;
1487
1488         personality = c->personality;
1489
1490         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1491         if (personality == PERSONALITY_INVALID) {
1492
1493                 r = opinionated_personality(&personality);
1494                 if (r < 0)
1495                         return r;
1496         }
1497
1498         return seccomp_lock_personality(personality);
1499 }
1500
1501 #endif
1502
1503 static void do_idle_pipe_dance(int idle_pipe[4]) {
1504         assert(idle_pipe);
1505
1506         idle_pipe[1] = safe_close(idle_pipe[1]);
1507         idle_pipe[2] = safe_close(idle_pipe[2]);
1508
1509         if (idle_pipe[0] >= 0) {
1510                 int r;
1511
1512                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1513
1514                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1515                         ssize_t n;
1516
1517                         /* Signal systemd that we are bored and want to continue. */
1518                         n = write(idle_pipe[3], "x", 1);
1519                         if (n > 0)
1520                                 /* Wait for systemd to react to the signal above. */
1521                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1522                 }
1523
1524                 idle_pipe[0] = safe_close(idle_pipe[0]);
1525
1526         }
1527
1528         idle_pipe[3] = safe_close(idle_pipe[3]);
1529 }
1530
1531 static int build_environment(
1532                 Unit *u,
1533                 const ExecContext *c,
1534                 const ExecParameters *p,
1535                 unsigned n_fds,
1536                 const char *home,
1537                 const char *username,
1538                 const char *shell,
1539                 dev_t journal_stream_dev,
1540                 ino_t journal_stream_ino,
1541                 char ***ret) {
1542
1543         _cleanup_strv_free_ char **our_env = NULL;
1544         unsigned n_env = 0;
1545         char *x;
1546
1547         assert(u);
1548         assert(c);
1549         assert(ret);
1550
1551         our_env = new0(char*, 14);
1552         if (!our_env)
1553                 return -ENOMEM;
1554
1555         if (n_fds > 0) {
1556                 _cleanup_free_ char *joined = NULL;
1557
1558                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1559                         return -ENOMEM;
1560                 our_env[n_env++] = x;
1561
1562                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1563                         return -ENOMEM;
1564                 our_env[n_env++] = x;
1565
1566                 joined = strv_join(p->fd_names, ":");
1567                 if (!joined)
1568                         return -ENOMEM;
1569
1570                 x = strjoin("LISTEN_FDNAMES=", joined);
1571                 if (!x)
1572                         return -ENOMEM;
1573                 our_env[n_env++] = x;
1574         }
1575
1576         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1577                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1578                         return -ENOMEM;
1579                 our_env[n_env++] = x;
1580
1581                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1582                         return -ENOMEM;
1583                 our_env[n_env++] = x;
1584         }
1585
1586         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1587          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1588          * check the database directly. */
1589         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1590                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1591                 if (!x)
1592                         return -ENOMEM;
1593                 our_env[n_env++] = x;
1594         }
1595
1596         if (home) {
1597                 x = strappend("HOME=", home);
1598                 if (!x)
1599                         return -ENOMEM;
1600                 our_env[n_env++] = x;
1601         }
1602
1603         if (username) {
1604                 x = strappend("LOGNAME=", username);
1605                 if (!x)
1606                         return -ENOMEM;
1607                 our_env[n_env++] = x;
1608
1609                 x = strappend("USER=", username);
1610                 if (!x)
1611                         return -ENOMEM;
1612                 our_env[n_env++] = x;
1613         }
1614
1615         if (shell) {
1616                 x = strappend("SHELL=", shell);
1617                 if (!x)
1618                         return -ENOMEM;
1619                 our_env[n_env++] = x;
1620         }
1621
1622         if (!sd_id128_is_null(u->invocation_id)) {
1623                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1624                         return -ENOMEM;
1625
1626                 our_env[n_env++] = x;
1627         }
1628
1629         if (exec_context_needs_term(c)) {
1630                 const char *tty_path, *term = NULL;
1631
1632                 tty_path = exec_context_tty_path(c);
1633
1634                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1635                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1636                  * passes to PID 1 ends up all the way in the console login shown. */
1637
1638                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1639                         term = getenv("TERM");
1640                 if (!term)
1641                         term = default_term_for_tty(tty_path);
1642
1643                 x = strappend("TERM=", term);
1644                 if (!x)
1645                         return -ENOMEM;
1646                 our_env[n_env++] = x;
1647         }
1648
1649         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1650                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1651                         return -ENOMEM;
1652
1653                 our_env[n_env++] = x;
1654         }
1655
1656         our_env[n_env++] = NULL;
1657         assert(n_env <= 12);
1658
1659         *ret = our_env;
1660         our_env = NULL;
1661
1662         return 0;
1663 }
1664
1665 static int build_pass_environment(const ExecContext *c, char ***ret) {
1666         _cleanup_strv_free_ char **pass_env = NULL;
1667         size_t n_env = 0, n_bufsize = 0;
1668         char **i;
1669
1670         STRV_FOREACH(i, c->pass_environment) {
1671                 _cleanup_free_ char *x = NULL;
1672                 char *v;
1673
1674                 v = getenv(*i);
1675                 if (!v)
1676                         continue;
1677                 x = strjoin(*i, "=", v);
1678                 if (!x)
1679                         return -ENOMEM;
1680
1681                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1682                         return -ENOMEM;
1683
1684                 pass_env[n_env++] = x;
1685                 pass_env[n_env] = NULL;
1686                 x = NULL;
1687         }
1688
1689         *ret = pass_env;
1690         pass_env = NULL;
1691
1692         return 0;
1693 }
1694
1695 static bool exec_needs_mount_namespace(
1696                 const ExecContext *context,
1697                 const ExecParameters *params,
1698                 ExecRuntime *runtime) {
1699
1700         assert(context);
1701         assert(params);
1702
1703         if (context->root_image)
1704                 return true;
1705
1706         if (!strv_isempty(context->read_write_paths) ||
1707             !strv_isempty(context->read_only_paths) ||
1708             !strv_isempty(context->inaccessible_paths))
1709                 return true;
1710
1711         if (context->n_bind_mounts > 0)
1712                 return true;
1713
1714         if (context->mount_flags != 0)
1715                 return true;
1716
1717         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1718                 return true;
1719
1720         if (context->private_devices ||
1721             context->protect_system != PROTECT_SYSTEM_NO ||
1722             context->protect_home != PROTECT_HOME_NO ||
1723             context->protect_kernel_tunables ||
1724             context->protect_kernel_modules ||
1725             context->protect_control_groups)
1726                 return true;
1727
1728         if (context->mount_apivfs && (context->root_image || context->root_directory))
1729                 return true;
1730
1731         return false;
1732 }
1733
1734 static int setup_private_users(uid_t uid, gid_t gid) {
1735         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1736         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1737         _cleanup_close_ int unshare_ready_fd = -1;
1738         _cleanup_(sigkill_waitp) pid_t pid = 0;
1739         uint64_t c = 1;
1740         siginfo_t si;
1741         ssize_t n;
1742         int r;
1743
1744         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1745          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1746          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1747          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1748          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1749          * continues execution normally. */
1750
1751         if (uid != 0 && uid_is_valid(uid)) {
1752                 r = asprintf(&uid_map,
1753                              "0 0 1\n"                      /* Map root → root */
1754                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1755                              uid, uid);
1756                 if (r < 0)
1757                         return -ENOMEM;
1758         } else {
1759                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1760                 if (!uid_map)
1761                         return -ENOMEM;
1762         }
1763
1764         if (gid != 0 && gid_is_valid(gid)) {
1765                 r = asprintf(&gid_map,
1766                              "0 0 1\n"                      /* Map root → root */
1767                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1768                              gid, gid);
1769                 if (r < 0)
1770                         return -ENOMEM;
1771         } else {
1772                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1773                 if (!gid_map)
1774                         return -ENOMEM;
1775         }
1776
1777         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1778          * namespace. */
1779         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1780         if (unshare_ready_fd < 0)
1781                 return -errno;
1782
1783         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1784          * failed. */
1785         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1786                 return -errno;
1787
1788         pid = fork();
1789         if (pid < 0)
1790                 return -errno;
1791
1792         if (pid == 0) {
1793                 _cleanup_close_ int fd = -1;
1794                 const char *a;
1795                 pid_t ppid;
1796
1797                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1798                  * here, after the parent opened its own user namespace. */
1799
1800                 ppid = getppid();
1801                 errno_pipe[0] = safe_close(errno_pipe[0]);
1802
1803                 /* Wait until the parent unshared the user namespace */
1804                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1805                         r = -errno;
1806                         goto child_fail;
1807                 }
1808
1809                 /* Disable the setgroups() system call in the child user namespace, for good. */
1810                 a = procfs_file_alloca(ppid, "setgroups");
1811                 fd = open(a, O_WRONLY|O_CLOEXEC);
1812                 if (fd < 0) {
1813                         if (errno != ENOENT) {
1814                                 r = -errno;
1815                                 goto child_fail;
1816                         }
1817
1818                         /* If the file is missing the kernel is too old, let's continue anyway. */
1819                 } else {
1820                         if (write(fd, "deny\n", 5) < 0) {
1821                                 r = -errno;
1822                                 goto child_fail;
1823                         }
1824
1825                         fd = safe_close(fd);
1826                 }
1827
1828                 /* First write the GID map */
1829                 a = procfs_file_alloca(ppid, "gid_map");
1830                 fd = open(a, O_WRONLY|O_CLOEXEC);
1831                 if (fd < 0) {
1832                         r = -errno;
1833                         goto child_fail;
1834                 }
1835                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1836                         r = -errno;
1837                         goto child_fail;
1838                 }
1839                 fd = safe_close(fd);
1840
1841                 /* The write the UID map */
1842                 a = procfs_file_alloca(ppid, "uid_map");
1843                 fd = open(a, O_WRONLY|O_CLOEXEC);
1844                 if (fd < 0) {
1845                         r = -errno;
1846                         goto child_fail;
1847                 }
1848                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1849                         r = -errno;
1850                         goto child_fail;
1851                 }
1852
1853                 _exit(EXIT_SUCCESS);
1854
1855         child_fail:
1856                 (void) write(errno_pipe[1], &r, sizeof(r));
1857                 _exit(EXIT_FAILURE);
1858         }
1859
1860         errno_pipe[1] = safe_close(errno_pipe[1]);
1861
1862         if (unshare(CLONE_NEWUSER) < 0)
1863                 return -errno;
1864
1865         /* Let the child know that the namespace is ready now */
1866         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1867                 return -errno;
1868
1869         /* Try to read an error code from the child */
1870         n = read(errno_pipe[0], &r, sizeof(r));
1871         if (n < 0)
1872                 return -errno;
1873         if (n == sizeof(r)) { /* an error code was sent to us */
1874                 if (r < 0)
1875                         return r;
1876                 return -EIO;
1877         }
1878         if (n != 0) /* on success we should have read 0 bytes */
1879                 return -EIO;
1880
1881         r = wait_for_terminate(pid, &si);
1882         if (r < 0)
1883                 return r;
1884         pid = 0;
1885
1886         /* If something strange happened with the child, let's consider this fatal, too */
1887         if (si.si_code != CLD_EXITED || si.si_status != 0)
1888                 return -EIO;
1889
1890         return 0;
1891 }
1892
1893 static int setup_exec_directory(
1894                 const ExecContext *context,
1895                 const ExecParameters *params,
1896                 uid_t uid,
1897                 gid_t gid,
1898                 ExecDirectoryType type,
1899                 int *exit_status) {
1900
1901         static const int exit_status_table[_EXEC_DIRECTORY_MAX] = {
1902                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1903                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1904                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1905                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1906                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1907         };
1908         char **rt;
1909         int r;
1910
1911         assert(context);
1912         assert(params);
1913         assert(type >= 0 && type < _EXEC_DIRECTORY_MAX);
1914         assert(exit_status);
1915
1916         if (!params->prefix[type])
1917                 return 0;
1918
1919         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1920                 if (!uid_is_valid(uid))
1921                         uid = 0;
1922                 if (!gid_is_valid(gid))
1923                         gid = 0;
1924         }
1925
1926         STRV_FOREACH(rt, context->directories[type].paths) {
1927                 _cleanup_free_ char *p;
1928
1929                 p = strjoin(params->prefix[type], "/", *rt);
1930                 if (!p) {
1931                         r = -ENOMEM;
1932                         goto fail;
1933                 }
1934
1935                 r = mkdir_parents_label(p, 0755);
1936                 if (r < 0)
1937                         goto fail;
1938
1939                 r = mkdir_p_label(p, context->directories[type].mode);
1940                 if (r < 0)
1941                         goto fail;
1942
1943                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1944                  * a service, and shall not be writable. */
1945                 if (type == EXEC_DIRECTORY_CONFIGURATION)
1946                         continue;
1947
1948                 r = chmod_and_chown(p, context->directories[type].mode, uid, gid);
1949                 if (r < 0)
1950                         goto fail;
1951         }
1952
1953         return 0;
1954
1955 fail:
1956         *exit_status = exit_status_table[type];
1957
1958         return r;
1959 }
1960
1961 static int setup_smack(
1962                 const ExecContext *context,
1963                 const ExecCommand *command) {
1964
1965         int r;
1966
1967         assert(context);
1968         assert(command);
1969
1970         if (context->smack_process_label) {
1971                 r = mac_smack_apply_pid(0, context->smack_process_label);
1972                 if (r < 0)
1973                         return r;
1974         }
1975 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1976         else {
1977                 _cleanup_free_ char *exec_label = NULL;
1978
1979                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
1980                 if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
1981                         return r;
1982
1983                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
1984                 if (r < 0)
1985                         return r;
1986         }
1987 #endif
1988
1989         return 0;
1990 }
1991
1992 static int compile_read_write_paths(
1993                 const ExecContext *context,
1994                 const ExecParameters *params,
1995                 char ***ret) {
1996
1997         _cleanup_strv_free_ char **l = NULL;
1998         char **rt;
1999         ExecDirectoryType i;
2000
2001         /* Compile the list of writable paths. This is the combination of
2002          * the explicitly configured paths, plus all runtime directories. */
2003
2004         if (strv_isempty(context->read_write_paths)) {
2005                 for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
2006                         if (!strv_isempty(context->directories[i].paths))
2007                                 break;
2008
2009                 if (i == _EXEC_DIRECTORY_MAX) {
2010                         *ret = NULL; /* NOP if neither is set */
2011                         return 0;
2012                 }
2013         }
2014
2015         l = strv_copy(context->read_write_paths);
2016         if (!l)
2017                 return -ENOMEM;
2018
2019         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++) {
2020                 if (!params->prefix[i])
2021                         continue;
2022
2023                 STRV_FOREACH(rt, context->directories[i].paths) {
2024                         char *s;
2025
2026                         s = strjoin(params->prefix[i], "/", *rt);
2027                         if (!s)
2028                                 return -ENOMEM;
2029
2030                         if (strv_consume(&l, s) < 0)
2031                                 return -ENOMEM;
2032                 }
2033         }
2034
2035         *ret = l;
2036         l = NULL;
2037
2038         return 0;
2039 }
2040
2041 static int apply_mount_namespace(
2042                 Unit *u,
2043                 ExecCommand *command,
2044                 const ExecContext *context,
2045                 const ExecParameters *params,
2046                 ExecRuntime *runtime) {
2047
2048         _cleanup_strv_free_ char **rw = NULL;
2049         char *tmp = NULL, *var = NULL;
2050         const char *root_dir = NULL, *root_image = NULL;
2051         NameSpaceInfo ns_info = {
2052                 .ignore_protect_paths = false,
2053                 .private_dev = context->private_devices,
2054                 .protect_control_groups = context->protect_control_groups,
2055                 .protect_kernel_tunables = context->protect_kernel_tunables,
2056                 .protect_kernel_modules = context->protect_kernel_modules,
2057                 .mount_apivfs = context->mount_apivfs,
2058         };
2059         bool needs_sandboxing;
2060         int r;
2061
2062         assert(context);
2063
2064         /* The runtime struct only contains the parent of the private /tmp,
2065          * which is non-accessible to world users. Inside of it there's a /tmp
2066          * that is sticky, and that's the one we want to use here. */
2067
2068         if (context->private_tmp && runtime) {
2069                 if (runtime->tmp_dir)
2070                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2071                 if (runtime->var_tmp_dir)
2072                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2073         }
2074
2075         r = compile_read_write_paths(context, params, &rw);
2076         if (r < 0)
2077                 return r;
2078
2079         if (params->flags & EXEC_APPLY_CHROOT) {
2080                 root_image = context->root_image;
2081
2082                 if (!root_image)
2083                         root_dir = context->root_directory;
2084         }
2085
2086         /*
2087          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2088          * sandbox info, otherwise enforce it, don't ignore protected paths and
2089          * fail if we are enable to apply the sandbox inside the mount namespace.
2090          */
2091         if (!context->dynamic_user && root_dir)
2092                 ns_info.ignore_protect_paths = true;
2093
2094         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2095
2096         r = setup_namespace(root_dir, root_image,
2097                             &ns_info, rw,
2098                             needs_sandboxing ? context->read_only_paths : NULL,
2099                             needs_sandboxing ? context->inaccessible_paths : NULL,
2100                             context->bind_mounts,
2101                             context->n_bind_mounts,
2102                             tmp,
2103                             var,
2104                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2105                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2106                             context->mount_flags,
2107                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2108
2109         /* If we couldn't set up the namespace this is probably due to a
2110          * missing capability. In this case, silently proceeed. */
2111         if (IN_SET(r, -EPERM, -EACCES)) {
2112                 log_open();
2113                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2114                 log_close();
2115                 r = 0;
2116         }
2117
2118         return r;
2119 }
2120
2121 static int apply_working_directory(
2122                 const ExecContext *context,
2123                 const ExecParameters *params,
2124                 const char *home,
2125                 const bool needs_mount_ns,
2126                 int *exit_status) {
2127
2128         const char *d, *wd;
2129
2130         assert(context);
2131         assert(exit_status);
2132
2133         if (context->working_directory_home) {
2134
2135                 if (!home) {
2136                         *exit_status = EXIT_CHDIR;
2137                         return -ENXIO;
2138                 }
2139
2140                 wd = home;
2141
2142         } else if (context->working_directory)
2143                 wd = context->working_directory;
2144         else
2145                 wd = "/";
2146
2147         if (params->flags & EXEC_APPLY_CHROOT) {
2148                 if (!needs_mount_ns && context->root_directory)
2149                         if (chroot(context->root_directory) < 0) {
2150                                 *exit_status = EXIT_CHROOT;
2151                                 return -errno;
2152                         }
2153
2154                 d = wd;
2155         } else
2156                 d = prefix_roota(context->root_directory, wd);
2157
2158         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2159                 *exit_status = EXIT_CHDIR;
2160                 return -errno;
2161         }
2162
2163         return 0;
2164 }
2165
2166 static int setup_keyring(
2167                 Unit *u,
2168                 const ExecContext *context,
2169                 const ExecParameters *p,
2170                 uid_t uid, gid_t gid) {
2171
2172         key_serial_t keyring;
2173         int r;
2174
2175         assert(u);
2176         assert(context);
2177         assert(p);
2178
2179         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2180          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2181          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2182          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2183          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2184          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2185
2186         if (!(p->flags & EXEC_NEW_KEYRING))
2187                 return 0;
2188
2189         if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2190                 return 0;
2191
2192         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2193         if (keyring == -1) {
2194                 if (errno == ENOSYS)
2195                         log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2196                 else if (IN_SET(errno, EACCES, EPERM))
2197                         log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2198                 else if (errno == EDQUOT)
2199                         log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
2200                 else
2201                         return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2202
2203                 return 0;
2204         }
2205
2206         /* Populate they keyring with the invocation ID by default. */
2207         if (!sd_id128_is_null(u->invocation_id)) {
2208                 key_serial_t key;
2209
2210                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2211                 if (key == -1)
2212                         log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
2213                 else {
2214                         if (keyctl(KEYCTL_SETPERM, key,
2215                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2216                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2217                                 return log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
2218                 }
2219         }
2220
2221         /* And now, make the keyring owned by the service's user */
2222         if (uid_is_valid(uid) || gid_is_valid(gid))
2223                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2224                         return log_error_errno(errno, "Failed to change ownership of session keyring: %m");
2225
2226         /* When requested link the user keyring into the session keyring. */
2227         if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2228                 uid_t saved_uid;
2229                 gid_t saved_gid;
2230
2231                 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2232                  * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2233                  * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2234
2235                 saved_uid = getuid();
2236                 saved_gid = getgid();
2237
2238                 if (gid_is_valid(gid) && gid != saved_gid) {
2239                         if (setregid(gid, -1) < 0)
2240                                 return log_error_errno(errno, "Failed to change GID for user keyring: %m");
2241                 }
2242
2243                 if (uid_is_valid(uid) && uid != saved_uid) {
2244                         if (setreuid(uid, -1) < 0) {
2245                                 (void) setregid(saved_gid, -1);
2246                                 return log_error_errno(errno, "Failed to change UID for user keyring: %m");
2247                         }
2248                 }
2249
2250                 if (keyctl(KEYCTL_LINK,
2251                            KEY_SPEC_USER_KEYRING,
2252                            KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2253
2254                         r = -errno;
2255
2256                         (void) setreuid(saved_uid, -1);
2257                         (void) setregid(saved_gid, -1);
2258
2259                         return log_error_errno(r, "Failed to link user keyring into session keyring: %m");
2260                 }
2261
2262                 if (uid_is_valid(uid) && uid != saved_uid) {
2263                         if (setreuid(saved_uid, -1) < 0) {
2264                                 (void) setregid(saved_gid, -1);
2265                                 return log_error_errno(errno, "Failed to change UID back for user keyring: %m");
2266                         }
2267                 }
2268
2269                 if (gid_is_valid(gid) && gid != saved_gid) {
2270                         if (setregid(saved_gid, -1) < 0)
2271                                 return log_error_errno(errno, "Failed to change GID back for user keyring: %m");
2272                 }
2273         }
2274
2275         return 0;
2276 }
2277
/* Appends each valid (i.e. non-negative) fd of the specified socket pair to array[], in order, and increases
 * *n by the number of entries added. A NULL pair is ignored. The caller has to make sure the array is large
 * enough for up to two additional entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2290
2291 static int close_remaining_fds(
2292                 const ExecParameters *params,
2293                 ExecRuntime *runtime,
2294                 DynamicCreds *dcreds,
2295                 int user_lookup_fd,
2296                 int socket_fd,
2297                 int *fds, unsigned n_fds) {
2298
2299         unsigned n_dont_close = 0;
2300         int dont_close[n_fds + 12];
2301
2302         assert(params);
2303
2304         if (params->stdin_fd >= 0)
2305                 dont_close[n_dont_close++] = params->stdin_fd;
2306         if (params->stdout_fd >= 0)
2307                 dont_close[n_dont_close++] = params->stdout_fd;
2308         if (params->stderr_fd >= 0)
2309                 dont_close[n_dont_close++] = params->stderr_fd;
2310
2311         if (socket_fd >= 0)
2312                 dont_close[n_dont_close++] = socket_fd;
2313         if (n_fds > 0) {
2314                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2315                 n_dont_close += n_fds;
2316         }
2317
2318         if (runtime)
2319                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2320
2321         if (dcreds) {
2322                 if (dcreds->user)
2323                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2324                 if (dcreds->group)
2325                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2326         }
2327
2328         if (user_lookup_fd >= 0)
2329                 dont_close[n_dont_close++] = user_lookup_fd;
2330
2331         return close_all_fds(dont_close, n_dont_close);
2332 }
2333
2334 static int send_user_lookup(
2335                 Unit *unit,
2336                 int user_lookup_fd,
2337                 uid_t uid,
2338                 gid_t gid) {
2339
2340         assert(unit);
2341
2342         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2343          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2344          * specified. */
2345
2346         if (user_lookup_fd < 0)
2347                 return 0;
2348
2349         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2350                 return 0;
2351
2352         if (writev(user_lookup_fd,
2353                (struct iovec[]) {
2354                            { .iov_base = &uid, .iov_len = sizeof(uid) },
2355                            { .iov_base = &gid, .iov_len = sizeof(gid) },
2356                            { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0)
2357                 return -errno;
2358
2359         return 0;
2360 }
2361
2362 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2363         int r;
2364
2365         assert(c);
2366         assert(home);
2367         assert(buf);
2368
2369         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2370
2371         if (*home)
2372                 return 0;
2373
2374         if (!c->working_directory_home)
2375                 return 0;
2376
2377         if (uid == 0) {
2378                 /* Hardcode /root as home directory for UID 0 */
2379                 *home = "/root";
2380                 return 1;
2381         }
2382
2383         r = get_home_dir(buf);
2384         if (r < 0)
2385                 return r;
2386
2387         *home = *buf;
2388         return 1;
2389 }
2390
2391 static int exec_child(
2392                 Unit *unit,
2393                 ExecCommand *command,
2394                 const ExecContext *context,
2395                 const ExecParameters *params,
2396                 ExecRuntime *runtime,
2397                 DynamicCreds *dcreds,
2398                 char **argv,
2399                 int socket_fd,
2400                 int named_iofds[3],
2401                 int *fds,
2402                 unsigned n_storage_fds,
2403                 unsigned n_socket_fds,
2404                 char **files_env,
2405                 int user_lookup_fd,
2406                 int *exit_status,
2407                 char **error_message) {
2408
2409         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2410         _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2411         _cleanup_free_ gid_t *supplementary_gids = NULL;
2412         const char *username = NULL, *groupname = NULL;
2413         const char *home = NULL, *shell = NULL;
2414         dev_t journal_stream_dev = 0;
2415         ino_t journal_stream_ino = 0;
2416         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2417                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2418                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2419                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2420 #ifdef HAVE_SELINUX
2421         bool use_selinux = false;
2422 #endif
2423 #ifdef HAVE_SMACK
2424         bool use_smack = false;
2425 #endif
2426 #ifdef HAVE_APPARMOR
2427         bool use_apparmor = false;
2428 #endif
2429         uid_t uid = UID_INVALID;
2430         gid_t gid = GID_INVALID;
2431         int i, r, ngids = 0;
2432         unsigned n_fds;
2433         ExecDirectoryType dt;
2434         int secure_bits;
2435
2436         assert(unit);
2437         assert(command);
2438         assert(context);
2439         assert(params);
2440         assert(exit_status);
2441         assert(error_message);
2442         /* We don't always set error_message, hence it must be initialized */
2443         assert(*error_message == NULL);
2444
2445         rename_process_from_path(command->path);
2446
2447         /* We reset exactly these signals, since they are the
2448          * only ones we set to SIG_IGN in the main daemon. All
2449          * others we leave untouched because we set them to
2450          * SIG_DFL or a valid handler initially, both of which
2451          * will be demoted to SIG_DFL. */
2452         (void) default_signals(SIGNALS_CRASH_HANDLER,
2453                                SIGNALS_IGNORE, -1);
2454
2455         if (context->ignore_sigpipe)
2456                 (void) ignore_signals(SIGPIPE, -1);
2457
2458         r = reset_signal_mask();
2459         if (r < 0) {
2460                 *exit_status = EXIT_SIGNAL_MASK;
2461                 *error_message = strdup("Failed to set process signal mask");
2462                 /* If strdup fails, here and below, we will just print the generic error message. */
2463                 return r;
2464         }
2465
2466         if (params->idle_pipe)
2467                 do_idle_pipe_dance(params->idle_pipe);
2468
2469         /* Close sockets very early to make sure we don't
2470          * block init reexecution because it cannot bind its
2471          * sockets */
2472
2473         log_forget_fds();
2474
2475         n_fds = n_storage_fds + n_socket_fds;
2476         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2477         if (r < 0) {
2478                 *exit_status = EXIT_FDS;
2479                 *error_message = strdup("Failed to close unwanted file descriptors");
2480                 return r;
2481         }
2482
2483         if (!context->same_pgrp)
2484                 if (setsid() < 0) {
2485                         *exit_status = EXIT_SETSID;
2486                         *error_message = strdup("Failed to create new process session");
2487                         return -errno;
2488                 }
2489
2490         exec_context_tty_reset(context, params);
2491
2492         if (unit_shall_confirm_spawn(unit)) {
2493                 const char *vc = params->confirm_spawn;
2494                 _cleanup_free_ char *cmdline = NULL;
2495
2496                 cmdline = exec_command_line(argv);
2497                 if (!cmdline) {
2498                         *exit_status = EXIT_MEMORY;
2499                         return -ENOMEM;
2500                 }
2501
2502                 r = ask_for_confirmation(vc, unit, cmdline);
2503                 if (r != CONFIRM_EXECUTE) {
2504                         if (r == CONFIRM_PRETEND_SUCCESS) {
2505                                 *exit_status = EXIT_SUCCESS;
2506                                 return 0;
2507                         }
2508                         *exit_status = EXIT_CONFIRM;
2509                         *error_message = strdup("Execution cancelled by the user");
2510                         return -ECANCELED;
2511                 }
2512         }
2513
2514         if (context->dynamic_user && dcreds) {
2515
2516                 /* Make sure we bypass our own NSS module for any NSS checks */
2517                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2518                         *exit_status = EXIT_USER;
2519                         *error_message = strdup("Failed to update environment");
2520                         return -errno;
2521                 }
2522
2523                 r = dynamic_creds_realize(dcreds, &uid, &gid);
2524                 if (r < 0) {
2525                         *exit_status = EXIT_USER;
2526                         *error_message = strdup("Failed to update dynamic user credentials");
2527                         return r;
2528                 }
2529
2530                 if (!uid_is_valid(uid)) {
2531                         *exit_status = EXIT_USER;
2532                         (void) asprintf(error_message, "UID validation failed for \""UID_FMT"\"", uid);
2533                         /* If asprintf fails, here and below, we will just print the generic error message. */
2534                         return -ESRCH;
2535                 }
2536
2537                 if (!gid_is_valid(gid)) {
2538                         *exit_status = EXIT_USER;
2539                         (void) asprintf(error_message, "GID validation failed for \""GID_FMT"\"", gid);
2540                         return -ESRCH;
2541                 }
2542
2543                 if (dcreds->user)
2544                         username = dcreds->user->name;
2545
2546         } else {
2547                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2548                 if (r < 0) {
2549                         *exit_status = EXIT_USER;
2550                         *error_message = strdup("Failed to determine user credentials");
2551                         return r;
2552                 }
2553
2554                 r = get_fixed_group(context, &groupname, &gid);
2555                 if (r < 0) {
2556                         *exit_status = EXIT_GROUP;
2557                         *error_message = strdup("Failed to determine group credentials");
2558                         return r;
2559                 }
2560         }
2561
2562         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2563         r = get_supplementary_groups(context, username, groupname, gid,
2564                                      &supplementary_gids, &ngids);
2565         if (r < 0) {
2566                 *exit_status = EXIT_GROUP;
2567                 *error_message = strdup("Failed to determine supplementary groups");
2568                 return r;
2569         }
2570
2571         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2572         if (r < 0) {
2573                 *exit_status = EXIT_USER;
2574                 *error_message = strdup("Failed to send user credentials to PID1");
2575                 return r;
2576         }
2577
2578         user_lookup_fd = safe_close(user_lookup_fd);
2579
2580         r = acquire_home(context, uid, &home, &home_buffer);
2581         if (r < 0) {
2582                 *exit_status = EXIT_CHDIR;
2583                 *error_message = strdup("Failed to determine $HOME for user");
2584                 return r;
2585         }
2586
2587         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2588          * must make sure to drop O_NONBLOCK */
2589         if (socket_fd >= 0)
2590                 (void) fd_nonblock(socket_fd, false);
2591
2592         r = setup_input(context, params, socket_fd, named_iofds);
2593         if (r < 0) {
2594                 *exit_status = EXIT_STDIN;
2595                 *error_message = strdup("Failed to set up standard input");
2596                 return r;
2597         }
2598
2599         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2600         if (r < 0) {
2601                 *exit_status = EXIT_STDOUT;
2602                 *error_message = strdup("Failed to set up standard output");
2603                 return r;
2604         }
2605
2606         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2607         if (r < 0) {
2608                 *exit_status = EXIT_STDERR;
2609                 *error_message = strdup("Failed to set up standard error output");
2610                 return r;
2611         }
2612
2613         if (params->cgroup_path) {
2614                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2615                 if (r < 0) {
2616                         *exit_status = EXIT_CGROUP;
2617                         (void) asprintf(error_message, "Failed to attach to cgroup %s", params->cgroup_path);
2618                         return r;
2619                 }
2620         }
2621
2622         if (context->oom_score_adjust_set) {
2623                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2624
2625                 /* When we can't make this change due to EPERM, then
2626                  * let's silently skip over it. User namespaces
2627                  * prohibit write access to this file, and we
2628                  * shouldn't trip up over that. */
2629
2630                 sprintf(t, "%i", context->oom_score_adjust);
2631                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2632                 if (r == -EPERM || r == -EACCES) {
2633                         log_open();
2634                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2635                         log_close();
2636                 } else if (r < 0) {
2637                         *exit_status = EXIT_OOM_ADJUST;
2638                         *error_message = strdup("Failed to adjust OOM setting");
2639                         return -errno;
2640                 }
2641         }
2642
2643         if (context->nice_set)
2644                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2645                         *exit_status = EXIT_NICE;
2646                         *error_message = strdup("Failed to set up process scheduling priority (nice level)");
2647                         return -errno;
2648                 }
2649
2650         if (context->cpu_sched_set) {
2651                 struct sched_param param = {
2652                         .sched_priority = context->cpu_sched_priority,
2653                 };
2654
2655                 r = sched_setscheduler(0,
2656                                        context->cpu_sched_policy |
2657                                        (context->cpu_sched_reset_on_fork ?
2658                                         SCHED_RESET_ON_FORK : 0),
2659                                        &param);
2660                 if (r < 0) {
2661                         *exit_status = EXIT_SETSCHEDULER;
2662                         *error_message = strdup("Failed to set up CPU scheduling");
2663                         return -errno;
2664                 }
2665         }
2666
2667         if (context->cpuset)
2668                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2669                         *exit_status = EXIT_CPUAFFINITY;
2670                         *error_message = strdup("Failed to set up CPU affinity");
2671                         return -errno;
2672                 }
2673
2674         if (context->ioprio_set)
2675                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2676                         *exit_status = EXIT_IOPRIO;
2677                         *error_message = strdup("Failed to set up IO scheduling priority");
2678                         return -errno;
2679                 }
2680
2681         if (context->timer_slack_nsec != NSEC_INFINITY)
2682                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2683                         *exit_status = EXIT_TIMERSLACK;
2684                         *error_message = strdup("Failed to set up timer slack");
2685                         return -errno;
2686                 }
2687
2688         if (context->personality != PERSONALITY_INVALID) {
2689                 r = safe_personality(context->personality);
2690                 if (r < 0) {
2691                         *exit_status = EXIT_PERSONALITY;
2692                         *error_message = strdup("Failed to set up execution domain (personality)");
2693                         return r;
2694                 }
2695         }
2696
2697         if (context->utmp_id)
2698                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2699                                       context->tty_path,
2700                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
2701                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2702                                       USER_PROCESS,
2703                                       username);
2704
2705         if (context->user) {
2706                 r = chown_terminal(STDIN_FILENO, uid);
2707                 if (r < 0) {
2708                         *exit_status = EXIT_STDIN;
2709                         *error_message = strdup("Failed to change ownership of terminal");
2710                         return r;
2711                 }
2712         }
2713
2714         /* If delegation is enabled we'll pass ownership of the cgroup
2715          * (but only in systemd's own controller hierarchy!) to the
2716          * user of the new process. */
2717         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2718                 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2719                 if (r < 0) {
2720                         *exit_status = EXIT_CGROUP;
2721                         *error_message = strdup("Failed to adjust control group access");
2722                         return r;
2723                 }
2724
2725
2726                 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2727                 if (r < 0) {
2728                         *exit_status = EXIT_CGROUP;
2729                         *error_message = strdup("Failed to adjust control group access");
2730                         return r;
2731                 }
2732         }
2733
2734         for (dt = 0; dt < _EXEC_DIRECTORY_MAX; dt++) {
2735                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2736                 if (r < 0) {
2737                         *error_message = strdup("Failed to set up special execution directory");
2738                         return r;
2739                 }
2740         }
2741
2742         r = build_environment(
2743                         unit,
2744                         context,
2745                         params,
2746                         n_fds,
2747                         home,
2748                         username,
2749                         shell,
2750                         journal_stream_dev,
2751                         journal_stream_ino,
2752                         &our_env);
2753         if (r < 0) {
2754                 *exit_status = EXIT_MEMORY;
2755                 return r;
2756         }
2757
2758         r = build_pass_environment(context, &pass_env);
2759         if (r < 0) {
2760                 *exit_status = EXIT_MEMORY;
2761                 return r;
2762         }
2763
2764         accum_env = strv_env_merge(5,
2765                                    params->environment,
2766                                    our_env,
2767                                    pass_env,
2768                                    context->environment,
2769                                    files_env,
2770                                    NULL);
2771         if (!accum_env) {
2772                 *exit_status = EXIT_MEMORY;
2773                 return -ENOMEM;
2774         }
2775         accum_env = strv_env_clean(accum_env);
2776
2777         (void) umask(context->umask);
2778
2779         r = setup_keyring(unit, context, params, uid, gid);
2780         if (r < 0) {
2781                 *exit_status = EXIT_KEYRING;
2782                 *error_message = strdup("Failed to set up kernel keyring");
2783                 return r;
2784         }
2785
2786         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2787         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2788
2789         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2790         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2791
2792         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2793         if (needs_ambient_hack)
2794                 needs_setuid = false;
2795         else
2796                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
2797
2798         if (needs_sandboxing) {
2799                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2800                  * present. The actual MAC context application will happen later, as late as possible, to avoid
2801                  * impacting our own code paths. */
2802
2803 #ifdef HAVE_SELINUX
2804                 use_selinux = mac_selinux_use();
2805 #endif
2806 #ifdef HAVE_SMACK
2807                 use_smack = mac_smack_use();
2808 #endif
2809 #ifdef HAVE_APPARMOR
2810                 use_apparmor = mac_apparmor_use();
2811 #endif
2812         }
2813
2814         if (needs_setuid) {
2815                 if (context->pam_name && username) {
2816                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
2817                         if (r < 0) {
2818                                 *exit_status = EXIT_PAM;
2819                                 *error_message = strdup("Failed to set up PAM session");
2820                                 return r;
2821                         }
2822                 }
2823         }
2824
2825         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
2826                 r = setup_netns(runtime->netns_storage_socket);
2827                 if (r < 0) {
2828                         *exit_status = EXIT_NETWORK;
2829                         *error_message = strdup("Failed to set up network namespacing");
2830                         return r;
2831                 }
2832         }
2833
2834         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
2835         if (needs_mount_namespace) {
2836                 r = apply_mount_namespace(unit, command, context, params, runtime);
2837                 if (r < 0) {
2838                         *exit_status = EXIT_NAMESPACE;
2839                         *error_message = strdup("Failed to set up mount namespacing");
2840                         return r;
2841                 }
2842         }
2843
2844         /* Apply just after mount namespace setup */
2845         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
2846         if (r < 0) {
2847                 *error_message = strdup("Changing to the requested working directory failed");
2848                 return r;
2849         }
2850
2851         /* Drop groups as early as possible */
2852         if (needs_setuid) {
2853                 r = enforce_groups(context, gid, supplementary_gids, ngids);
2854                 if (r < 0) {
2855                         *error_message = strdup("Changing group credentials failed");
2856                         *exit_status = EXIT_GROUP;
2857                         return r;
2858                 }
2859         }
2860
2861         if (needs_sandboxing) {
2862 #ifdef HAVE_SELINUX
2863                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
2864                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
2865                         if (r < 0) {
2866                                 *error_message = strdup("Failed to determine SELinux context");
2867                                 *exit_status = EXIT_SELINUX_CONTEXT;
2868                                 return r;
2869                         }
2870                 }
2871 #endif
2872
2873                 if (context->private_users) {
2874                         r = setup_private_users(uid, gid);
2875                         if (r < 0) {
2876                                 *error_message = strdup("Failed to set up user namespacing");
2877                                 *exit_status = EXIT_USER;
2878                                 return r;
2879                         }
2880                 }
2881         }
2882
2883         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2884          * more aggressive this time, since we no longer need socket_fd and the netns fds. The custom endpoint fd
2885          * was needed to upload the policy and can now be closed as well. */
2886         r = close_all_fds(fds, n_fds);
2887         if (r >= 0)
2888                 r = shift_fds(fds, n_fds);
2889         if (r >= 0)
2890                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
2891         if (r < 0) {
2892                 *error_message = strdup("Failed to adjust passed file descriptors");
2893                 *exit_status = EXIT_FDS;
2894                 return r;
2895         }
2896
2897         secure_bits = context->secure_bits;
2898
2899         if (needs_sandboxing) {
2900                 uint64_t bset;
2901
2902                 for (i = 0; i < _RLIMIT_MAX; i++) {
2903
2904                         if (!context->rlimit[i])
2905                                 continue;
2906
2907                         r = setrlimit_closest(i, context->rlimit[i]);
2908                         if (r < 0) {
2909                                 *error_message = strdup("Failed to adjust resource limits");
2910                                 *exit_status = EXIT_LIMITS;
2911                                 return r;
2912                         }
2913                 }
2914
2915                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2916                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
2917                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
2918                                 *error_message = strdup("Failed to adjust RLIMIT_RTPRIO resource limit");
2919                                 *exit_status = EXIT_LIMITS;
2920                                 return -errno;
2921                         }
2922                 }
2923
2924                 bset = context->capability_bounding_set;
2925                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2926                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2927                  * instead of us doing that */
2928                 if (needs_ambient_hack)
2929                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
2930                                 (UINT64_C(1) << CAP_SETUID) |
2931                                 (UINT64_C(1) << CAP_SETGID);
2932
2933                 if (!cap_test_all(bset)) {
2934                         r = capability_bounding_set_drop(bset, false);
2935                         if (r < 0) {
2936                                 *exit_status = EXIT_CAPABILITIES;
2937                                 *error_message = strdup("Failed to drop capabilities");
2938                                 return r;
2939                         }
2940                 }
2941
2942                 /* This is done before enforce_user, but ambient set
2943                  * does not survive over setresuid() if keep_caps is not set. */
2944                 if (!needs_ambient_hack &&
2945                     context->capability_ambient_set != 0) {
2946                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
2947                         if (r < 0) {
2948                                 *exit_status = EXIT_CAPABILITIES;
2949                                 *error_message = strdup("Failed to apply ambient capabilities (before UID change)");
2950                                 return r;
2951                         }
2952                 }
2953         }
2954
2955         if (needs_setuid) {
2956                 if (context->user) {
2957                         r = enforce_user(context, uid);
2958                         if (r < 0) {
2959                                 *exit_status = EXIT_USER;
2960                                 (void) asprintf(error_message, "Failed to change UID to "UID_FMT, uid);
2961                                 return r;
2962                         }
2963
2964                         if (!needs_ambient_hack &&
2965                             context->capability_ambient_set != 0) {
2966
2967                                 /* Fix the ambient capabilities after user change. */
2968                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
2969                                 if (r < 0) {
2970                                         *exit_status = EXIT_CAPABILITIES;
2971                                         *error_message = strdup("Failed to apply ambient capabilities (after UID change)");
2972                                         return r;
2973                                 }
2974
2975                                 /* If we were asked to change user and ambient capabilities
2976                                  * were requested, we had to add keep-caps to the securebits
2977                                  * so that we would maintain the inherited capability set
2978                                  * through the setresuid(). Make sure that the bit is added
2979                                  * also to the context secure_bits so that we don't try to
2980                                  * drop the bit away next. */
2981
2982                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
2983                         }
2984                 }
2985         }
2986
2987         if (needs_sandboxing) {
2988                 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2989                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2990                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2991                  * are restricted. */
2992
2993 #ifdef HAVE_SELINUX
2994                 if (use_selinux) {
2995                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
2996
2997                         if (exec_context) {
2998                                 r = setexeccon(exec_context);
2999                                 if (r < 0) {
3000                                         *exit_status = EXIT_SELINUX_CONTEXT;
3001                                         (void) asprintf(error_message, "Failed to change SELinux context to %s", exec_context);
3002                                         return r;
3003                                 }
3004                         }
3005                 }
3006 #endif
3007
3008 #ifdef HAVE_SMACK
3009                 if (use_smack) {
3010                         r = setup_smack(context, command);
3011                         if (r < 0) {
3012                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3013                                 *error_message = strdup("Failed to set SMACK process label");
3014                                 return r;
3015                         }
3016                 }
3017 #endif
3018
3019 #ifdef HAVE_APPARMOR
3020                 if (use_apparmor && context->apparmor_profile) {
3021                         r = aa_change_onexec(context->apparmor_profile);
3022                         if (r < 0 && !context->apparmor_profile_ignore) {
3023                                 *exit_status = EXIT_APPARMOR_PROFILE;
3024                                 (void) asprintf(error_message,
3025                                                 "Failed to prepare AppArmor profile change to %s",
3026                                                 context->apparmor_profile);
3027                                 return -errno;
3028                         }
3029                 }
3030 #endif
3031
3032                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3033                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3034                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3035                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3036                                 *exit_status = EXIT_SECUREBITS;
3037                                 *error_message = strdup("Failed to set process secure bits");
3038                                 return -errno;
3039                         }
3040
3041                 if (context_has_no_new_privileges(context))
3042                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3043                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3044                                 *error_message = strdup("Failed to disable new privileges");
3045                                 return -errno;
3046                         }
3047
3048 #ifdef HAVE_SECCOMP
3049                 r = apply_address_families(unit, context);
3050                 if (r < 0) {
3051                         *exit_status = EXIT_ADDRESS_FAMILIES;
3052                         *error_message = strdup("Failed to restrict address families");
3053                         return r;
3054                 }
3055
3056                 r = apply_memory_deny_write_execute(unit, context);
3057                 if (r < 0) {
3058                         *exit_status = EXIT_SECCOMP;
3059                         *error_message = strdup("Failed to disable writing to executable memory");
3060                         return r;
3061                 }
3062
3063                 r = apply_restrict_realtime(unit, context);
3064                 if (r < 0) {
3065                         *exit_status = EXIT_SECCOMP;
3066                         *error_message = strdup("Failed to apply realtime restrictions");
3067                         return r;
3068                 }
3069
3070                 r = apply_restrict_namespaces(unit, context);
3071                 if (r < 0) {
3072                         *exit_status = EXIT_SECCOMP;
3073                         *error_message = strdup("Failed to apply namespace restrictions");
3074                         return r;
3075                 }
3076
3077                 r = apply_protect_sysctl(unit, context);
3078                 if (r < 0) {
3079                         *exit_status = EXIT_SECCOMP;
3080                         *error_message = strdup("Failed to apply sysctl restrictions");
3081                         return r;
3082                 }
3083
3084                 r = apply_protect_kernel_modules(unit, context);
3085                 if (r < 0) {
3086                         *exit_status = EXIT_SECCOMP;
3087                         *error_message = strdup("Failed to apply module loading restrictions");
3088                         return r;
3089                 }
3090
3091                 r = apply_private_devices(unit, context);
3092                 if (r < 0) {
3093                         *exit_status = EXIT_SECCOMP;
3094                         *error_message = strdup("Failed to set up private devices");
3095                         return r;
3096                 }
3097
3098                 r = apply_syscall_archs(unit, context);
3099                 if (r < 0) {
3100                         *exit_status = EXIT_SECCOMP;
3101                         *error_message = strdup("Failed to apply syscall architecture restrictions");
3102                         return r;
3103                 }
3104
3105                 r = apply_lock_personality(unit, context);
3106                 if (r < 0) {
3107                         *exit_status = EXIT_SECCOMP;
3108                         *error_message = strdup("Failed to lock personalities");
3109                         return r;
3110                 }
3111
3112                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3113                  * by the filter as little as possible. */
3114                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3115                 if (r < 0) {
3116                         *exit_status = EXIT_SECCOMP;
3117                         *error_message = strdup("Failed to apply system call filters");
3118                         return r;
3119                 }
3120 #endif
3121         }
3122
3123         if (!strv_isempty(context->unset_environment)) {
3124                 char **ee = NULL;
3125
3126                 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3127                 if (!ee) {
3128                         *exit_status = EXIT_MEMORY;
3129                         return -ENOMEM;
3130                 }
3131
3132                 strv_free(accum_env);
3133                 accum_env = ee;
3134         }
3135
3136         final_argv = replace_env_argv(argv, accum_env);
3137         if (!final_argv) {
3138                 *exit_status = EXIT_MEMORY;
3139                 *error_message = strdup("Failed to prepare process arguments");
3140                 return -ENOMEM;
3141         }
3142
3143         if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3144                 _cleanup_free_ char *line;
3145
3146                 line = exec_command_line(final_argv);
3147                 if (line) {
3148                         log_open();
3149                         log_struct(LOG_DEBUG,
3150                                    "EXECUTABLE=%s", command->path,
3151                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3152                                    LOG_UNIT_ID(unit),
3153                                    NULL);
3154                         log_close();
3155                 }
3156         }
3157
3158         execve(command->path, final_argv, accum_env);
3159         *exit_status = EXIT_EXEC;
3160         return -errno;
3161 }
3162
3163 int exec_spawn(Unit *unit,
3164                ExecCommand *command,
3165                const ExecContext *context,
3166                const ExecParameters *params,
3167                ExecRuntime *runtime,
3168                DynamicCreds *dcreds,
3169                pid_t *ret) {
3170
3171         _cleanup_strv_free_ char **files_env = NULL;
3172         int *fds = NULL;
3173         unsigned n_storage_fds = 0, n_socket_fds = 0;
3174         _cleanup_free_ char *line = NULL;
3175         int socket_fd, r;
3176         int named_iofds[3] = { -1, -1, -1 };
3177         char **argv;
3178         pid_t pid;
3179
3180         assert(unit);
3181         assert(command);
3182         assert(context);
3183         assert(ret);
3184         assert(params);
3185         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3186
3187         if (context->std_input == EXEC_INPUT_SOCKET ||
3188             context->std_output == EXEC_OUTPUT_SOCKET ||
3189             context->std_error == EXEC_OUTPUT_SOCKET) {
3190
3191                 if (params->n_socket_fds > 1) {
3192                         log_unit_error(unit, "Got more than one socket.");
3193                         return -EINVAL;
3194                 }
3195
3196                 if (params->n_socket_fds == 0) {
3197                         log_unit_error(unit, "Got no socket.");
3198                         return -EINVAL;
3199                 }
3200
3201                 socket_fd = params->fds[0];
3202         } else {
3203                 socket_fd = -1;
3204                 fds = params->fds;
3205                 n_storage_fds = params->n_storage_fds;
3206                 n_socket_fds = params->n_socket_fds;
3207         }
3208
3209         r = exec_context_named_iofds(unit, context, params, named_iofds);
3210         if (r < 0)
3211                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3212
3213         r = exec_context_load_environment(unit, context, &files_env);
3214         if (r < 0)
3215                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3216
3217         argv = params->argv ?: command->argv;
3218         line = exec_command_line(argv);
3219         if (!line)
3220                 return log_oom();
3221
3222         log_struct(LOG_DEBUG,
3223                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3224                    "EXECUTABLE=%s", command->path,
3225                    LOG_UNIT_ID(unit),
3226                    NULL);
3227         pid = fork();
3228         if (pid < 0)
3229                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3230
3231         if (pid == 0) {
3232                 int exit_status;
3233                 _cleanup_free_ char *error_message = NULL;
3234
3235                 r = exec_child(unit,
3236                                command,
3237                                context,
3238                                params,
3239                                runtime,
3240                                dcreds,
3241                                argv,
3242                                socket_fd,
3243                                named_iofds,
3244                                fds,
3245                                n_storage_fds,
3246                                n_socket_fds,
3247                                files_env,
3248                                unit->manager->user_lookup_fds[1],
3249                                &exit_status,
3250                                &error_message);
3251                 if (r < 0) {
3252                         log_open();
3253                         if (error_message)
3254                                 log_struct_errno(LOG_ERR, r,
3255                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3256                                                  LOG_UNIT_ID(unit),
3257                                                  LOG_UNIT_MESSAGE(unit, "%s: %m",
3258                                                                   error_message),
3259                                                  "EXECUTABLE=%s", command->path,
3260                                                  NULL);
3261                         else if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE))
3262                                 log_struct_errno(LOG_INFO, r,
3263                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3264                                                  LOG_UNIT_ID(unit),
3265                                                  LOG_UNIT_MESSAGE(unit, "Skipped spawning %s: %m",
3266                                                                   command->path),
3267                                                  "EXECUTABLE=%s", command->path,
3268                                                  NULL);
3269                         else
3270                                 log_struct_errno(LOG_ERR, r,
3271                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3272                                                  LOG_UNIT_ID(unit),
3273                                                  LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3274                                                                   exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3275                                                                   command->path),
3276                                                  "EXECUTABLE=%s", command->path,
3277                                                  NULL);
3278                 }
3279
3280                 _exit(exit_status);
3281         }
3282
3283         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3284
3285         /* We add the new process to the cgroup both in the child (so
3286          * that we can be sure that no user code is ever executed
3287          * outside of the cgroup) and in the parent (so that we can be
3288          * sure that when we kill the cgroup the process will be
3289          * killed too). */
3290         if (params->cgroup_path)
3291                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3292
3293         exec_status_start(&command->exec_status, pid);
3294
3295         *ret = pid;
3296         return 0;
3297 }
3298
3299 void exec_context_init(ExecContext *c) {
3300         ExecDirectoryType i;
3301
3302         assert(c);
3303
3304         c->umask = 0022;
3305         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3306         c->cpu_sched_policy = SCHED_OTHER;
3307         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3308         c->syslog_level_prefix = true;
3309         c->ignore_sigpipe = true;
3310         c->timer_slack_nsec = NSEC_INFINITY;
3311         c->personality = PERSONALITY_INVALID;
3312         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
3313                 c->directories[i].mode = 0755;
3314         c->capability_bounding_set = CAP_ALL;
3315         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3316 }
3317
3318 void exec_context_done(ExecContext *c) {
3319         unsigned l;
3320         ExecDirectoryType i;
3321
3322         assert(c);
3323
3324         c->environment = strv_free(c->environment);
3325         c->environment_files = strv_free(c->environment_files);
3326         c->pass_environment = strv_free(c->pass_environment);
3327         c->unset_environment = strv_free(c->unset_environment);
3328
3329         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3330                 c->rlimit[l] = mfree(c->rlimit[l]);
3331
3332         for (l = 0; l < 3; l++)
3333                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3334
3335         c->working_directory = mfree(c->working_directory);
3336         c->root_directory = mfree(c->root_directory);
3337         c->root_image = mfree(c->root_image);
3338         c->tty_path = mfree(c->tty_path);
3339         c->syslog_identifier = mfree(c->syslog_identifier);
3340         c->user = mfree(c->user);
3341         c->group = mfree(c->group);
3342
3343         c->supplementary_groups = strv_free(c->supplementary_groups);
3344
3345         c->pam_name = mfree(c->pam_name);
3346
3347         c->read_only_paths = strv_free(c->read_only_paths);
3348         c->read_write_paths = strv_free(c->read_write_paths);
3349         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3350
3351         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3352
3353         if (c->cpuset)
3354                 CPU_FREE(c->cpuset);
3355
3356         c->utmp_id = mfree(c->utmp_id);
3357         c->selinux_context = mfree(c->selinux_context);
3358         c->apparmor_profile = mfree(c->apparmor_profile);
3359         c->smack_process_label = mfree(c->smack_process_label);
3360
3361         c->syscall_filter = set_free(c->syscall_filter);
3362         c->syscall_archs = set_free(c->syscall_archs);
3363         c->address_families = set_free(c->address_families);
3364
3365         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
3366                 c->directories[i].paths = strv_free(c->directories[i].paths);
3367 }
3368
3369 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3370         char **i;
3371
3372         assert(c);
3373
3374         if (!runtime_prefix)
3375                 return 0;
3376
3377         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3378                 _cleanup_free_ char *p;
3379
3380                 p = strjoin(runtime_prefix, "/", *i);
3381                 if (!p)
3382                         return -ENOMEM;
3383
3384                 /* We execute this synchronously, since we need to be
3385                  * sure this is gone when we start the service
3386                  * next. */
3387                 (void) rm_rf(p, REMOVE_ROOT);
3388         }
3389
3390         return 0;
3391 }
3392
3393 void exec_command_done(ExecCommand *c) {
3394         assert(c);
3395
3396         c->path = mfree(c->path);
3397
3398         c->argv = strv_free(c->argv);
3399 }
3400
3401 void exec_command_done_array(ExecCommand *c, unsigned n) {
3402         unsigned i;
3403
3404         for (i = 0; i < n; i++)
3405                 exec_command_done(c+i);
3406 }
3407
3408 ExecCommand* exec_command_free_list(ExecCommand *c) {
3409         ExecCommand *i;
3410
3411         while ((i = c)) {
3412                 LIST_REMOVE(command, c, i);
3413                 exec_command_done(i);
3414                 free(i);
3415         }
3416
3417         return NULL;
3418 }
3419
3420 void exec_command_free_array(ExecCommand **c, unsigned n) {
3421         unsigned i;
3422
3423         for (i = 0; i < n; i++)
3424                 c[i] = exec_command_free_list(c[i]);
3425 }
3426
3427 typedef struct InvalidEnvInfo {
3428         Unit *unit;
3429         const char *path;
3430 } InvalidEnvInfo;
3431
3432 static void invalid_env(const char *p, void *userdata) {
3433         InvalidEnvInfo *info = userdata;
3434
3435         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3436 }
3437
3438 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3439         assert(c);
3440
3441         switch (fd_index) {
3442         case STDIN_FILENO:
3443                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3444                         return NULL;
3445                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3446         case STDOUT_FILENO:
3447                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3448                         return NULL;
3449                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3450         case STDERR_FILENO:
3451                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3452                         return NULL;
3453                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3454         default:
3455                 return NULL;
3456         }
3457 }
3458
3459 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3460         unsigned i, targets;
3461         const char* stdio_fdname[3];
3462         unsigned n_fds;
3463
3464         assert(c);
3465         assert(p);
3466
3467         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3468                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3469                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3470
3471         for (i = 0; i < 3; i++)
3472                 stdio_fdname[i] = exec_context_fdname(c, i);
3473
3474         n_fds = p->n_storage_fds + p->n_socket_fds;
3475
3476         for (i = 0; i < n_fds  && targets > 0; i++)
3477                 if (named_iofds[STDIN_FILENO] < 0 &&
3478                     c->std_input == EXEC_INPUT_NAMED_FD &&
3479                     stdio_fdname[STDIN_FILENO] &&
3480                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3481
3482                         named_iofds[STDIN_FILENO] = p->fds[i];
3483                         targets--;
3484
3485                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3486                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3487                            stdio_fdname[STDOUT_FILENO] &&
3488                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3489
3490                         named_iofds[STDOUT_FILENO] = p->fds[i];
3491                         targets--;
3492
3493                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3494                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3495                            stdio_fdname[STDERR_FILENO] &&
3496                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3497
3498                         named_iofds[STDERR_FILENO] = p->fds[i];
3499                         targets--;
3500                 }
3501
3502         return targets == 0 ? 0 : -ENOENT;
3503 }
3504
3505 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3506         char **i, **r = NULL;
3507
3508         assert(c);
3509         assert(l);
3510
3511         STRV_FOREACH(i, c->environment_files) {
3512                 char *fn;
3513                 int k;
3514                 unsigned n;
3515                 bool ignore = false;
3516                 char **p;
3517                 _cleanup_globfree_ glob_t pglob = {};
3518
3519                 fn = *i;
3520
3521                 if (fn[0] == '-') {
3522                         ignore = true;
3523                         fn++;
3524                 }
3525
3526                 if (!path_is_absolute(fn)) {
3527                         if (ignore)
3528                                 continue;
3529
3530                         strv_free(r);
3531                         return -EINVAL;
3532                 }
3533
3534                 /* Filename supports globbing, take all matching files */
3535                 k = safe_glob(fn, 0, &pglob);
3536                 if (k < 0) {
3537                         if (ignore)
3538                                 continue;
3539
3540                         strv_free(r);
3541                         return k;
3542                 }
3543
3544                 /* When we don't match anything, -ENOENT should be returned */
3545                 assert(pglob.gl_pathc > 0);
3546
3547                 for (n = 0; n < pglob.gl_pathc; n++) {
3548                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3549                         if (k < 0) {
3550                                 if (ignore)
3551                                         continue;
3552
3553                                 strv_free(r);
3554                                 return k;
3555                         }
3556                         /* Log invalid environment variables with filename */
3557                         if (p) {
3558                                 InvalidEnvInfo info = {
3559                                         .unit = unit,
3560                                         .path = pglob.gl_pathv[n]
3561                                 };
3562
3563                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3564                         }
3565
3566                         if (r == NULL)
3567                                 r = p;
3568                         else {
3569                                 char **m;
3570
3571                                 m = strv_env_merge(2, r, p);
3572                                 strv_free(r);
3573                                 strv_free(p);
3574                                 if (!m)
3575                                         return -ENOMEM;
3576
3577                                 r = m;
3578                         }
3579                 }
3580         }
3581
3582         *l = r;
3583
3584         return 0;
3585 }
3586
3587 static bool tty_may_match_dev_console(const char *tty) {
3588         _cleanup_free_ char *active = NULL;
3589         char *console;
3590
3591         if (!tty)
3592                 return true;
3593
3594         tty = skip_dev_prefix(tty);
3595
3596         /* trivial identity? */
3597         if (streq(tty, "console"))
3598                 return true;
3599
3600         console = resolve_dev_console(&active);
3601         /* if we could not resolve, assume it may */
3602         if (!console)
3603                 return true;
3604
3605         /* "tty0" means the active VC, so it may be the same sometimes */
3606         return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3607 }
3608
3609 bool exec_context_may_touch_console(ExecContext *ec) {
3610
3611         return (ec->tty_reset ||
3612                 ec->tty_vhangup ||
3613                 ec->tty_vt_disallocate ||
3614                 is_terminal_input(ec->std_input) ||
3615                 is_terminal_output(ec->std_output) ||
3616                 is_terminal_output(ec->std_error)) &&
3617                tty_may_match_dev_console(exec_context_tty_path(ec));
3618 }
3619
3620 static void strv_fprintf(FILE *f, char **l) {
3621         char **g;
3622
3623         assert(f);
3624
3625         STRV_FOREACH(g, l)
3626                 fprintf(f, " %s", *g);
3627 }
3628
3629 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3630         char **e, **d;
3631         unsigned i;
3632         ExecDirectoryType dt;
3633         int r;
3634
3635         assert(c);
3636         assert(f);
3637
3638         prefix = strempty(prefix);
3639
3640         fprintf(f,
3641                 "%sUMask: %04o\n"
3642                 "%sWorkingDirectory: %s\n"
3643                 "%sRootDirectory: %s\n"
3644                 "%sNonBlocking: %s\n"
3645                 "%sPrivateTmp: %s\n"
3646                 "%sPrivateDevices: %s\n"
3647                 "%sProtectKernelTunables: %s\n"
3648                 "%sProtectKernelModules: %s\n"
3649                 "%sProtectControlGroups: %s\n"
3650                 "%sPrivateNetwork: %s\n"
3651                 "%sPrivateUsers: %s\n"
3652                 "%sProtectHome: %s\n"
3653                 "%sProtectSystem: %s\n"
3654                 "%sMountAPIVFS: %s\n"
3655                 "%sIgnoreSIGPIPE: %s\n"
3656                 "%sMemoryDenyWriteExecute: %s\n"
3657                 "%sRestrictRealtime: %s\n"
3658                 "%sKeyringMode: %s\n",
3659                 prefix, c->umask,
3660                 prefix, c->working_directory ? c->working_directory : "/",
3661                 prefix, c->root_directory ? c->root_directory : "/",
3662                 prefix, yes_no(c->non_blocking),
3663                 prefix, yes_no(c->private_tmp),
3664                 prefix, yes_no(c->private_devices),
3665                 prefix, yes_no(c->protect_kernel_tunables),
3666                 prefix, yes_no(c->protect_kernel_modules),
3667                 prefix, yes_no(c->protect_control_groups),
3668                 prefix, yes_no(c->private_network),
3669                 prefix, yes_no(c->private_users),
3670                 prefix, protect_home_to_string(c->protect_home),
3671                 prefix, protect_system_to_string(c->protect_system),
3672                 prefix, yes_no(c->mount_apivfs),
3673                 prefix, yes_no(c->ignore_sigpipe),
3674                 prefix, yes_no(c->memory_deny_write_execute),
3675                 prefix, yes_no(c->restrict_realtime),
3676                 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3677
3678         if (c->root_image)
3679                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3680
3681         STRV_FOREACH(e, c->environment)
3682                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3683
3684         STRV_FOREACH(e, c->environment_files)
3685                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3686
3687         STRV_FOREACH(e, c->pass_environment)
3688                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3689
3690         STRV_FOREACH(e, c->unset_environment)
3691                 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3692
3693         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3694
3695         for (dt = 0; dt < _EXEC_DIRECTORY_MAX; dt++) {
3696                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3697
3698                 STRV_FOREACH(d, c->directories[dt].paths)
3699                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3700         }
3701
3702         if (c->nice_set)
3703                 fprintf(f,
3704                         "%sNice: %i\n",
3705                         prefix, c->nice);
3706
3707         if (c->oom_score_adjust_set)
3708                 fprintf(f,
3709                         "%sOOMScoreAdjust: %i\n",
3710                         prefix, c->oom_score_adjust);
3711
3712         for (i = 0; i < RLIM_NLIMITS; i++)
3713                 if (c->rlimit[i]) {
3714                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3715                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3716                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3717                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3718                 }
3719
3720         if (c->ioprio_set) {
3721                 _cleanup_free_ char *class_str = NULL;
3722
3723                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3724                 if (r >= 0)
3725                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3726
3727                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3728         }
3729
3730         if (c->cpu_sched_set) {
3731                 _cleanup_free_ char *policy_str = NULL;
3732
3733                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3734                 if (r >= 0)
3735                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3736
3737                 fprintf(f,
3738                         "%sCPUSchedulingPriority: %i\n"
3739                         "%sCPUSchedulingResetOnFork: %s\n",
3740                         prefix, c->cpu_sched_priority,
3741                         prefix, yes_no(c->cpu_sched_reset_on_fork));
3742         }
3743
3744         if (c->cpuset) {
3745                 fprintf(f, "%sCPUAffinity:", prefix);
3746                 for (i = 0; i < c->cpuset_ncpus; i++)
3747                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3748                                 fprintf(f, " %u", i);
3749                 fputs("\n", f);
3750         }
3751
3752         if (c->timer_slack_nsec != NSEC_INFINITY)
3753                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3754
3755         fprintf(f,
3756                 "%sStandardInput: %s\n"
3757                 "%sStandardOutput: %s\n"
3758                 "%sStandardError: %s\n",
3759                 prefix, exec_input_to_string(c->std_input),
3760                 prefix, exec_output_to_string(c->std_output),
3761                 prefix, exec_output_to_string(c->std_error));
3762
3763         if (c->tty_path)
3764                 fprintf(f,
3765                         "%sTTYPath: %s\n"
3766                         "%sTTYReset: %s\n"
3767                         "%sTTYVHangup: %s\n"
3768                         "%sTTYVTDisallocate: %s\n",
3769                         prefix, c->tty_path,
3770                         prefix, yes_no(c->tty_reset),
3771                         prefix, yes_no(c->tty_vhangup),
3772                         prefix, yes_no(c->tty_vt_disallocate));
3773
3774         if (IN_SET(c->std_output,
3775                    EXEC_OUTPUT_SYSLOG,
3776                    EXEC_OUTPUT_KMSG,
3777                    EXEC_OUTPUT_JOURNAL,
3778                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3779                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3780                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3781             IN_SET(c->std_error,
3782                    EXEC_OUTPUT_SYSLOG,
3783                    EXEC_OUTPUT_KMSG,
3784                    EXEC_OUTPUT_JOURNAL,
3785                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3786                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3787                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3788
3789                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3790
3791                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3792                 if (r >= 0)
3793                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3794
3795                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3796                 if (r >= 0)
3797                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3798         }
3799
3800         if (c->secure_bits) {
3801                 _cleanup_free_ char *str = NULL;
3802
3803                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3804                 if (r >= 0)
3805                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
3806         }
3807
3808         if (c->capability_bounding_set != CAP_ALL) {
3809                 _cleanup_free_ char *str = NULL;
3810
3811                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
3812                 if (r >= 0)
3813                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
3814         }
3815
3816         if (c->capability_ambient_set != 0) {
3817                 _cleanup_free_ char *str = NULL;
3818
3819                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
3820                 if (r >= 0)
3821                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
3822         }
3823
3824         if (c->user)
3825                 fprintf(f, "%sUser: %s\n", prefix, c->user);
3826         if (c->group)
3827                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
3828
3829         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
3830
3831         if (strv_length(c->supplementary_groups) > 0) {
3832                 fprintf(f, "%sSupplementaryGroups:", prefix);
3833                 strv_fprintf(f, c->supplementary_groups);
3834                 fputs("\n", f);
3835         }
3836
3837         if (c->pam_name)
3838                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
3839
3840         if (strv_length(c->read_write_paths) > 0) {
3841                 fprintf(f, "%sReadWritePaths:", prefix);
3842                 strv_fprintf(f, c->read_write_paths);
3843                 fputs("\n", f);
3844         }
3845
3846         if (strv_length(c->read_only_paths) > 0) {
3847                 fprintf(f, "%sReadOnlyPaths:", prefix);
3848                 strv_fprintf(f, c->read_only_paths);
3849                 fputs("\n", f);
3850         }
3851
3852         if (strv_length(c->inaccessible_paths) > 0) {
3853                 fprintf(f, "%sInaccessiblePaths:", prefix);
3854                 strv_fprintf(f, c->inaccessible_paths);
3855                 fputs("\n", f);
3856         }
3857
3858         if (c->n_bind_mounts > 0)
3859                 for (i = 0; i < c->n_bind_mounts; i++) {
3860                         fprintf(f, "%s%s: %s:%s:%s\n", prefix,
3861                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
3862                                 c->bind_mounts[i].source,
3863                                 c->bind_mounts[i].destination,
3864                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
3865                 }
3866
3867         if (c->utmp_id)
3868                 fprintf(f,
3869                         "%sUtmpIdentifier: %s\n",
3870                         prefix, c->utmp_id);
3871
3872         if (c->selinux_context)
3873                 fprintf(f,
3874                         "%sSELinuxContext: %s%s\n",
3875                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
3876
3877         if (c->apparmor_profile)
3878                 fprintf(f,
3879                         "%sAppArmorProfile: %s%s\n",
3880                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3881
3882         if (c->smack_process_label)
3883                 fprintf(f,
3884                         "%sSmackProcessLabel: %s%s\n",
3885                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
3886
3887         if (c->personality != PERSONALITY_INVALID)
3888                 fprintf(f,
3889                         "%sPersonality: %s\n",
3890                         prefix, strna(personality_to_string(c->personality)));
3891
3892         fprintf(f,
3893                 "%sLockPersonality: %s\n",
3894                 prefix, yes_no(c->lock_personality));
3895
3896         if (c->syscall_filter) {
3897 #ifdef HAVE_SECCOMP
3898                 Iterator j;
3899                 void *id;
3900                 bool first = true;
3901 #endif
3902
3903                 fprintf(f,
3904                         "%sSystemCallFilter: ",
3905                         prefix);
3906
3907                 if (!c->syscall_whitelist)
3908                         fputc('~', f);
3909
3910 #ifdef HAVE_SECCOMP
3911                 SET_FOREACH(id, c->syscall_filter, j) {
3912                         _cleanup_free_ char *name = NULL;
3913
3914                         if (first)
3915                                 first = false;
3916                         else
3917                                 fputc(' ', f);
3918
3919                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
3920                         fputs(strna(name), f);
3921                 }
3922 #endif
3923
3924                 fputc('\n', f);
3925         }
3926
3927         if (c->syscall_archs) {
3928 #ifdef HAVE_SECCOMP
3929                 Iterator j;
3930                 void *id;
3931 #endif
3932
3933                 fprintf(f,
3934                         "%sSystemCallArchitectures:",
3935                         prefix);
3936
3937 #ifdef HAVE_SECCOMP
3938                 SET_FOREACH(id, c->syscall_archs, j)
3939                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
3940 #endif
3941                 fputc('\n', f);
3942         }
3943
3944         if (exec_context_restrict_namespaces_set(c)) {
3945                 _cleanup_free_ char *s = NULL;
3946
3947                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
3948                 if (r >= 0)
3949                         fprintf(f, "%sRestrictNamespaces: %s\n",
3950                                 prefix, s);
3951         }
3952
3953         if (c->syscall_errno > 0)
3954                 fprintf(f,
3955                         "%sSystemCallErrorNumber: %s\n",
3956                         prefix, strna(errno_to_name(c->syscall_errno)));
3957
3958         if (c->apparmor_profile)
3959                 fprintf(f,
3960                         "%sAppArmorProfile: %s%s\n",
3961                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3962 }
3963
3964 bool exec_context_maintains_privileges(ExecContext *c) {
3965         assert(c);
3966
3967         /* Returns true if the process forked off would run under
3968          * an unchanged UID or as root. */
3969
3970         if (!c->user)
3971                 return true;
3972
3973         if (streq(c->user, "root") || streq(c->user, "0"))
3974                 return true;
3975
3976         return false;
3977 }
3978
3979 int exec_context_get_effective_ioprio(ExecContext *c) {
3980         int p;
3981
3982         assert(c);
3983
3984         if (c->ioprio_set)
3985                 return c->ioprio;
3986
3987         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
3988         if (p < 0)
3989                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
3990
3991         return p;
3992 }
3993
3994 void exec_status_start(ExecStatus *s, pid_t pid) {
3995         assert(s);
3996
3997         zero(*s);
3998         s->pid = pid;
3999         dual_timestamp_get(&s->start_timestamp);
4000 }
4001
4002 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4003         assert(s);
4004
4005         if (s->pid && s->pid != pid)
4006                 zero(*s);
4007
4008         s->pid = pid;
4009         dual_timestamp_get(&s->exit_timestamp);
4010
4011         s->code = code;
4012         s->status = status;
4013
4014         if (context) {
4015                 if (context->utmp_id)
4016                         utmp_put_dead_process(context->utmp_id, pid, code, status);
4017
4018                 exec_context_tty_reset(context, NULL);
4019         }
4020 }
4021
4022 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4023         char buf[FORMAT_TIMESTAMP_MAX];
4024
4025         assert(s);
4026         assert(f);
4027
4028         if (s->pid <= 0)
4029                 return;
4030
4031         prefix = strempty(prefix);
4032
4033         fprintf(f,
4034                 "%sPID: "PID_FMT"\n",
4035                 prefix, s->pid);
4036
4037         if (dual_timestamp_is_set(&s->start_timestamp))
4038                 fprintf(f,
4039                         "%sStart Timestamp: %s\n",
4040                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4041
4042         if (dual_timestamp_is_set(&s->exit_timestamp))
4043                 fprintf(f,
4044                         "%sExit Timestamp: %s\n"
4045                         "%sExit Code: %s\n"
4046                         "%sExit Status: %i\n",
4047                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4048                         prefix, sigchld_code_to_string(s->code),
4049                         prefix, s->status);
4050 }
4051
4052 char *exec_command_line(char **argv) {
4053         size_t k;
4054         char *n, *p, **a;
4055         bool first = true;
4056
4057         assert(argv);
4058
4059         k = 1;
4060         STRV_FOREACH(a, argv)
4061                 k += strlen(*a)+3;
4062
4063         n = new(char, k);
4064         if (!n)
4065                 return NULL;
4066
4067         p = n;
4068         STRV_FOREACH(a, argv) {
4069
4070                 if (!first)
4071                         *(p++) = ' ';
4072                 else
4073                         first = false;
4074
4075                 if (strpbrk(*a, WHITESPACE)) {
4076                         *(p++) = '\'';
4077                         p = stpcpy(p, *a);
4078                         *(p++) = '\'';
4079                 } else
4080                         p = stpcpy(p, *a);
4081
4082         }
4083
4084         *p = 0;
4085
4086         /* FIXME: this doesn't really handle arguments that have
4087          * spaces and ticks in them */
4088
4089         return n;
4090 }
4091
4092 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4093         _cleanup_free_ char *cmd = NULL;
4094         const char *prefix2;
4095
4096         assert(c);
4097         assert(f);
4098
4099         prefix = strempty(prefix);
4100         prefix2 = strjoina(prefix, "\t");
4101
4102         cmd = exec_command_line(c->argv);
4103         fprintf(f,
4104                 "%sCommand Line: %s\n",
4105                 prefix, cmd ? cmd : strerror(ENOMEM));
4106
4107         exec_status_dump(&c->exec_status, f, prefix2);
4108 }
4109
4110 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4111         assert(f);
4112
4113         prefix = strempty(prefix);
4114
4115         LIST_FOREACH(command, c, c)
4116                 exec_command_dump(c, f, prefix);
4117 }
4118
4119 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4120         ExecCommand *end;
4121
4122         assert(l);
4123         assert(e);
4124
4125         if (*l) {
4126                 /* It's kind of important, that we keep the order here */
4127                 LIST_FIND_TAIL(command, *l, end);
4128                 LIST_INSERT_AFTER(command, *l, end, e);
4129         } else
4130               *l = e;
4131 }
4132
4133 int exec_command_set(ExecCommand *c, const char *path, ...) {
4134         va_list ap;
4135         char **l, *p;
4136
4137         assert(c);
4138         assert(path);
4139
4140         va_start(ap, path);
4141         l = strv_new_ap(path, ap);
4142         va_end(ap);
4143
4144         if (!l)
4145                 return -ENOMEM;
4146
4147         p = strdup(path);
4148         if (!p) {
4149                 strv_free(l);
4150                 return -ENOMEM;
4151         }
4152
4153         free(c->path);
4154         c->path = p;
4155
4156         strv_free(c->argv);
4157         c->argv = l;
4158
4159         return 0;
4160 }
4161
4162 int exec_command_append(ExecCommand *c, const char *path, ...) {
4163         _cleanup_strv_free_ char **l = NULL;
4164         va_list ap;
4165         int r;
4166
4167         assert(c);
4168         assert(path);
4169
4170         va_start(ap, path);
4171         l = strv_new_ap(path, ap);
4172         va_end(ap);
4173
4174         if (!l)
4175                 return -ENOMEM;
4176
4177         r = strv_extend_strv(&c->argv, l, false);
4178         if (r < 0)
4179                 return r;
4180
4181         return 0;
4182 }
4183
4184
4185 static int exec_runtime_allocate(ExecRuntime **rt) {
4186
4187         if (*rt)
4188                 return 0;
4189
4190         *rt = new0(ExecRuntime, 1);
4191         if (!*rt)
4192                 return -ENOMEM;
4193
4194         (*rt)->n_ref = 1;
4195         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4196
4197         return 0;
4198 }
4199
4200 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4201         int r;
4202
4203         assert(rt);
4204         assert(c);
4205         assert(id);
4206
4207         if (*rt)
4208                 return 1;
4209
4210         if (!c->private_network && !c->private_tmp)
4211                 return 0;
4212
4213         r = exec_runtime_allocate(rt);
4214         if (r < 0)
4215                 return r;
4216
4217         if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4218                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4219                         return -errno;
4220         }
4221
4222         if (c->private_tmp && !(*rt)->tmp_dir) {
4223                 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4224                 if (r < 0)
4225                         return r;
4226         }
4227
4228         return 1;
4229 }
4230
4231 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4232         assert(r);
4233         assert(r->n_ref > 0);
4234
4235         r->n_ref++;
4236         return r;
4237 }
4238
4239 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4240
4241         if (!r)
4242                 return NULL;
4243
4244         assert(r->n_ref > 0);
4245
4246         r->n_ref--;
4247         if (r->n_ref > 0)
4248                 return NULL;
4249
4250         free(r->tmp_dir);
4251         free(r->var_tmp_dir);
4252         safe_close_pair(r->netns_storage_socket);
4253         return mfree(r);
4254 }
4255
4256 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4257         assert(u);
4258         assert(f);
4259         assert(fds);
4260
4261         if (!rt)
4262                 return 0;
4263
4264         if (rt->tmp_dir)
4265                 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4266
4267         if (rt->var_tmp_dir)
4268                 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4269
4270         if (rt->netns_storage_socket[0] >= 0) {
4271                 int copy;
4272
4273                 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4274                 if (copy < 0)
4275                         return copy;
4276
4277                 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4278         }
4279
4280         if (rt->netns_storage_socket[1] >= 0) {
4281                 int copy;
4282
4283                 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4284                 if (copy < 0)
4285                         return copy;
4286
4287                 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4288         }
4289
4290         return 0;
4291 }
4292
4293 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4294         int r;
4295
4296         assert(rt);
4297         assert(key);
4298         assert(value);
4299
4300         if (streq(key, "tmp-dir")) {
4301                 char *copy;
4302
4303                 r = exec_runtime_allocate(rt);
4304                 if (r < 0)
4305                         return log_oom();
4306
4307                 copy = strdup(value);
4308                 if (!copy)
4309                         return log_oom();
4310
4311                 free((*rt)->tmp_dir);
4312                 (*rt)->tmp_dir = copy;
4313
4314         } else if (streq(key, "var-tmp-dir")) {
4315                 char *copy;
4316
4317                 r = exec_runtime_allocate(rt);
4318                 if (r < 0)
4319                         return log_oom();
4320
4321                 copy = strdup(value);
4322                 if (!copy)
4323                         return log_oom();
4324
4325                 free((*rt)->var_tmp_dir);
4326                 (*rt)->var_tmp_dir = copy;
4327
4328         } else if (streq(key, "netns-socket-0")) {
4329                 int fd;
4330
4331                 r = exec_runtime_allocate(rt);
4332                 if (r < 0)
4333                         return log_oom();
4334
4335                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4336                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4337                 else {
4338                         safe_close((*rt)->netns_storage_socket[0]);
4339                         (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4340                 }
4341         } else if (streq(key, "netns-socket-1")) {
4342                 int fd;
4343
4344                 r = exec_runtime_allocate(rt);
4345                 if (r < 0)
4346                         return log_oom();
4347
4348                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4349                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4350                 else {
4351                         safe_close((*rt)->netns_storage_socket[1]);
4352                         (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4353                 }
4354         } else
4355                 return 0;
4356
4357         return 1;
4358 }
4359
4360 static void *remove_tmpdir_thread(void *p) {
4361         _cleanup_free_ char *path = p;
4362
4363         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4364         return NULL;
4365 }
4366
4367 void exec_runtime_destroy(ExecRuntime *rt) {
4368         int r;
4369
4370         if (!rt)
4371                 return;
4372
4373         /* If there are multiple users of this, let's leave the stuff around */
4374         if (rt->n_ref > 1)
4375                 return;
4376
4377         if (rt->tmp_dir) {
4378                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4379
4380                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4381                 if (r < 0) {
4382                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4383                         free(rt->tmp_dir);
4384                 }
4385
4386                 rt->tmp_dir = NULL;
4387         }
4388
4389         if (rt->var_tmp_dir) {
4390                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4391
4392                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4393                 if (r < 0) {
4394                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4395                         free(rt->var_tmp_dir);
4396                 }
4397
4398                 rt->var_tmp_dir = NULL;
4399         }
4400
4401         safe_close_pair(rt->netns_storage_socket);
4402 }
4403
4404 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4405         [EXEC_INPUT_NULL] = "null",
4406         [EXEC_INPUT_TTY] = "tty",
4407         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4408         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4409         [EXEC_INPUT_SOCKET] = "socket",
4410         [EXEC_INPUT_NAMED_FD] = "fd",
4411 };
4412
4413 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4414
4415 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4416         [EXEC_OUTPUT_INHERIT] = "inherit",
4417         [EXEC_OUTPUT_NULL] = "null",
4418         [EXEC_OUTPUT_TTY] = "tty",
4419         [EXEC_OUTPUT_SYSLOG] = "syslog",
4420         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4421         [EXEC_OUTPUT_KMSG] = "kmsg",
4422         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4423         [EXEC_OUTPUT_JOURNAL] = "journal",
4424         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4425         [EXEC_OUTPUT_SOCKET] = "socket",
4426         [EXEC_OUTPUT_NAMED_FD] = "fd",
4427 };
4428
4429 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4430
4431 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4432         [EXEC_UTMP_INIT] = "init",
4433         [EXEC_UTMP_LOGIN] = "login",
4434         [EXEC_UTMP_USER] = "user",
4435 };
4436
4437 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4438
4439 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4440         [EXEC_PRESERVE_NO] = "no",
4441         [EXEC_PRESERVE_YES] = "yes",
4442         [EXEC_PRESERVE_RESTART] = "restart",
4443 };
4444
4445 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4446
4447 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_MAX] = {
4448         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4449         [EXEC_DIRECTORY_STATE] = "StateDirectory",
4450         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4451         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4452         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4453 };
4454
4455 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4456
4457 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
4458         [EXEC_KEYRING_INHERIT] = "inherit",
4459         [EXEC_KEYRING_PRIVATE] = "private",
4460         [EXEC_KEYRING_SHARED] = "shared",
4461 };
4462
4463 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);