src/core/execute.c

   1 /***
   2   This file is part of systemd.
   3
   4   Copyright 2010 Lennart Poettering
   5
   6   systemd is free software; you can redistribute it and/or modify it
   7   under the terms of the GNU Lesser General Public License as published by
   8   the Free Software Foundation; either version 2.1 of the License, or
   9   (at your option) any later version.
  10
  11   systemd is distributed in the hope that it will be useful, but
  12   WITHOUT ANY WARRANTY; without even the implied warranty of
  13   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14   Lesser General Public License for more details.
  15
  16   You should have received a copy of the GNU Lesser General Public License
  17   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  18 ***/
  19
  20 #include <errno.h>
  21 #include <fcntl.h>
  22 #include <glob.h>
  23 #include <grp.h>
  24 #include <poll.h>
  25 #include <signal.h>
  26 #include <string.h>
  27 #include <sys/capability.h>
  28 #include <sys/eventfd.h>
  29 #include <sys/mman.h>
  30 #include <sys/personality.h>
  31 #include <sys/prctl.h>
  32 #include <sys/shm.h>
  33 #include <sys/socket.h>
  34 #include <sys/stat.h>
  35 #include <sys/types.h>
  36 #include <sys/un.h>
  37 #include <unistd.h>
  38 #include <utmpx.h>
  39
  40 #ifdef HAVE_PAM
  41 #include <security/pam_appl.h>
  42 #endif
  43
  44 #ifdef HAVE_SELINUX
  45 #include <selinux/selinux.h>
  46 #endif
  47
  48 #ifdef HAVE_SECCOMP
  49 #include <seccomp.h>
  50 #endif
  51
  52 #ifdef HAVE_APPARMOR
  53 #include <sys/apparmor.h>
  54 #endif
  55
  56 #include "sd-messages.h"
  57
  58 #include "af-list.h"
  59 #include "alloc-util.h"
  60 #ifdef HAVE_APPARMOR
  61 #include "apparmor-util.h"
  62 #endif
  63 #include "async.h"
  64 #include "barrier.h"
  65 #include "cap-list.h"
  66 #include "capability-util.h"
  67 #include "def.h"
  68 #include "env-util.h"
  69 #include "errno-list.h"
  70 #include "execute.h"
  71 #include "exit-status.h"
  72 #include "fd-util.h"
  73 #include "fileio.h"
  74 #include "format-util.h"
  75 #include "fs-util.h"
  76 #include "glob-util.h"
  77 #include "io-util.h"
  78 #include "ioprio.h"
  79 #include "log.h"
  80 #include "macro.h"
  81 #include "missing.h"
  82 #include "mkdir.h"
  83 #include "namespace.h"
  84 #include "parse-util.h"
  85 #include "path-util.h"
  86 #include "process-util.h"
  87 #include "rlimit-util.h"
  88 #include "rm-rf.h"
  89 #ifdef HAVE_SECCOMP
  90 #include "seccomp-util.h"
  91 #endif
  92 #include "securebits.h"
  93 #include "securebits-util.h"
  94 #include "selinux-util.h"
  95 #include "signal-util.h"
  96 #include "smack-util.h"
  97 #include "special.h"
  98 #include "string-table.h"
  99 #include "string-util.h"
 100 #include "strv.h"
 101 #include "syslog-util.h"
 102 #include "terminal-util.h"
 103 #include "unit.h"
 104 #include "user-util.h"
 105 #include "util.h"
 106 #include "utmp-wtmp.h"
 107
/* Idle timeouts of 5s and 1s respectively, expressed in microseconds.
 * NOTE(review): their users are outside the part of the file visible here. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Access mode chown_terminal() applies to a TTY and then verifies.
 * This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size (8 MiB) requested for the journal stream socket in connect_logger_as() */
#define SNDBUF_SIZE (8*1024*1024)
 115
 116 static int shift_fds(int fds[], unsigned n_fds) {
 117         int start, restart_from;
 118
 119         if (n_fds <= 0)
 120                 return 0;
 121
 122         /* Modifies the fds array! (sorts it) */
 123
 124         assert(fds);
 125
 126         start = 0;
 127         for (;;) {
 128                 int i;
 129
 130                 restart_from = -1;
 131
 132                 for (i = start; i < (int) n_fds; i++) {
 133                         int nfd;
 134
 135                         /* Already at right index? */
 136                         if (fds[i] == i+3)
 137                                 continue;
 138
 139                         nfd = fcntl(fds[i], F_DUPFD, i + 3);
 140                         if (nfd < 0)
 141                                 return -errno;
 142
 143                         safe_close(fds[i]);
 144                         fds[i] = nfd;
 145
 146                         /* Hmm, the fd we wanted isn't free? Then
 147                          * let's remember that and try again from here */
 148                         if (nfd != i+3 && restart_from < 0)
 149                                 restart_from = i;
 150                 }
 151
 152                 if (restart_from < 0)
 153                         break;
 154
 155                 start = restart_from;
 156         }
 157
 158         return 0;
 159 }
 160
 161 static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
 162         unsigned i, n_fds;
 163         int r;
 164
 165         n_fds = n_storage_fds + n_socket_fds;
 166         if (n_fds <= 0)
 167                 return 0;
 168
 169         assert(fds);
 170
 171         /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
 172          * O_NONBLOCK only applies to socket activation though. */
 173
 174         for (i = 0; i < n_fds; i++) {
 175
 176                 if (i < n_socket_fds) {
 177                         r = fd_nonblock(fds[i], nonblock);
 178                         if (r < 0)
 179                                 return r;
 180                 }
 181
 182                 /* We unconditionally drop FD_CLOEXEC from the fds,
 183                  * since after all we want to pass these fds to our
 184                  * children */
 185
 186                 r = fd_cloexec(fds[i], false);
 187                 if (r < 0)
 188                         return r;
 189         }
 190
 191         return 0;
 192 }
 193
 194 static const char *exec_context_tty_path(const ExecContext *context) {
 195         assert(context);
 196
 197         if (context->stdio_as_fds)
 198                 return NULL;
 199
 200         if (context->tty_path)
 201                 return context->tty_path;
 202
 203         return "/dev/console";
 204 }
 205
 206 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
 207         const char *path;
 208
 209         assert(context);
 210
 211         path = exec_context_tty_path(context);
 212
 213         if (context->tty_vhangup) {
 214                 if (p && p->stdin_fd >= 0)
 215                         (void) terminal_vhangup_fd(p->stdin_fd);
 216                 else if (path)
 217                         (void) terminal_vhangup(path);
 218         }
 219
 220         if (context->tty_reset) {
 221                 if (p && p->stdin_fd >= 0)
 222                         (void) reset_terminal_fd(p->stdin_fd, true);
 223                 else if (path)
 224                         (void) reset_terminal(path);
 225         }
 226
 227         if (context->tty_vt_disallocate && path)
 228                 (void) vt_disallocate(path);
 229 }
 230
 231 static bool is_terminal_input(ExecInput i) {
 232         return IN_SET(i,
 233                       EXEC_INPUT_TTY,
 234                       EXEC_INPUT_TTY_FORCE,
 235                       EXEC_INPUT_TTY_FAIL);
 236 }
 237
 238 static bool is_terminal_output(ExecOutput o) {
 239         return IN_SET(o,
 240                       EXEC_OUTPUT_TTY,
 241                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
 242                       EXEC_OUTPUT_KMSG_AND_CONSOLE,
 243                       EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
 244 }
 245
 246 static bool is_syslog_output(ExecOutput o) {
 247         return IN_SET(o,
 248                       EXEC_OUTPUT_SYSLOG,
 249                       EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
 250 }
 251
 252 static bool is_kmsg_output(ExecOutput o) {
 253         return IN_SET(o,
 254                       EXEC_OUTPUT_KMSG,
 255                       EXEC_OUTPUT_KMSG_AND_CONSOLE);
 256 }
 257
 258 static bool exec_context_needs_term(const ExecContext *c) {
 259         assert(c);
 260
 261         /* Return true if the execution context suggests we should set $TERM to something useful. */
 262
 263         if (is_terminal_input(c->std_input))
 264                 return true;
 265
 266         if (is_terminal_output(c->std_output))
 267                 return true;
 268
 269         if (is_terminal_output(c->std_error))
 270                 return true;
 271
 272         return !!c->tty_path;
 273 }
 274
 275 static int open_null_as(int flags, int nfd) {
 276         int fd, r;
 277
 278         assert(nfd >= 0);
 279
 280         fd = open("/dev/null", flags|O_NOCTTY);
 281         if (fd < 0)
 282                 return -errno;
 283
 284         if (fd != nfd) {
 285                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 286                 safe_close(fd);
 287         } else
 288                 r = nfd;
 289
 290         return r;
 291 }
 292
 293 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
 294         static const union sockaddr_union sa = {
 295                 .un.sun_family = AF_UNIX,
 296                 .un.sun_path = "/run/systemd/journal/stdout",
 297         };
 298         uid_t olduid = UID_INVALID;
 299         gid_t oldgid = GID_INVALID;
 300         int r;
 301
 302         if (gid_is_valid(gid)) {
 303                 oldgid = getgid();
 304
 305                 if (setegid(gid) < 0)
 306                         return -errno;
 307         }
 308
 309         if (uid_is_valid(uid)) {
 310                 olduid = getuid();
 311
 312                 if (seteuid(uid) < 0) {
 313                         r = -errno;
 314                         goto restore_gid;
 315                 }
 316         }
 317
 318         r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
 319
 320         /* If we fail to restore the uid or gid, things will likely
 321            fail later on. This should only happen if an LSM interferes. */
 322
 323         if (uid_is_valid(uid))
 324                 (void) seteuid(olduid);
 325
 326  restore_gid:
 327         if (gid_is_valid(gid))
 328                 (void) setegid(oldgid);
 329
 330         return r;
 331 }
 332
 333 static int connect_logger_as(
 334                 Unit *unit,
 335                 const ExecContext *context,
 336                 const ExecParameters *params,
 337                 ExecOutput output,
 338                 const char *ident,
 339                 int nfd,
 340                 uid_t uid,
 341                 gid_t gid) {
 342
 343         int fd, r;
 344
 345         assert(context);
 346         assert(params);
 347         assert(output < _EXEC_OUTPUT_MAX);
 348         assert(ident);
 349         assert(nfd >= 0);
 350
 351         fd = socket(AF_UNIX, SOCK_STREAM, 0);
 352         if (fd < 0)
 353                 return -errno;
 354
 355         r = connect_journal_socket(fd, uid, gid);
 356         if (r < 0)
 357                 return r;
 358
 359         if (shutdown(fd, SHUT_RD) < 0) {
 360                 safe_close(fd);
 361                 return -errno;
 362         }
 363
 364         (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
 365
 366         dprintf(fd,
 367                 "%s\n"
 368                 "%s\n"
 369                 "%i\n"
 370                 "%i\n"
 371                 "%i\n"
 372                 "%i\n"
 373                 "%i\n",
 374                 context->syslog_identifier ?: ident,
 375                 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
 376                 context->syslog_priority,
 377                 !!context->syslog_level_prefix,
 378                 is_syslog_output(output),
 379                 is_kmsg_output(output),
 380                 is_terminal_output(output));
 381
 382         if (fd == nfd)
 383                 return nfd;
 384
 385         r = dup2(fd, nfd) < 0 ? -errno : nfd;
 386         safe_close(fd);
 387
 388         return r;
 389 }
 390 static int open_terminal_as(const char *path, mode_t mode, int nfd) {
 391         int fd, r;
 392
 393         assert(path);
 394         assert(nfd >= 0);
 395
 396         fd = open_terminal(path, mode | O_NOCTTY);
 397         if (fd < 0)
 398                 return fd;
 399
 400         if (fd != nfd) {
 401                 r = dup2(fd, nfd) < 0 ? -errno : nfd;
 402                 safe_close(fd);
 403         } else
 404                 r = nfd;
 405
 406         return r;
 407 }
 408
 409 static int fixup_input(ExecInput std_input, int socket_fd, bool apply_tty_stdin) {
 410
 411         if (is_terminal_input(std_input) && !apply_tty_stdin)
 412                 return EXEC_INPUT_NULL;
 413
 414         if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
 415                 return EXEC_INPUT_NULL;
 416
 417         return std_input;
 418 }
 419
 420 static int fixup_output(ExecOutput std_output, int socket_fd) {
 421
 422         if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
 423                 return EXEC_OUTPUT_INHERIT;
 424
 425         return std_output;
 426 }
 427
 428 static int setup_input(
 429                 const ExecContext *context,
 430                 const ExecParameters *params,
 431                 int socket_fd,
 432                 int named_iofds[3]) {
 433
 434         ExecInput i;
 435
 436         assert(context);
 437         assert(params);
 438
 439         if (params->stdin_fd >= 0) {
 440                 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
 441                         return -errno;
 442
 443                 /* Try to make this the controlling tty, if it is a tty, and reset it */
 444                 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
 445                 (void) reset_terminal_fd(STDIN_FILENO, true);
 446
 447                 return STDIN_FILENO;
 448         }
 449
 450         i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
 451
 452         switch (i) {
 453
 454         case EXEC_INPUT_NULL:
 455                 return open_null_as(O_RDONLY, STDIN_FILENO);
 456
 457         case EXEC_INPUT_TTY:
 458         case EXEC_INPUT_TTY_FORCE:
 459         case EXEC_INPUT_TTY_FAIL: {
 460                 int fd, r;
 461
 462                 fd = acquire_terminal(exec_context_tty_path(context),
 463                                       i == EXEC_INPUT_TTY_FAIL,
 464                                       i == EXEC_INPUT_TTY_FORCE,
 465                                       false,
 466                                       USEC_INFINITY);
 467                 if (fd < 0)
 468                         return fd;
 469
 470                 if (fd != STDIN_FILENO) {
 471                         r = dup2(fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 472                         safe_close(fd);
 473                 } else
 474                         r = STDIN_FILENO;
 475
 476                 return r;
 477         }
 478
 479         case EXEC_INPUT_SOCKET:
 480                 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 481
 482         case EXEC_INPUT_NAMED_FD:
 483                 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
 484                 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
 485
 486         default:
 487                 assert_not_reached("Unknown input type");
 488         }
 489 }
 490
/* Sets up the descriptor 'fileno' (STDOUT_FILENO or STDERR_FILENO) according to the execution
 * context.
 *
 * An explicitly passed params->stdout_fd/params->stderr_fd takes precedence. Otherwise the
 * configured output mode is used, after fixup_output() has downgraded it where necessary.
 * For stderr the code additionally tries to share the stdout descriptor where both are
 * configured alike.
 *
 * If the output ends up connected to the journal via a stream, the device and inode number of
 * that stream are stored in *journal_stream_dev and *journal_stream_ino; otherwise these are
 * left untouched.
 *
 * Returns 'fileno' on success, a negative error on failure. */
static int setup_output(
                Unit *unit,
                const ExecContext *context,
                const ExecParameters *params,
                int fileno,
                int socket_fd,
                int named_iofds[3],
                const char *ident,
                uid_t uid,
                gid_t gid,
                dev_t *journal_stream_dev,
                ino_t *journal_stream_ino) {

        ExecOutput o;
        ExecInput i;
        int r;

        assert(unit);
        assert(context);
        assert(params);
        assert(ident);
        assert(journal_stream_dev);
        assert(journal_stream_ino);

        /* Explicitly passed fds override whatever the context configures */
        if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {

                if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
                        return -errno;

                return STDOUT_FILENO;
        }

        if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
                if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
                        return -errno;

                return STDERR_FILENO;
        }

        /* 'i' and 'o' are the effective (possibly downgraded) stdin/stdout modes */
        i = fixup_input(context->std_input, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
        o = fixup_output(context->std_output, socket_fd);

        if (fileno == STDERR_FILENO) {
                ExecOutput e;
                e = fixup_output(context->std_error, socket_fd);

                /* This expects the input and output are already set up */

                /* Don't change the stderr file descriptor if we inherit all
                 * the way and are not on a tty */
                if (e == EXEC_OUTPUT_INHERIT &&
                    o == EXEC_OUTPUT_INHERIT &&
                    i == EXEC_INPUT_NULL &&
                    !is_terminal_input(context->std_input) &&
                    getppid () != 1)
                        return fileno;

                /* Duplicate from stdout if possible */
                if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
                        return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;

                /* From here on, dispatch on the stderr mode rather than the stdout mode */
                o = e;

        } else if (o == EXEC_OUTPUT_INHERIT) {
                /* If input got downgraded, inherit the original value */
                if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
                        return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

                /* If the input is connected to anything that's not a /dev/null, inherit that... */
                if (i != EXEC_INPUT_NULL)
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
                if (getppid() != 1)
                        return fileno;

                /* We need to open /dev/null here anew, to get the right access mode. */
                return open_null_as(O_WRONLY, fileno);
        }

        switch (o) {

        case EXEC_OUTPUT_NULL:
                return open_null_as(O_WRONLY, fileno);

        case EXEC_OUTPUT_TTY:
                /* If stdin is a terminal too, share its descriptor */
                if (is_terminal_input(i))
                        return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;

                /* We don't reset the terminal if this is just about output */
                return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);

        case EXEC_OUTPUT_SYSLOG:
        case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
        case EXEC_OUTPUT_KMSG:
        case EXEC_OUTPUT_KMSG_AND_CONSOLE:
        case EXEC_OUTPUT_JOURNAL:
        case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
                r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
                if (r < 0) {
                        /* Not fatal: log the problem and fall back to /dev/null */
                        log_unit_error_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
                        r = open_null_as(O_WRONLY, fileno);
                } else {
                        struct stat st;

                        /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
                         * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
                         * services to detect whether they are connected to the journal or not. */

                        if (fstat(fileno, &st) >= 0) {
                                *journal_stream_dev = st.st_dev;
                                *journal_stream_ino = st.st_ino;
                        }
                }
                return r;

        case EXEC_OUTPUT_SOCKET:
                assert(socket_fd >= 0);
                return dup2(socket_fd, fileno) < 0 ? -errno : fileno;

        case EXEC_OUTPUT_NAMED_FD:
                /* Best effort: switch the named fd to blocking mode before handing it over */
                (void) fd_nonblock(named_iofds[fileno], false);
                return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;

        default:
                assert_not_reached("Unknown error type");
        }
}
 619
 620 static int chown_terminal(int fd, uid_t uid) {
 621         struct stat st;
 622
 623         assert(fd >= 0);
 624
 625         /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
 626         if (isatty(fd) < 1)
 627                 return 0;
 628
 629         /* This might fail. What matters are the results. */
 630         (void) fchown(fd, uid, -1);
 631         (void) fchmod(fd, TTY_MODE);
 632
 633         if (fstat(fd, &st) < 0)
 634                 return -errno;
 635
 636         if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
 637                 return -EPERM;
 638
 639         return 0;
 640 }
 641
 642 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
 643         _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
 644         int r;
 645
 646         assert(_saved_stdin);
 647         assert(_saved_stdout);
 648
 649         saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
 650         if (saved_stdin < 0)
 651                 return -errno;
 652
 653         saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
 654         if (saved_stdout < 0)
 655                 return -errno;
 656
 657         fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
 658         if (fd < 0)
 659                 return fd;
 660
 661         r = chown_terminal(fd, getuid());
 662         if (r < 0)
 663                 return r;
 664
 665         r = reset_terminal_fd(fd, true);
 666         if (r < 0)
 667                 return r;
 668
 669         if (dup2(fd, STDIN_FILENO) < 0)
 670                 return -errno;
 671
 672         if (dup2(fd, STDOUT_FILENO) < 0)
 673                 return -errno;
 674
 675         if (fd >= 2)
 676                 safe_close(fd);
 677         fd = -1;
 678
 679         *_saved_stdin = saved_stdin;
 680         *_saved_stdout = saved_stdout;
 681
 682         saved_stdin = saved_stdout = -1;
 683
 684         return 0;
 685 }
 686
 687 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
 688         assert(err < 0);
 689
 690         if (err == -ETIMEDOUT)
 691                 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
 692         else {
 693                 errno = -err;
 694                 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
 695         }
 696 }
 697
 698 static void write_confirm_error(int err, const char *vc, const Unit *u) {
 699         _cleanup_close_ int fd = -1;
 700
 701         assert(vc);
 702
 703         fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
 704         if (fd < 0)
 705                 return;
 706
 707         write_confirm_error_fd(err, fd, u);
 708 }
 709
 710 static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
 711         int r = 0;
 712
 713         assert(saved_stdin);
 714         assert(saved_stdout);
 715
 716         release_terminal();
 717
 718         if (*saved_stdin >= 0)
 719                 if (dup2(*saved_stdin, STDIN_FILENO) < 0)
 720                         r = -errno;
 721
 722         if (*saved_stdout >= 0)
 723                 if (dup2(*saved_stdout, STDOUT_FILENO) < 0)
 724                         r = -errno;
 725
 726         *saved_stdin = safe_close(*saved_stdin);
 727         *saved_stdout = safe_close(*saved_stdout);
 728
 729         return r;
 730 }
 731
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* 'f': don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* 's': don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE = 1,          /* 'y', and any internal error: execute the command */
};
 737
 738 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
 739         int saved_stdout = -1, saved_stdin = -1, r;
 740         _cleanup_free_ char *e = NULL;
 741         char c;
 742
 743         /* For any internal errors, assume a positive response. */
 744         r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
 745         if (r < 0) {
 746                 write_confirm_error(r, vc, u);
 747                 return CONFIRM_EXECUTE;
 748         }
 749
 750         /* confirm_spawn might have been disabled while we were sleeping. */
 751         if (manager_is_confirm_spawn_disabled(u->manager)) {
 752                 r = 1;
 753                 goto restore_stdio;
 754         }
 755
 756         e = ellipsize(cmdline, 60, 100);
 757         if (!e) {
 758                 log_oom();
 759                 r = CONFIRM_EXECUTE;
 760                 goto restore_stdio;
 761         }
 762
 763         for (;;) {
 764                 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
 765                 if (r < 0) {
 766                         write_confirm_error_fd(r, STDOUT_FILENO, u);
 767                         r = CONFIRM_EXECUTE;
 768                         goto restore_stdio;
 769                 }
 770
 771                 switch (c) {
 772                 case 'c':
 773                         printf("Resuming normal execution.\n");
 774                         manager_disable_confirm_spawn();
 775                         r = 1;
 776                         break;
 777                 case 'D':
 778                         unit_dump(u, stdout, "  ");
 779                         continue; /* ask again */
 780                 case 'f':
 781                         printf("Failing execution.\n");
 782                         r = CONFIRM_PRETEND_FAILURE;
 783                         break;
 784                 case 'h':
 785                         printf("  c - continue, proceed without asking anymore\n"
 786                                "  D - dump, show the state of the unit\n"
 787                                "  f - fail, don't execute the command and pretend it failed\n"
 788                                "  h - help\n"
 789                                "  i - info, show a short summary of the unit\n"
 790                                "  j - jobs, show jobs that are in progress\n"
 791                                "  s - skip, don't execute the command and pretend it succeeded\n"
 792                                "  y - yes, execute the command\n");
 793                         continue; /* ask again */
 794                 case 'i':
 795                         printf("  Description: %s\n"
 796                                "  Unit:        %s\n"
 797                                "  Command:     %s\n",
 798                                u->id, u->description, cmdline);
 799                         continue; /* ask again */
 800                 case 'j':
 801                         manager_dump_jobs(u->manager, stdout, "  ");
 802                         continue; /* ask again */
 803                 case 'n':
 804                         /* 'n' was removed in favor of 'f'. */
 805                         printf("Didn't understand 'n', did you mean 'f'?\n");
 806                         continue; /* ask again */
 807                 case 's':
 808                         printf("Skipping execution.\n");
 809                         r = CONFIRM_PRETEND_SUCCESS;
 810                         break;
 811                 case 'y':
 812                         r = CONFIRM_EXECUTE;
 813                         break;
 814                 default:
 815                         assert_not_reached("Unhandled choice");
 816                 }
 817                 break;
 818         }
 819
 820 restore_stdio:
 821         restore_confirm_stdio(&saved_stdin, &saved_stdout);
 822         return r;
 823 }
 824
 825 static int get_fixed_user(const ExecContext *c, const char **user,
 826                           uid_t *uid, gid_t *gid,
 827                           const char **home, const char **shell) {
 828         int r;
 829         const char *name;
 830
 831         assert(c);
 832
 833         if (!c->user)
 834                 return 0;
 835
 836         /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
 837          * (i.e. are "/" or "/bin/nologin"). */
 838
 839         name = c->user;
 840         r = get_user_creds_clean(&name, uid, gid, home, shell);
 841         if (r < 0)
 842                 return r;
 843
 844         *user = name;
 845         return 0;
 846 }
 847
 848 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
 849         int r;
 850         const char *name;
 851
 852         assert(c);
 853
 854         if (!c->group)
 855                 return 0;
 856
 857         name = c->group;
 858         r = get_group_creds(&name, gid);
 859         if (r < 0)
 860                 return r;
 861
 862         *group = name;
 863         return 0;
 864 }
 865
 866 static int get_supplementary_groups(const ExecContext *c, const char *user,
 867                                     const char *group, gid_t gid,
 868                                     gid_t **supplementary_gids, int *ngids) {
 869         char **i;
 870         int r, k = 0;
 871         int ngroups_max;
 872         bool keep_groups = false;
 873         gid_t *groups = NULL;
 874         _cleanup_free_ gid_t *l_gids = NULL;
 875
 876         assert(c);
 877
 878         /*
 879          * If user is given, then lookup GID and supplementary groups list.
 880          * We avoid NSS lookups for gid=0. Also we have to initialize groups
 881          * here and as early as possible so we keep the list of supplementary
 882          * groups of the caller.
 883          */
 884         if (user && gid_is_valid(gid) && gid != 0) {
 885                 /* First step, initialize groups from /etc/groups */
 886                 if (initgroups(user, gid) < 0)
 887                         return -errno;
 888
 889                 keep_groups = true;
 890         }
 891
 892         if (!c->supplementary_groups)
 893                 return 0;
 894
 895         /*
 896          * If SupplementaryGroups= was passed then NGROUPS_MAX has to
 897          * be positive, otherwise fail.
 898          */
 899         errno = 0;
 900         ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
 901         if (ngroups_max <= 0) {
 902                 if (errno > 0)
 903                         return -errno;
 904                 else
 905                         return -EOPNOTSUPP; /* For all other values */
 906         }
 907
 908         l_gids = new(gid_t, ngroups_max);
 909         if (!l_gids)
 910                 return -ENOMEM;
 911
 912         if (keep_groups) {
 913                 /*
 914                  * Lookup the list of groups that the user belongs to, we
 915                  * avoid NSS lookups here too for gid=0.
 916                  */
 917                 k = ngroups_max;
 918                 if (getgrouplist(user, gid, l_gids, &k) < 0)
 919                         return -EINVAL;
 920         } else
 921                 k = 0;
 922
 923         STRV_FOREACH(i, c->supplementary_groups) {
 924                 const char *g;
 925
 926                 if (k >= ngroups_max)
 927                         return -E2BIG;
 928
 929                 g = *i;
 930                 r = get_group_creds(&g, l_gids+k);
 931                 if (r < 0)
 932                         return r;
 933
 934                 k++;
 935         }
 936
 937         /*
 938          * Sets ngids to zero to drop all supplementary groups, happens
 939          * when we are under root and SupplementaryGroups= is empty.
 940          */
 941         if (k == 0) {
 942                 *ngids = 0;
 943                 return 0;
 944         }
 945
 946         /* Otherwise get the final list of supplementary groups */
 947         groups = memdup(l_gids, sizeof(gid_t) * k);
 948         if (!groups)
 949                 return -ENOMEM;
 950
 951         *supplementary_gids = groups;
 952         *ngids = k;
 953
 954         groups = NULL;
 955
 956         return 0;
 957 }
 958
 959 static int enforce_groups(const ExecContext *context, gid_t gid,
 960                           gid_t *supplementary_gids, int ngids) {
 961         int r;
 962
 963         assert(context);
 964
 965         /* Handle SupplementaryGroups= even if it is empty */
 966         if (context->supplementary_groups) {
 967                 r = maybe_setgroups(ngids, supplementary_gids);
 968                 if (r < 0)
 969                         return r;
 970         }
 971
 972         if (gid_is_valid(gid)) {
 973                 /* Then set our gids */
 974                 if (setresgid(gid, gid, gid) < 0)
 975                         return -errno;
 976         }
 977
 978         return 0;
 979 }
 980
 981 static int enforce_user(const ExecContext *context, uid_t uid) {
 982         assert(context);
 983
 984         if (!uid_is_valid(uid))
 985                 return 0;
 986
 987         /* Sets (but doesn't look up) the uid and make sure we keep the
 988          * capabilities while doing so. */
 989
 990         if (context->capability_ambient_set != 0) {
 991
 992                 /* First step: If we need to keep capabilities but
 993                  * drop privileges we need to make sure we keep our
 994                  * caps, while we drop privileges. */
 995                 if (uid != 0) {
 996                         int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
 997
 998                         if (prctl(PR_GET_SECUREBITS) != sb)
 999                                 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1000                                         return -errno;
1001                 }
1002         }
1003
1004         /* Second step: actually set the uids */
1005         if (setresuid(uid, uid, uid) < 0)
1006                 return -errno;
1007
1008         /* At this point we should have all necessary capabilities but
1009            are otherwise a normal user. However, the caps might got
1010            corrupted due to the setresuid() so we need clean them up
1011            later. This is done outside of this call. */
1012
1013         return 0;
1014 }
1015
1016 #ifdef HAVE_PAM
1017
1018 static int null_conv(
1019                 int num_msg,
1020                 const struct pam_message **msg,
1021                 struct pam_response **resp,
1022                 void *appdata_ptr) {
1023
1024         /* We don't support conversations */
1025
1026         return PAM_CONV_ERR;
1027 }
1028
1029 #endif
1030
/* Opens a PAM session for service 'name' and user 'user' in the calling process, then forks off a helper child
 * ("(sd-pam)") that waits for the calling process to go away and closes the session again.
 *
 *   name       PAM service name, handed to pam_start()
 *   user       user name the session is opened for
 *   uid, gid   credentials the helper child switches to before it starts waiting
 *   tty        optional TTY path; if non-NULL it is set as PAM_TTY
 *   env        in/out: the entries of *env are exported to PAM with pam_putenv(); on success *env is freed and
 *              replaced by the list returned by pam_getenvlist()
 *   fds, n_fds file descriptors that are closed inside the helper child
 *
 * Returns 0 on success. On failure a negative errno-style value is returned; PAM-level errors are reported as
 * -EPERM. If compiled without HAVE_PAM this function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#ifdef HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        pam_pid = fork();
        if (pam_pid < 0) {
                r = -errno;
                goto fail;
        }

        if (pam_pid == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* This string must fit in 10 chars (i.e. the length
                 * of "/sbin/init"), to look pretty in /bin/ps */
                rename_process("(sd-pam)");

                /* Make sure we don't keep open the passed fds in this
                child. We assume that otherwise only those fds are
                open here that have been opened by PAM. */
                close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* Note that sigwait() does not set errno: it returns 0 on
                                 * success and a positive error number on failure. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment block PAM gave us back to the caller, replacing the old one */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
1247
/* Renames the calling process to "(<name>)", where <name> is the last path component of 'path', cut down to
 * its final 8 characters if it is longer than that. If the last component is empty, "(...)" is used. */
static void rename_process_from_path(const char *path) {
        /* '(' + up to 8 characters + ')' + NUL. The result must fit in 10 chars (i.e. the length of
         * "/sbin/init") to look pretty in /bin/ps */
        char buf[11];
        const char *name;
        size_t n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        n = strlen(name);
        if (n > 8) {
                /* Keep the tail rather than the head: the tail is usually the more interesting
                 * part, since the beginning might just be "systemd-" */
                name += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, name, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1278
1279 static bool context_has_address_families(const ExecContext *c) {
1280         assert(c);
1281
1282         return c->address_families_whitelist ||
1283                 !set_isempty(c->address_families);
1284 }
1285
1286 static bool context_has_syscall_filters(const ExecContext *c) {
1287         assert(c);
1288
1289         return c->syscall_whitelist ||
1290                 !set_isempty(c->syscall_filter);
1291 }
1292
/* Decides whether the "no new privileges" flag is needed for this context. It is needed if it was requested
 * explicitly, or if we lack CAP_SYS_ADMIN and at least one of the settings checked below (all of which
 * involve some form of seccomp) is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: NNP is required as soon as any form of seccomp is in use */
        if (context_has_address_families(c))
                return true;

        if (c->memory_deny_write_execute || c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables || c->protect_kernel_modules || c->private_devices)
                return true;

        if (context_has_syscall_filters(c))
                return true;

        if (!set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1314
1315 #ifdef HAVE_SECCOMP
1316
1317 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1318
1319         if (is_seccomp_available())
1320                 return false;
1321
1322         log_open();
1323         log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1324         log_close();
1325         return true;
1326 }
1327
1328 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1329         uint32_t negative_action, default_action, action;
1330         int r;
1331
1332         assert(u);
1333         assert(c);
1334
1335         if (!context_has_syscall_filters(c))
1336                 return 0;
1337
1338         if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1339                 return 0;
1340
1341         negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1342
1343         if (c->syscall_whitelist) {
1344                 default_action = negative_action;
1345                 action = SCMP_ACT_ALLOW;
1346         } else {
1347                 default_action = SCMP_ACT_ALLOW;
1348                 action = negative_action;
1349         }
1350
1351         if (needs_ambient_hack) {
1352                 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1353                 if (r < 0)
1354                         return r;
1355         }
1356
1357         return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1358 }
1359
1360 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1361         assert(u);
1362         assert(c);
1363
1364         if (set_isempty(c->syscall_archs))
1365                 return 0;
1366
1367         if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1368                 return 0;
1369
1370         return seccomp_restrict_archs(c->syscall_archs);
1371 }
1372
1373 static int apply_address_families(const Unit* u, const ExecContext *c) {
1374         assert(u);
1375         assert(c);
1376
1377         if (!context_has_address_families(c))
1378                 return 0;
1379
1380         if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1381                 return 0;
1382
1383         return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1384 }
1385
1386 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1387         assert(u);
1388         assert(c);
1389
1390         if (!c->memory_deny_write_execute)
1391                 return 0;
1392
1393         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1394                 return 0;
1395
1396         return seccomp_memory_deny_write_execute();
1397 }
1398
1399 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1400         assert(u);
1401         assert(c);
1402
1403         if (!c->restrict_realtime)
1404                 return 0;
1405
1406         if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1407                 return 0;
1408
1409         return seccomp_restrict_realtime();
1410 }
1411
1412 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1413         assert(u);
1414         assert(c);
1415
1416         /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1417          * let's protect even those systems where this is left on in the kernel. */
1418
1419         if (!c->protect_kernel_tunables)
1420                 return 0;
1421
1422         if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1423                 return 0;
1424
1425         return seccomp_protect_sysctl();
1426 }
1427
1428 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1429         assert(u);
1430         assert(c);
1431
1432         /* Turn off module syscalls on ProtectKernelModules=yes */
1433
1434         if (!c->protect_kernel_modules)
1435                 return 0;
1436
1437         if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1438                 return 0;
1439
1440         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1441 }
1442
1443 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1444         assert(u);
1445         assert(c);
1446
1447         /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1448
1449         if (!c->private_devices)
1450                 return 0;
1451
1452         if (skip_seccomp_unavailable(u, "PrivateDevices="))
1453                 return 0;
1454
1455         return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1456 }
1457
1458 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1459         assert(u);
1460         assert(c);
1461
1462         if (!exec_context_restrict_namespaces_set(c))
1463                 return 0;
1464
1465         if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1466                 return 0;
1467
1468         return seccomp_restrict_namespaces(c->restrict_namespaces);
1469 }
1470
1471 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1472         unsigned long personality;
1473         int r;
1474
1475         assert(u);
1476         assert(c);
1477
1478         if (!c->lock_personality)
1479                 return 0;
1480
1481         if (skip_seccomp_unavailable(u, "LockPersonality="))
1482                 return 0;
1483
1484         personality = c->personality;
1485
1486         /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1487         if (personality == PERSONALITY_INVALID) {
1488
1489                 r = opinionated_personality(&personality);
1490                 if (r < 0)
1491                         return r;
1492         }
1493
1494         return seccomp_lock_personality(personality);
1495 }
1496
1497 #endif
1498
1499 static void do_idle_pipe_dance(int idle_pipe[4]) {
1500         assert(idle_pipe);
1501
1502         idle_pipe[1] = safe_close(idle_pipe[1]);
1503         idle_pipe[2] = safe_close(idle_pipe[2]);
1504
1505         if (idle_pipe[0] >= 0) {
1506                 int r;
1507
1508                 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1509
1510                 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1511                         ssize_t n;
1512
1513                         /* Signal systemd that we are bored and want to continue. */
1514                         n = write(idle_pipe[3], "x", 1);
1515                         if (n > 0)
1516                                 /* Wait for systemd to react to the signal above. */
1517                                 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1518                 }
1519
1520                 idle_pipe[0] = safe_close(idle_pipe[0]);
1521
1522         }
1523
1524         idle_pipe[3] = safe_close(idle_pipe[3]);
1525 }
1526
1527 static int build_environment(
1528                 Unit *u,
1529                 const ExecContext *c,
1530                 const ExecParameters *p,
1531                 unsigned n_fds,
1532                 const char *home,
1533                 const char *username,
1534                 const char *shell,
1535                 dev_t journal_stream_dev,
1536                 ino_t journal_stream_ino,
1537                 char ***ret) {
1538
1539         _cleanup_strv_free_ char **our_env = NULL;
1540         unsigned n_env = 0;
1541         char *x;
1542
1543         assert(u);
1544         assert(c);
1545         assert(ret);
1546
1547         our_env = new0(char*, 14);
1548         if (!our_env)
1549                 return -ENOMEM;
1550
1551         if (n_fds > 0) {
1552                 _cleanup_free_ char *joined = NULL;
1553
1554                 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1555                         return -ENOMEM;
1556                 our_env[n_env++] = x;
1557
1558                 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1559                         return -ENOMEM;
1560                 our_env[n_env++] = x;
1561
1562                 joined = strv_join(p->fd_names, ":");
1563                 if (!joined)
1564                         return -ENOMEM;
1565
1566                 x = strjoin("LISTEN_FDNAMES=", joined);
1567                 if (!x)
1568                         return -ENOMEM;
1569                 our_env[n_env++] = x;
1570         }
1571
1572         if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1573                 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1574                         return -ENOMEM;
1575                 our_env[n_env++] = x;
1576
1577                 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1578                         return -ENOMEM;
1579                 our_env[n_env++] = x;
1580         }
1581
1582         /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1583          * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1584          * check the database directly. */
1585         if (p->flags & EXEC_NSS_BYPASS_BUS) {
1586                 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1587                 if (!x)
1588                         return -ENOMEM;
1589                 our_env[n_env++] = x;
1590         }
1591
1592         if (home) {
1593                 x = strappend("HOME=", home);
1594                 if (!x)
1595                         return -ENOMEM;
1596                 our_env[n_env++] = x;
1597         }
1598
1599         if (username) {
1600                 x = strappend("LOGNAME=", username);
1601                 if (!x)
1602                         return -ENOMEM;
1603                 our_env[n_env++] = x;
1604
1605                 x = strappend("USER=", username);
1606                 if (!x)
1607                         return -ENOMEM;
1608                 our_env[n_env++] = x;
1609         }
1610
1611         if (shell) {
1612                 x = strappend("SHELL=", shell);
1613                 if (!x)
1614                         return -ENOMEM;
1615                 our_env[n_env++] = x;
1616         }
1617
1618         if (!sd_id128_is_null(u->invocation_id)) {
1619                 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1620                         return -ENOMEM;
1621
1622                 our_env[n_env++] = x;
1623         }
1624
1625         if (exec_context_needs_term(c)) {
1626                 const char *tty_path, *term = NULL;
1627
1628                 tty_path = exec_context_tty_path(c);
1629
1630                 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1631                  * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1632                  * passes to PID 1 ends up all the way in the console login shown. */
1633
1634                 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1635                         term = getenv("TERM");
1636                 if (!term)
1637                         term = default_term_for_tty(tty_path);
1638
1639                 x = strappend("TERM=", term);
1640                 if (!x)
1641                         return -ENOMEM;
1642                 our_env[n_env++] = x;
1643         }
1644
1645         if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1646                 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1647                         return -ENOMEM;
1648
1649                 our_env[n_env++] = x;
1650         }
1651
1652         our_env[n_env++] = NULL;
1653         assert(n_env <= 12);
1654
1655         *ret = our_env;
1656         our_env = NULL;
1657
1658         return 0;
1659 }
1660
1661 static int build_pass_environment(const ExecContext *c, char ***ret) {
1662         _cleanup_strv_free_ char **pass_env = NULL;
1663         size_t n_env = 0, n_bufsize = 0;
1664         char **i;
1665
1666         STRV_FOREACH(i, c->pass_environment) {
1667                 _cleanup_free_ char *x = NULL;
1668                 char *v;
1669
1670                 v = getenv(*i);
1671                 if (!v)
1672                         continue;
1673                 x = strjoin(*i, "=", v);
1674                 if (!x)
1675                         return -ENOMEM;
1676                 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1677                         return -ENOMEM;
1678                 pass_env[n_env++] = x;
1679                 pass_env[n_env] = NULL;
1680                 x = NULL;
1681         }
1682
1683         *ret = pass_env;
1684         pass_env = NULL;
1685
1686         return 0;
1687 }
1688
1689 static bool exec_needs_mount_namespace(
1690                 const ExecContext *context,
1691                 const ExecParameters *params,
1692                 ExecRuntime *runtime) {
1693
1694         assert(context);
1695         assert(params);
1696
1697         if (context->root_image)
1698                 return true;
1699
1700         if (!strv_isempty(context->read_write_paths) ||
1701             !strv_isempty(context->read_only_paths) ||
1702             !strv_isempty(context->inaccessible_paths))
1703                 return true;
1704
1705         if (context->n_bind_mounts > 0)
1706                 return true;
1707
1708         if (context->mount_flags != 0)
1709                 return true;
1710
1711         if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1712                 return true;
1713
1714         if (context->private_devices ||
1715             context->protect_system != PROTECT_SYSTEM_NO ||
1716             context->protect_home != PROTECT_HOME_NO ||
1717             context->protect_kernel_tunables ||
1718             context->protect_kernel_modules ||
1719             context->protect_control_groups)
1720                 return true;
1721
1722         if (context->mount_apivfs && (context->root_image || context->root_directory))
1723                 return true;
1724
1725         return false;
1726 }
1727
1728 static int setup_private_users(uid_t uid, gid_t gid) {
1729         _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1730         _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1731         _cleanup_close_ int unshare_ready_fd = -1;
1732         _cleanup_(sigkill_waitp) pid_t pid = 0;
1733         uint64_t c = 1;
1734         siginfo_t si;
1735         ssize_t n;
1736         int r;
1737
1738         /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1739          * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1740          * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1741          * which waits for the parent to create the new user namespace while staying in the original namespace. The
1742          * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1743          * continues execution normally. */
1744
1745         if (uid != 0 && uid_is_valid(uid)) {
1746                 r = asprintf(&uid_map,
1747                              "0 0 1\n"                      /* Map root → root */
1748                              UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
1749                              uid, uid);
1750                 if (r < 0)
1751                         return -ENOMEM;
1752         } else {
1753                 uid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1754                 if (!uid_map)
1755                         return -ENOMEM;
1756         }
1757
1758         if (gid != 0 && gid_is_valid(gid)) {
1759                 r = asprintf(&gid_map,
1760                              "0 0 1\n"                      /* Map root → root */
1761                              GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
1762                              gid, gid);
1763                 if (r < 0)
1764                         return -ENOMEM;
1765         } else {
1766                 gid_map = strdup("0 0 1\n");            /* The case where the above is the same */
1767                 if (!gid_map)
1768                         return -ENOMEM;
1769         }
1770
1771         /* Create a communication channel so that the parent can tell the child when it finished creating the user
1772          * namespace. */
1773         unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1774         if (unshare_ready_fd < 0)
1775                 return -errno;
1776
1777         /* Create a communication channel so that the child can tell the parent a proper error code in case it
1778          * failed. */
1779         if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1780                 return -errno;
1781
1782         pid = fork();
1783         if (pid < 0)
1784                 return -errno;
1785
1786         if (pid == 0) {
1787                 _cleanup_close_ int fd = -1;
1788                 const char *a;
1789                 pid_t ppid;
1790
1791                 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1792                  * here, after the parent opened its own user namespace. */
1793
1794                 ppid = getppid();
1795                 errno_pipe[0] = safe_close(errno_pipe[0]);
1796
1797                 /* Wait until the parent unshared the user namespace */
1798                 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1799                         r = -errno;
1800                         goto child_fail;
1801                 }
1802
1803                 /* Disable the setgroups() system call in the child user namespace, for good. */
1804                 a = procfs_file_alloca(ppid, "setgroups");
1805                 fd = open(a, O_WRONLY|O_CLOEXEC);
1806                 if (fd < 0) {
1807                         if (errno != ENOENT) {
1808                                 r = -errno;
1809                                 goto child_fail;
1810                         }
1811
1812                         /* If the file is missing the kernel is too old, let's continue anyway. */
1813                 } else {
1814                         if (write(fd, "deny\n", 5) < 0) {
1815                                 r = -errno;
1816                                 goto child_fail;
1817                         }
1818
1819                         fd = safe_close(fd);
1820                 }
1821
1822                 /* First write the GID map */
1823                 a = procfs_file_alloca(ppid, "gid_map");
1824                 fd = open(a, O_WRONLY|O_CLOEXEC);
1825                 if (fd < 0) {
1826                         r = -errno;
1827                         goto child_fail;
1828                 }
1829                 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1830                         r = -errno;
1831                         goto child_fail;
1832                 }
1833                 fd = safe_close(fd);
1834
1835                 /* The write the UID map */
1836                 a = procfs_file_alloca(ppid, "uid_map");
1837                 fd = open(a, O_WRONLY|O_CLOEXEC);
1838                 if (fd < 0) {
1839                         r = -errno;
1840                         goto child_fail;
1841                 }
1842                 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1843                         r = -errno;
1844                         goto child_fail;
1845                 }
1846
1847                 _exit(EXIT_SUCCESS);
1848
1849         child_fail:
1850                 (void) write(errno_pipe[1], &r, sizeof(r));
1851                 _exit(EXIT_FAILURE);
1852         }
1853
1854         errno_pipe[1] = safe_close(errno_pipe[1]);
1855
1856         if (unshare(CLONE_NEWUSER) < 0)
1857                 return -errno;
1858
1859         /* Let the child know that the namespace is ready now */
1860         if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1861                 return -errno;
1862
1863         /* Try to read an error code from the child */
1864         n = read(errno_pipe[0], &r, sizeof(r));
1865         if (n < 0)
1866                 return -errno;
1867         if (n == sizeof(r)) { /* an error code was sent to us */
1868                 if (r < 0)
1869                         return r;
1870                 return -EIO;
1871         }
1872         if (n != 0) /* on success we should have read 0 bytes */
1873                 return -EIO;
1874
1875         r = wait_for_terminate(pid, &si);
1876         if (r < 0)
1877                 return r;
1878         pid = 0;
1879
1880         /* If something strange happened with the child, let's consider this fatal, too */
1881         if (si.si_code != CLD_EXITED || si.si_status != 0)
1882                 return -EIO;
1883
1884         return 0;
1885 }
1886
1887 static int setup_exec_directory(
1888                 const ExecContext *context,
1889                 const ExecParameters *params,
1890                 uid_t uid,
1891                 gid_t gid,
1892                 ExecDirectoryType type,
1893                 int *exit_status) {
1894
1895         static const int exit_status_table[_EXEC_DIRECTORY_MAX] = {
1896                 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1897                 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1898                 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1899                 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1900                 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1901         };
1902         char **rt;
1903         int r;
1904
1905         assert(context);
1906         assert(params);
1907         assert(type >= 0 && type < _EXEC_DIRECTORY_MAX);
1908         assert(exit_status);
1909
1910         if (!params->prefix[type])
1911                 return 0;
1912
1913         if (params->flags & EXEC_CHOWN_DIRECTORIES) {
1914                 if (!uid_is_valid(uid))
1915                         uid = 0;
1916                 if (!gid_is_valid(gid))
1917                         gid = 0;
1918         }
1919
1920         STRV_FOREACH(rt, context->directories[type].paths) {
1921                 _cleanup_free_ char *p;
1922
1923                 p = strjoin(params->prefix[type], "/", *rt);
1924                 if (!p) {
1925                         r = -ENOMEM;
1926                         goto fail;
1927                 }
1928
1929                 r = mkdir_parents_label(p, 0755);
1930                 if (r < 0)
1931                         goto fail;
1932
1933                 r = mkdir_p_label(p, context->directories[type].mode);
1934                 if (r < 0)
1935                         goto fail;
1936
1937                 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
1938                  * a service, and shall not be writable. */
1939                 if (type == EXEC_DIRECTORY_CONFIGURATION)
1940                         continue;
1941
1942                 r = chmod_and_chown(p, context->directories[type].mode, uid, gid);
1943                 if (r < 0)
1944                         goto fail;
1945         }
1946
1947         return 0;
1948
1949 fail:
1950         *exit_status = exit_status_table[type];
1951
1952         return r;
1953 }
1954
1955 static int setup_smack(
1956                 const ExecContext *context,
1957                 const ExecCommand *command) {
1958
1959         int r;
1960
1961         assert(context);
1962         assert(command);
1963
1964         if (context->smack_process_label) {
1965                 r = mac_smack_apply_pid(0, context->smack_process_label);
1966                 if (r < 0)
1967                         return r;
1968         }
1969 #ifdef SMACK_DEFAULT_PROCESS_LABEL
1970         else {
1971                 _cleanup_free_ char *exec_label = NULL;
1972
1973                 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
1974                 if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
1975                         return r;
1976
1977                 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
1978                 if (r < 0)
1979                         return r;
1980         }
1981 #endif
1982
1983         return 0;
1984 }
1985
1986 static int compile_read_write_paths(
1987                 const ExecContext *context,
1988                 const ExecParameters *params,
1989                 char ***ret) {
1990
1991         _cleanup_strv_free_ char **l = NULL;
1992         char **rt;
1993         ExecDirectoryType i;
1994
1995         /* Compile the list of writable paths. This is the combination of
1996          * the explicitly configured paths, plus all runtime directories. */
1997
1998         if (strv_isempty(context->read_write_paths)) {
1999                 for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
2000                         if (!strv_isempty(context->directories[i].paths))
2001                                 break;
2002
2003                 if (i == _EXEC_DIRECTORY_MAX) {
2004                         *ret = NULL; /* NOP if neither is set */
2005                         return 0;
2006                 }
2007         }
2008
2009         l = strv_copy(context->read_write_paths);
2010         if (!l)
2011                 return -ENOMEM;
2012
2013         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++) {
2014                 if (!params->prefix[i])
2015                         continue;
2016
2017                 STRV_FOREACH(rt, context->directories[i].paths) {
2018                         char *s;
2019
2020                         s = strjoin(params->prefix[i], "/", *rt);
2021                         if (!s)
2022                                 return -ENOMEM;
2023
2024                         if (strv_consume(&l, s) < 0)
2025                                 return -ENOMEM;
2026                 }
2027         }
2028
2029         *ret = l;
2030         l = NULL;
2031
2032         return 0;
2033 }
2034
2035 static int apply_mount_namespace(
2036                 Unit *u,
2037                 ExecCommand *command,
2038                 const ExecContext *context,
2039                 const ExecParameters *params,
2040                 ExecRuntime *runtime) {
2041
2042         _cleanup_strv_free_ char **rw = NULL;
2043         char *tmp = NULL, *var = NULL;
2044         const char *root_dir = NULL, *root_image = NULL;
2045         NameSpaceInfo ns_info = {
2046                 .ignore_protect_paths = false,
2047                 .private_dev = context->private_devices,
2048                 .protect_control_groups = context->protect_control_groups,
2049                 .protect_kernel_tunables = context->protect_kernel_tunables,
2050                 .protect_kernel_modules = context->protect_kernel_modules,
2051                 .mount_apivfs = context->mount_apivfs,
2052         };
2053         bool needs_sandboxing;
2054         int r;
2055
2056         assert(context);
2057
2058         /* The runtime struct only contains the parent of the private /tmp,
2059          * which is non-accessible to world users. Inside of it there's a /tmp
2060          * that is sticky, and that's the one we want to use here. */
2061
2062         if (context->private_tmp && runtime) {
2063                 if (runtime->tmp_dir)
2064                         tmp = strjoina(runtime->tmp_dir, "/tmp");
2065                 if (runtime->var_tmp_dir)
2066                         var = strjoina(runtime->var_tmp_dir, "/tmp");
2067         }
2068
2069         r = compile_read_write_paths(context, params, &rw);
2070         if (r < 0)
2071                 return r;
2072
2073         if (params->flags & EXEC_APPLY_CHROOT) {
2074                 root_image = context->root_image;
2075
2076                 if (!root_image)
2077                         root_dir = context->root_directory;
2078         }
2079
2080         /*
2081          * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2082          * sandbox info, otherwise enforce it, don't ignore protected paths and
2083          * fail if we are enable to apply the sandbox inside the mount namespace.
2084          */
2085         if (!context->dynamic_user && root_dir)
2086                 ns_info.ignore_protect_paths = true;
2087
2088         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2089
2090         r = setup_namespace(root_dir, root_image,
2091                             &ns_info, rw,
2092                             needs_sandboxing ? context->read_only_paths : NULL,
2093                             needs_sandboxing ? context->inaccessible_paths : NULL,
2094                             context->bind_mounts,
2095                             context->n_bind_mounts,
2096                             tmp,
2097                             var,
2098                             needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2099                             needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2100                             context->mount_flags,
2101                             DISSECT_IMAGE_DISCARD_ON_LOOP);
2102
2103         /* If we couldn't set up the namespace this is probably due to a
2104          * missing capability. In this case, silently proceeed. */
2105         if (IN_SET(r, -EPERM, -EACCES)) {
2106                 log_open();
2107                 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2108                 log_close();
2109                 r = 0;
2110         }
2111
2112         return r;
2113 }
2114
2115 static int apply_working_directory(
2116                 const ExecContext *context,
2117                 const ExecParameters *params,
2118                 const char *home,
2119                 const bool needs_mount_ns,
2120                 int *exit_status) {
2121
2122         const char *d, *wd;
2123
2124         assert(context);
2125         assert(exit_status);
2126
2127         if (context->working_directory_home) {
2128
2129                 if (!home) {
2130                         *exit_status = EXIT_CHDIR;
2131                         return -ENXIO;
2132                 }
2133
2134                 wd = home;
2135
2136         } else if (context->working_directory)
2137                 wd = context->working_directory;
2138         else
2139                 wd = "/";
2140
2141         if (params->flags & EXEC_APPLY_CHROOT) {
2142                 if (!needs_mount_ns && context->root_directory)
2143                         if (chroot(context->root_directory) < 0) {
2144                                 *exit_status = EXIT_CHROOT;
2145                                 return -errno;
2146                         }
2147
2148                 d = wd;
2149         } else
2150                 d = prefix_roota(context->root_directory, wd);
2151
2152         if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2153                 *exit_status = EXIT_CHDIR;
2154                 return -errno;
2155         }
2156
2157         return 0;
2158 }
2159
2160 static int setup_keyring(Unit *u, const ExecParameters *p, uid_t uid, gid_t gid) {
2161         key_serial_t keyring;
2162
2163         assert(u);
2164         assert(p);
2165
2166         /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2167          * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2168          * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2169          * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2170          * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2171          * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2172
2173         if (!(p->flags & EXEC_NEW_KEYRING))
2174                 return 0;
2175
2176         keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2177         if (keyring == -1) {
2178                 if (errno == ENOSYS)
2179                         log_debug_errno(errno, "Kernel keyring not supported, ignoring.");
2180                 else if (IN_SET(errno, EACCES, EPERM))
2181                         log_debug_errno(errno, "Kernel keyring access prohibited, ignoring.");
2182                 else if (errno == EDQUOT)
2183                         log_debug_errno(errno, "Out of kernel keyrings to allocate, ignoring.");
2184                 else
2185                         return log_error_errno(errno, "Setting up kernel keyring failed: %m");
2186
2187                 return 0;
2188         }
2189
2190         /* Populate they keyring with the invocation ID by default. */
2191         if (!sd_id128_is_null(u->invocation_id)) {
2192                 key_serial_t key;
2193
2194                 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2195                 if (key == -1)
2196                         log_debug_errno(errno, "Failed to add invocation ID to keyring, ignoring: %m");
2197                 else {
2198                         if (keyctl(KEYCTL_SETPERM, key,
2199                                    KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2200                                    KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2201                                 return log_error_errno(errno, "Failed to restrict invocation ID permission: %m");
2202                 }
2203         }
2204
2205         /* And now, make the keyring owned by the service's user */
2206         if (uid_is_valid(uid) || gid_is_valid(gid))
2207                 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2208                         return log_error_errno(errno, "Failed to change ownership of session keyring: %m");
2209
2210         return 0;
2211 }
2212
/* Appends the valid (i.e. non-negative) file descriptors of the given pair to the array, advancing the
 * element counter *n for each one added. A NULL pair is a NOP. The caller has to make sure the array
 * has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned i;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
2225
2226 static int close_remaining_fds(
2227                 const ExecParameters *params,
2228                 ExecRuntime *runtime,
2229                 DynamicCreds *dcreds,
2230                 int user_lookup_fd,
2231                 int socket_fd,
2232                 int *fds, unsigned n_fds) {
2233
2234         unsigned n_dont_close = 0;
2235         int dont_close[n_fds + 12];
2236
2237         assert(params);
2238
2239         if (params->stdin_fd >= 0)
2240                 dont_close[n_dont_close++] = params->stdin_fd;
2241         if (params->stdout_fd >= 0)
2242                 dont_close[n_dont_close++] = params->stdout_fd;
2243         if (params->stderr_fd >= 0)
2244                 dont_close[n_dont_close++] = params->stderr_fd;
2245
2246         if (socket_fd >= 0)
2247                 dont_close[n_dont_close++] = socket_fd;
2248         if (n_fds > 0) {
2249                 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2250                 n_dont_close += n_fds;
2251         }
2252
2253         if (runtime)
2254                 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2255
2256         if (dcreds) {
2257                 if (dcreds->user)
2258                         append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2259                 if (dcreds->group)
2260                         append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2261         }
2262
2263         if (user_lookup_fd >= 0)
2264                 dont_close[n_dont_close++] = user_lookup_fd;
2265
2266         return close_all_fds(dont_close, n_dont_close);
2267 }
2268
2269 static int send_user_lookup(
2270                 Unit *unit,
2271                 int user_lookup_fd,
2272                 uid_t uid,
2273                 gid_t gid) {
2274
2275         assert(unit);
2276
2277         /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2278          * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2279          * specified. */
2280
2281         if (user_lookup_fd < 0)
2282                 return 0;
2283
2284         if (!uid_is_valid(uid) && !gid_is_valid(gid))
2285                 return 0;
2286
2287         if (writev(user_lookup_fd,
2288                (struct iovec[]) {
2289                            { .iov_base = &uid, .iov_len = sizeof(uid) },
2290                            { .iov_base = &gid, .iov_len = sizeof(gid) },
2291                            { .iov_base = unit->id, .iov_len = strlen(unit->id) }}, 3) < 0)
2292                 return -errno;
2293
2294         return 0;
2295 }
2296
2297 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2298         int r;
2299
2300         assert(c);
2301         assert(home);
2302         assert(buf);
2303
2304         /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2305
2306         if (*home)
2307                 return 0;
2308
2309         if (!c->working_directory_home)
2310                 return 0;
2311
2312         if (uid == 0) {
2313                 /* Hardcode /root as home directory for UID 0 */
2314                 *home = "/root";
2315                 return 1;
2316         }
2317
2318         r = get_home_dir(buf);
2319         if (r < 0)
2320                 return r;
2321
2322         *home = *buf;
2323         return 1;
2324 }
2325
2326 static int exec_child(
2327                 Unit *unit,
2328                 ExecCommand *command,
2329                 const ExecContext *context,
2330                 const ExecParameters *params,
2331                 ExecRuntime *runtime,
2332                 DynamicCreds *dcreds,
2333                 char **argv,
2334                 int socket_fd,
2335                 int named_iofds[3],
2336                 int *fds,
2337                 unsigned n_storage_fds,
2338                 unsigned n_socket_fds,
2339                 char **files_env,
2340                 int user_lookup_fd,
2341                 int *exit_status,
2342                 char **error_message) {
2343
2344         _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2345         _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2346         _cleanup_free_ gid_t *supplementary_gids = NULL;
2347         const char *username = NULL, *groupname = NULL;
2348         const char *home = NULL, *shell = NULL;
2349         dev_t journal_stream_dev = 0;
2350         ino_t journal_stream_ino = 0;
2351         bool needs_sandboxing,          /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2352                 needs_setuid,           /* Do we need to do the actual setresuid()/setresgid() calls? */
2353                 needs_mount_namespace,  /* Do we need to set up a mount namespace for this kernel? */
2354                 needs_ambient_hack;     /* Do we need to apply the ambient capabilities hack? */
2355 #ifdef HAVE_SELINUX
2356         bool use_selinux = false;
2357 #endif
2358 #ifdef HAVE_SMACK
2359         bool use_smack = false;
2360 #endif
2361 #ifdef HAVE_APPARMOR
2362         bool use_apparmor = false;
2363 #endif
2364         uid_t uid = UID_INVALID;
2365         gid_t gid = GID_INVALID;
2366         int i, r, ngids = 0;
2367         unsigned n_fds;
2368         ExecDirectoryType dt;
2369         int secure_bits;
2370
2371         assert(unit);
2372         assert(command);
2373         assert(context);
2374         assert(params);
2375         assert(exit_status);
2376         assert(error_message);
2377         /* We don't always set error_message, hence it must be initialized */
2378         assert(*error_message == NULL);
2379
2380         rename_process_from_path(command->path);
2381
2382         /* We reset exactly these signals, since they are the
2383          * only ones we set to SIG_IGN in the main daemon. All
2384          * others we leave untouched because we set them to
2385          * SIG_DFL or a valid handler initially, both of which
2386          * will be demoted to SIG_DFL. */
2387         (void) default_signals(SIGNALS_CRASH_HANDLER,
2388                                SIGNALS_IGNORE, -1);
2389
2390         if (context->ignore_sigpipe)
2391                 (void) ignore_signals(SIGPIPE, -1);
2392
2393         r = reset_signal_mask();
2394         if (r < 0) {
2395                 *exit_status = EXIT_SIGNAL_MASK;
2396                 *error_message = strdup("Failed to reset signal mask");
2397                 /* If strdup fails, here and below, we will just print the generic error message. */
2398                 return r;
2399         }
2400
2401         if (params->idle_pipe)
2402                 do_idle_pipe_dance(params->idle_pipe);
2403
2404         /* Close sockets very early to make sure we don't
2405          * block init reexecution because it cannot bind its
2406          * sockets */
2407
2408         log_forget_fds();
2409
2410         n_fds = n_storage_fds + n_socket_fds;
2411         r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2412         if (r < 0) {
2413                 *exit_status = EXIT_FDS;
2414                 *error_message = strdup("Failed to close remaining fds");
2415                 return r;
2416         }
2417
2418         if (!context->same_pgrp)
2419                 if (setsid() < 0) {
2420                         *exit_status = EXIT_SETSID;
2421                         return -errno;
2422                 }
2423
2424         exec_context_tty_reset(context, params);
2425
2426         if (unit_shall_confirm_spawn(unit)) {
2427                 const char *vc = params->confirm_spawn;
2428                 _cleanup_free_ char *cmdline = NULL;
2429
2430                 cmdline = exec_command_line(argv);
2431                 if (!cmdline) {
2432                         *exit_status = EXIT_CONFIRM;
2433                         return -ENOMEM;
2434                 }
2435
2436                 r = ask_for_confirmation(vc, unit, cmdline);
2437                 if (r != CONFIRM_EXECUTE) {
2438                         if (r == CONFIRM_PRETEND_SUCCESS) {
2439                                 *exit_status = EXIT_SUCCESS;
2440                                 return 0;
2441                         }
2442                         *exit_status = EXIT_CONFIRM;
2443                         *error_message = strdup("Execution cancelled");
2444                         return -ECANCELED;
2445                 }
2446         }
2447
2448         if (context->dynamic_user && dcreds) {
2449
2450                 /* Make sure we bypass our own NSS module for any NSS checks */
2451                 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2452                         *exit_status = EXIT_USER;
2453                         *error_message = strdup("Failed to update environment");
2454                         return -errno;
2455                 }
2456
2457                 r = dynamic_creds_realize(dcreds, &uid, &gid);
2458                 if (r < 0) {
2459                         *exit_status = EXIT_USER;
2460                         *error_message = strdup("Failed to update dynamic user credentials");
2461                         return r;
2462                 }
2463
2464                 if (!uid_is_valid(uid)) {
2465                         *exit_status = EXIT_USER;
2466                         (void) asprintf(error_message, "UID validation failed for \""UID_FMT"\"", uid);
2467                         /* If asprintf fails, here and below, we will just print the generic error message. */
2468                         return -ESRCH;
2469                 }
2470
2471                 if (!gid_is_valid(gid)) {
2472                         *exit_status = EXIT_USER;
2473                         (void) asprintf(error_message, "GID validation failed for \""GID_FMT"\"", gid);
2474                         return -ESRCH;
2475                 }
2476
2477                 if (dcreds->user)
2478                         username = dcreds->user->name;
2479
2480         } else {
2481                 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2482                 if (r < 0) {
2483                         *exit_status = EXIT_USER;
2484                         *error_message = strdup("Failed to determine user credentials");
2485                         return r;
2486                 }
2487
2488                 r = get_fixed_group(context, &groupname, &gid);
2489                 if (r < 0) {
2490                         *exit_status = EXIT_GROUP;
2491                         *error_message = strdup("Failed to determine group credentials");
2492                         return r;
2493                 }
2494         }
2495
2496         /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2497         r = get_supplementary_groups(context, username, groupname, gid,
2498                                      &supplementary_gids, &ngids);
2499         if (r < 0) {
2500                 *exit_status = EXIT_GROUP;
2501                 *error_message = strdup("Failed to determine supplementary groups");
2502                 return r;
2503         }
2504
2505         r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2506         if (r < 0) {
2507                 *exit_status = EXIT_USER;
2508                 *error_message = strdup("Failed to send user credentials to PID1");
2509                 return r;
2510         }
2511
2512         user_lookup_fd = safe_close(user_lookup_fd);
2513
2514         r = acquire_home(context, uid, &home, &home_buffer);
2515         if (r < 0) {
2516                 *exit_status = EXIT_CHDIR;
2517                 *error_message = strdup("Failed to determine $HOME for user");
2518                 return r;
2519         }
2520
2521         /* If a socket is connected to STDIN/STDOUT/STDERR, we
2522          * must sure to drop O_NONBLOCK */
2523         if (socket_fd >= 0)
2524                 (void) fd_nonblock(socket_fd, false);
2525
2526         r = setup_input(context, params, socket_fd, named_iofds);
2527         if (r < 0) {
2528                 *exit_status = EXIT_STDIN;
2529                 *error_message = strdup("Failed to set up stdin");
2530                 return r;
2531         }
2532
2533         r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2534         if (r < 0) {
2535                 *exit_status = EXIT_STDOUT;
2536                 *error_message = strdup("Failed to set up stdout");
2537                 return r;
2538         }
2539
2540         r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2541         if (r < 0) {
2542                 *exit_status = EXIT_STDERR;
2543                 *error_message = strdup("Failed to set up stderr");
2544                 return r;
2545         }
2546
2547         if (params->cgroup_path) {
2548                 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2549                 if (r < 0) {
2550                         *exit_status = EXIT_CGROUP;
2551                         (void) asprintf(error_message, "Failed to attach to cgroup %s", params->cgroup_path);
2552                         return r;
2553                 }
2554         }
2555
2556         if (context->oom_score_adjust_set) {
2557                 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2558
2559                 /* When we can't make this change due to EPERM, then
2560                  * let's silently skip over it. User namespaces
2561                  * prohibit write access to this file, and we
2562                  * shouldn't trip up over that. */
2563
2564                 sprintf(t, "%i", context->oom_score_adjust);
2565                 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2566                 if (r == -EPERM || r == -EACCES) {
2567                         log_open();
2568                         log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2569                         log_close();
2570                 } else if (r < 0) {
2571                         *exit_status = EXIT_OOM_ADJUST;
2572                         *error_message = strdup("Failed to write /proc/self/oom_score_adj");
2573                         return -errno;
2574                 }
2575         }
2576
2577         if (context->nice_set)
2578                 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2579                         *exit_status = EXIT_NICE;
2580                         return -errno;
2581                 }
2582
2583         if (context->cpu_sched_set) {
2584                 struct sched_param param = {
2585                         .sched_priority = context->cpu_sched_priority,
2586                 };
2587
2588                 r = sched_setscheduler(0,
2589                                        context->cpu_sched_policy |
2590                                        (context->cpu_sched_reset_on_fork ?
2591                                         SCHED_RESET_ON_FORK : 0),
2592                                        &param);
2593                 if (r < 0) {
2594                         *exit_status = EXIT_SETSCHEDULER;
2595                         return -errno;
2596                 }
2597         }
2598
2599         if (context->cpuset)
2600                 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2601                         *exit_status = EXIT_CPUAFFINITY;
2602                         return -errno;
2603                 }
2604
2605         if (context->ioprio_set)
2606                 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2607                         *exit_status = EXIT_IOPRIO;
2608                         return -errno;
2609                 }
2610
2611         if (context->timer_slack_nsec != NSEC_INFINITY)
2612                 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2613                         *exit_status = EXIT_TIMERSLACK;
2614                         return -errno;
2615                 }
2616
2617         if (context->personality != PERSONALITY_INVALID) {
2618                 r = safe_personality(context->personality);
2619                 if (r < 0) {
2620                         *exit_status = EXIT_PERSONALITY;
2621                         return r;
2622                 }
2623         }
2624
2625         if (context->utmp_id)
2626                 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2627                                       context->tty_path,
2628                                       context->utmp_mode == EXEC_UTMP_INIT  ? INIT_PROCESS :
2629                                       context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2630                                       USER_PROCESS,
2631                                       username);
2632
2633         if (context->user) {
2634                 r = chown_terminal(STDIN_FILENO, uid);
2635                 if (r < 0) {
2636                         *exit_status = EXIT_STDIN;
2637                         return r;
2638                 }
2639         }
2640
2641         /* If delegation is enabled we'll pass ownership of the cgroup
2642          * (but only in systemd's own controller hierarchy!) to the
2643          * user of the new process. */
2644         if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
2645                 r = cg_set_task_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0644, uid, gid);
2646                 if (r < 0) {
2647                         *exit_status = EXIT_CGROUP;
2648                         return r;
2649                 }
2650
2651
2652                 r = cg_set_group_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, 0755, uid, gid);
2653                 if (r < 0) {
2654                         *exit_status = EXIT_CGROUP;
2655                         return r;
2656                 }
2657         }
2658
2659         for (dt = 0; dt < _EXEC_DIRECTORY_MAX; dt++) {
2660                 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
2661                 if (r < 0)
2662                         return r;
2663         }
2664
2665         r = build_environment(
2666                         unit,
2667                         context,
2668                         params,
2669                         n_fds,
2670                         home,
2671                         username,
2672                         shell,
2673                         journal_stream_dev,
2674                         journal_stream_ino,
2675                         &our_env);
2676         if (r < 0) {
2677                 *exit_status = EXIT_MEMORY;
2678                 return r;
2679         }
2680
2681         r = build_pass_environment(context, &pass_env);
2682         if (r < 0) {
2683                 *exit_status = EXIT_MEMORY;
2684                 return r;
2685         }
2686
2687         accum_env = strv_env_merge(5,
2688                                    params->environment,
2689                                    our_env,
2690                                    pass_env,
2691                                    context->environment,
2692                                    files_env,
2693                                    NULL);
2694         if (!accum_env) {
2695                 *exit_status = EXIT_MEMORY;
2696                 return -ENOMEM;
2697         }
2698         accum_env = strv_env_clean(accum_env);
2699
2700         (void) umask(context->umask);
2701
2702         r = setup_keyring(unit, params, uid, gid);
2703         if (r < 0) {
2704                 *exit_status = EXIT_KEYRING;
2705                 return r;
2706         }
2707
2708         /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
2709         needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2710
2711         /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
2712         needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
2713
2714         /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
2715         if (needs_ambient_hack)
2716                 needs_setuid = false;
2717         else
2718                 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
2719
2720         if (needs_sandboxing) {
2721                 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
2722                  * present. The actual MAC context application will happen later, as late as possible, to avoid
2723                  * impacting our own code paths. */
2724
2725 #ifdef HAVE_SELINUX
2726                 use_selinux = mac_selinux_use();
2727 #endif
2728 #ifdef HAVE_SMACK
2729                 use_smack = mac_smack_use();
2730 #endif
2731 #ifdef HAVE_APPARMOR
2732                 use_apparmor = mac_apparmor_use();
2733 #endif
2734         }
2735
2736         if (needs_setuid) {
2737                 if (context->pam_name && username) {
2738                         r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
2739                         if (r < 0) {
2740                                 *exit_status = EXIT_PAM;
2741                                 return r;
2742                         }
2743                 }
2744         }
2745
2746         if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
2747                 r = setup_netns(runtime->netns_storage_socket);
2748                 if (r < 0) {
2749                         *exit_status = EXIT_NETWORK;
2750                         return r;
2751                 }
2752         }
2753
2754         needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
2755         if (needs_mount_namespace) {
2756                 r = apply_mount_namespace(unit, command, context, params, runtime);
2757                 if (r < 0) {
2758                         *exit_status = EXIT_NAMESPACE;
2759                         return r;
2760                 }
2761         }
2762
2763         /* Apply just after mount namespace setup */
2764         r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
2765         if (r < 0)
2766                 return r;
2767
2768         /* Drop groups as early as possible */
2769         if (needs_setuid) {
2770                 r = enforce_groups(context, gid, supplementary_gids, ngids);
2771                 if (r < 0) {
2772                         *exit_status = EXIT_GROUP;
2773                         return r;
2774                 }
2775         }
2776
2777         if (needs_sandboxing) {
2778 #ifdef HAVE_SELINUX
2779                 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
2780                         r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
2781                         if (r < 0) {
2782                                 *exit_status = EXIT_SELINUX_CONTEXT;
2783                                 return r;
2784                         }
2785                 }
2786 #endif
2787
2788                 if (context->private_users) {
2789                         r = setup_private_users(uid, gid);
2790                         if (r < 0) {
2791                                 *exit_status = EXIT_USER;
2792                                 return r;
2793                         }
2794                 }
2795         }
2796
2797         /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
2798          * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
2799          * was needed to upload the policy and can now be closed as well. */
2800         r = close_all_fds(fds, n_fds);
2801         if (r >= 0)
2802                 r = shift_fds(fds, n_fds);
2803         if (r >= 0)
2804                 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
2805         if (r < 0) {
2806                 *exit_status = EXIT_FDS;
2807                 return r;
2808         }
2809
2810         secure_bits = context->secure_bits;
2811
2812         if (needs_sandboxing) {
2813                 uint64_t bset;
2814
2815                 for (i = 0; i < _RLIMIT_MAX; i++) {
2816
2817                         if (!context->rlimit[i])
2818                                 continue;
2819
2820                         r = setrlimit_closest(i, context->rlimit[i]);
2821                         if (r < 0) {
2822                                 *exit_status = EXIT_LIMITS;
2823                                 return r;
2824                         }
2825                 }
2826
2827                 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
2828                 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
2829                         if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
2830                                 *exit_status = EXIT_LIMITS;
2831                                 return -errno;
2832                         }
2833                 }
2834
2835                 bset = context->capability_bounding_set;
2836                 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
2837                  * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
2838                  * instead of us doing that */
2839                 if (needs_ambient_hack)
2840                         bset |= (UINT64_C(1) << CAP_SETPCAP) |
2841                                 (UINT64_C(1) << CAP_SETUID) |
2842                                 (UINT64_C(1) << CAP_SETGID);
2843
2844                 if (!cap_test_all(bset)) {
2845                         r = capability_bounding_set_drop(bset, false);
2846                         if (r < 0) {
2847                                 *exit_status = EXIT_CAPABILITIES;
2848                                 *error_message = strdup("Failed to drop capabilities");
2849                                 return r;
2850                         }
2851                 }
2852
2853                 /* This is done before enforce_user, but ambient set
2854                  * does not survive over setresuid() if keep_caps is not set. */
2855                 if (!needs_ambient_hack &&
2856                     context->capability_ambient_set != 0) {
2857                         r = capability_ambient_set_apply(context->capability_ambient_set, true);
2858                         if (r < 0) {
2859                                 *exit_status = EXIT_CAPABILITIES;
2860                                 *error_message = strdup("Failed to apply ambient capabilities (before UID change)");
2861                                 return r;
2862                         }
2863                 }
2864         }
2865
2866         if (needs_setuid) {
2867                 if (context->user) {
2868                         r = enforce_user(context, uid);
2869                         if (r < 0) {
2870                                 *exit_status = EXIT_USER;
2871                                 (void) asprintf(error_message, "Failed to change UID to "UID_FMT, uid);
2872                                 return r;
2873                         }
2874
2875                         if (!needs_ambient_hack &&
2876                             context->capability_ambient_set != 0) {
2877
2878                                 /* Fix the ambient capabilities after user change. */
2879                                 r = capability_ambient_set_apply(context->capability_ambient_set, false);
2880                                 if (r < 0) {
2881                                         *exit_status = EXIT_CAPABILITIES;
2882                                         *error_message = strdup("Failed to apply ambient capabilities (after UID change)");
2883                                         return r;
2884                                 }
2885
2886                                 /* If we were asked to change user and ambient capabilities
2887                                  * were requested, we had to add keep-caps to the securebits
2888                                  * so that we would maintain the inherited capability set
2889                                  * through the setresuid(). Make sure that the bit is added
2890                                  * also to the context secure_bits so that we don't try to
2891                                  * drop the bit away next. */
2892
2893                                 secure_bits |= 1<<SECURE_KEEP_CAPS;
2894                         }
2895                 }
2896         }
2897
2898         if (needs_sandboxing) {
2899                 /* Apply the MAC contexts late, but before seccomp syscall filtering, as those should really be last to
2900                  * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
2901                  * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
2902                  * are restricted. */
2903
2904 #ifdef HAVE_SELINUX
2905                 if (use_selinux) {
2906                         char *exec_context = mac_selinux_context_net ?: context->selinux_context;
2907
2908                         if (exec_context) {
2909                                 r = setexeccon(exec_context);
2910                                 if (r < 0) {
2911                                         *exit_status = EXIT_SELINUX_CONTEXT;
2912                                         (void) asprintf(error_message, "Failed to set SELinux context to %s", exec_context);
2913                                         return r;
2914                                 }
2915                         }
2916                 }
2917 #endif
2918
2919 #ifdef HAVE_SMACK
2920                 if (use_smack) {
2921                         r = setup_smack(context, command);
2922                         if (r < 0) {
2923                                 *exit_status = EXIT_SMACK_PROCESS_LABEL;
2924                                 *error_message = strdup("Failed to set SMACK process label");
2925                                 return r;
2926                         }
2927                 }
2928 #endif
2929
2930 #ifdef HAVE_APPARMOR
2931                 if (use_apparmor && context->apparmor_profile) {
2932                         r = aa_change_onexec(context->apparmor_profile);
2933                         if (r < 0 && !context->apparmor_profile_ignore) {
2934                                 *exit_status = EXIT_APPARMOR_PROFILE;
2935                                 (void) asprintf(error_message,
2936                                                 "Failed to prepare AppArmor profile change to %s",
2937                                                 context->apparmor_profile);
2938                                 return -errno;
2939                         }
2940                 }
2941 #endif
2942
2943                 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
2944                  * we'll try not to call PR_SET_SECUREBITS unless necessary. */
2945                 if (prctl(PR_GET_SECUREBITS) != secure_bits)
2946                         if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
2947                                 *exit_status = EXIT_SECUREBITS;
2948                                 *error_message = strdup("Failed to set secure bits");
2949                                 return -errno;
2950                         }
2951
2952                 if (context_has_no_new_privileges(context))
2953                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2954                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
2955                                 *error_message = strdup("Failed to disable new privileges");
2956                                 return -errno;
2957                         }
2958
2959 #ifdef HAVE_SECCOMP
2960                 r = apply_address_families(unit, context);
2961                 if (r < 0) {
2962                         *exit_status = EXIT_ADDRESS_FAMILIES;
2963                         *error_message = strdup("Failed to restrict address families");
2964                         return r;
2965                 }
2966
2967                 r = apply_memory_deny_write_execute(unit, context);
2968                 if (r < 0) {
2969                         *exit_status = EXIT_SECCOMP;
2970                         *error_message = strdup("Failed to disable writing to executable memory");
2971                         return r;
2972                 }
2973
2974                 r = apply_restrict_realtime(unit, context);
2975                 if (r < 0) {
2976                         *exit_status = EXIT_SECCOMP;
2977                         *error_message = strdup("Failed to apply realtime restrictions");
2978                         return r;
2979                 }
2980
2981                 r = apply_restrict_namespaces(unit, context);
2982                 if (r < 0) {
2983                         *exit_status = EXIT_SECCOMP;
2984                         *error_message = strdup("Failed to apply namespace restrictions");
2985                         return r;
2986                 }
2987
2988                 r = apply_protect_sysctl(unit, context);
2989                 if (r < 0) {
2990                         *exit_status = EXIT_SECCOMP;
2991                         *error_message = strdup("Failed to apply sysctl restrictions");
2992                         return r;
2993                 }
2994
2995                 r = apply_protect_kernel_modules(unit, context);
2996                 if (r < 0) {
2997                         *exit_status = EXIT_SECCOMP;
2998                         *error_message = strdup("Failed to apply module loading restrictions");
2999                         return r;
3000                 }
3001
3002                 r = apply_private_devices(unit, context);
3003                 if (r < 0) {
3004                         *exit_status = EXIT_SECCOMP;
3005                         *error_message = strdup("Failed to set up private devices");
3006                         return r;
3007                 }
3008
3009                 r = apply_syscall_archs(unit, context);
3010                 if (r < 0) {
3011                         *exit_status = EXIT_SECCOMP;
3012                         *error_message = strdup("Failed to apply syscall architecture restrictions");
3013                         return r;
3014                 }
3015
3016                 r = apply_lock_personality(unit, context);
3017                 if (r < 0) {
3018                         *exit_status = EXIT_SECCOMP;
3019                         *error_message = strdup("Failed to lock personalities");
3020                         return r;
3021                 }
3022
3023                 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3024                  * by the filter as little as possible. */
3025                 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3026                 if (r < 0) {
3027                         *exit_status = EXIT_SECCOMP;
3028                         *error_message = strdup("Failed to apply syscall filters");
3029                         return r;
3030                 }
3031 #endif
3032         }
3033
3034         final_argv = replace_env_argv(argv, accum_env);
3035         if (!final_argv) {
3036                 *exit_status = EXIT_MEMORY;
3037                 *error_message = strdup("Failed to prepare process arguments");
3038                 return -ENOMEM;
3039         }
3040
3041         if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3042                 _cleanup_free_ char *line;
3043
3044                 line = exec_command_line(final_argv);
3045                 if (line) {
3046                         log_open();
3047                         log_struct(LOG_DEBUG,
3048                                    "EXECUTABLE=%s", command->path,
3049                                    LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3050                                    LOG_UNIT_ID(unit),
3051                                    NULL);
3052                         log_close();
3053                 }
3054         }
3055
3056         execve(command->path, final_argv, accum_env);
3057         *exit_status = EXIT_EXEC;
3058         return -errno;
3059 }
3060
3061 int exec_spawn(Unit *unit,
3062                ExecCommand *command,
3063                const ExecContext *context,
3064                const ExecParameters *params,
3065                ExecRuntime *runtime,
3066                DynamicCreds *dcreds,
3067                pid_t *ret) {
3068
3069         _cleanup_strv_free_ char **files_env = NULL;
3070         int *fds = NULL;
3071         unsigned n_storage_fds = 0, n_socket_fds = 0;
3072         _cleanup_free_ char *line = NULL;
3073         int socket_fd, r;
3074         int named_iofds[3] = { -1, -1, -1 };
3075         char **argv;
3076         pid_t pid;
3077
3078         assert(unit);
3079         assert(command);
3080         assert(context);
3081         assert(ret);
3082         assert(params);
3083         assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3084
3085         if (context->std_input == EXEC_INPUT_SOCKET ||
3086             context->std_output == EXEC_OUTPUT_SOCKET ||
3087             context->std_error == EXEC_OUTPUT_SOCKET) {
3088
3089                 if (params->n_socket_fds > 1) {
3090                         log_unit_error(unit, "Got more than one socket.");
3091                         return -EINVAL;
3092                 }
3093
3094                 if (params->n_socket_fds == 0) {
3095                         log_unit_error(unit, "Got no socket.");
3096                         return -EINVAL;
3097                 }
3098
3099                 socket_fd = params->fds[0];
3100         } else {
3101                 socket_fd = -1;
3102                 fds = params->fds;
3103                 n_storage_fds = params->n_storage_fds;
3104                 n_socket_fds = params->n_socket_fds;
3105         }
3106
3107         r = exec_context_named_iofds(unit, context, params, named_iofds);
3108         if (r < 0)
3109                 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3110
3111         r = exec_context_load_environment(unit, context, &files_env);
3112         if (r < 0)
3113                 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3114
3115         argv = params->argv ?: command->argv;
3116         line = exec_command_line(argv);
3117         if (!line)
3118                 return log_oom();
3119
3120         log_struct(LOG_DEBUG,
3121                    LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3122                    "EXECUTABLE=%s", command->path,
3123                    LOG_UNIT_ID(unit),
3124                    NULL);
3125         pid = fork();
3126         if (pid < 0)
3127                 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3128
3129         if (pid == 0) {
3130                 int exit_status;
3131                 _cleanup_free_ char *error_message = NULL;
3132
3133                 r = exec_child(unit,
3134                                command,
3135                                context,
3136                                params,
3137                                runtime,
3138                                dcreds,
3139                                argv,
3140                                socket_fd,
3141                                named_iofds,
3142                                fds,
3143                                n_storage_fds,
3144                                n_socket_fds,
3145                                files_env,
3146                                unit->manager->user_lookup_fds[1],
3147                                &exit_status,
3148                                &error_message);
3149                 if (r < 0) {
3150                         log_open();
3151                         if (error_message)
3152                                 log_struct_errno(LOG_ERR, r,
3153                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3154                                                  LOG_UNIT_ID(unit),
3155                                                  LOG_UNIT_MESSAGE(unit, "%s: %m",
3156                                                                   error_message),
3157                                                  "EXECUTABLE=%s", command->path,
3158                                                  NULL);
3159                         else if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE))
3160                                 log_struct_errno(LOG_INFO, r,
3161                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3162                                                  LOG_UNIT_ID(unit),
3163                                                  LOG_UNIT_MESSAGE(unit, "Skipped spawning %s: %m",
3164                                                                   command->path),
3165                                                  "EXECUTABLE=%s", command->path,
3166                                                  NULL);
3167                         else
3168                                 log_struct_errno(LOG_ERR, r,
3169                                                  "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3170                                                  LOG_UNIT_ID(unit),
3171                                                  LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3172                                                                   exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3173                                                                   command->path),
3174                                                  "EXECUTABLE=%s", command->path,
3175                                                  NULL);
3176                 }
3177
3178                 _exit(exit_status);
3179         }
3180
3181         log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3182
3183         /* We add the new process to the cgroup both in the child (so
3184          * that we can be sure that no user code is ever executed
3185          * outside of the cgroup) and in the parent (so that we can be
3186          * sure that when we kill the cgroup the process will be
3187          * killed too). */
3188         if (params->cgroup_path)
3189                 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3190
3191         exec_status_start(&command->exec_status, pid);
3192
3193         *ret = pid;
3194         return 0;
3195 }
3196
3197 void exec_context_init(ExecContext *c) {
3198         ExecDirectoryType i;
3199
3200         assert(c);
3201
3202         c->umask = 0022;
3203         c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3204         c->cpu_sched_policy = SCHED_OTHER;
3205         c->syslog_priority = LOG_DAEMON|LOG_INFO;
3206         c->syslog_level_prefix = true;
3207         c->ignore_sigpipe = true;
3208         c->timer_slack_nsec = NSEC_INFINITY;
3209         c->personality = PERSONALITY_INVALID;
3210         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
3211                 c->directories[i].mode = 0755;
3212         c->capability_bounding_set = CAP_ALL;
3213         c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3214 }
3215
3216 void exec_context_done(ExecContext *c) {
3217         unsigned l;
3218         ExecDirectoryType i;
3219
3220         assert(c);
3221
3222         c->environment = strv_free(c->environment);
3223         c->environment_files = strv_free(c->environment_files);
3224         c->pass_environment = strv_free(c->pass_environment);
3225
3226         for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3227                 c->rlimit[l] = mfree(c->rlimit[l]);
3228
3229         for (l = 0; l < 3; l++)
3230                 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3231
3232         c->working_directory = mfree(c->working_directory);
3233         c->root_directory = mfree(c->root_directory);
3234         c->root_image = mfree(c->root_image);
3235         c->tty_path = mfree(c->tty_path);
3236         c->syslog_identifier = mfree(c->syslog_identifier);
3237         c->user = mfree(c->user);
3238         c->group = mfree(c->group);
3239
3240         c->supplementary_groups = strv_free(c->supplementary_groups);
3241
3242         c->pam_name = mfree(c->pam_name);
3243
3244         c->read_only_paths = strv_free(c->read_only_paths);
3245         c->read_write_paths = strv_free(c->read_write_paths);
3246         c->inaccessible_paths = strv_free(c->inaccessible_paths);
3247
3248         bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3249
3250         if (c->cpuset)
3251                 CPU_FREE(c->cpuset);
3252
3253         c->utmp_id = mfree(c->utmp_id);
3254         c->selinux_context = mfree(c->selinux_context);
3255         c->apparmor_profile = mfree(c->apparmor_profile);
3256         c->smack_process_label = mfree(c->smack_process_label);
3257
3258         c->syscall_filter = set_free(c->syscall_filter);
3259         c->syscall_archs = set_free(c->syscall_archs);
3260         c->address_families = set_free(c->address_families);
3261
3262         for (i = 0; i < _EXEC_DIRECTORY_MAX; i++)
3263                 c->directories[i].paths = strv_free(c->directories[i].paths);
3264 }
3265
3266 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3267         char **i;
3268
3269         assert(c);
3270
3271         if (!runtime_prefix)
3272                 return 0;
3273
3274         STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3275                 _cleanup_free_ char *p;
3276
3277                 p = strjoin(runtime_prefix, "/", *i);
3278                 if (!p)
3279                         return -ENOMEM;
3280
3281                 /* We execute this synchronously, since we need to be
3282                  * sure this is gone when we start the service
3283                  * next. */
3284                 (void) rm_rf(p, REMOVE_ROOT);
3285         }
3286
3287         return 0;
3288 }
3289
3290 void exec_command_done(ExecCommand *c) {
3291         assert(c);
3292
3293         c->path = mfree(c->path);
3294
3295         c->argv = strv_free(c->argv);
3296 }
3297
3298 void exec_command_done_array(ExecCommand *c, unsigned n) {
3299         unsigned i;
3300
3301         for (i = 0; i < n; i++)
3302                 exec_command_done(c+i);
3303 }
3304
3305 ExecCommand* exec_command_free_list(ExecCommand *c) {
3306         ExecCommand *i;
3307
3308         while ((i = c)) {
3309                 LIST_REMOVE(command, c, i);
3310                 exec_command_done(i);
3311                 free(i);
3312         }
3313
3314         return NULL;
3315 }
3316
3317 void exec_command_free_array(ExecCommand **c, unsigned n) {
3318         unsigned i;
3319
3320         for (i = 0; i < n; i++)
3321                 c[i] = exec_command_free_list(c[i]);
3322 }
3323
3324 typedef struct InvalidEnvInfo {
3325         Unit *unit;
3326         const char *path;
3327 } InvalidEnvInfo;
3328
3329 static void invalid_env(const char *p, void *userdata) {
3330         InvalidEnvInfo *info = userdata;
3331
3332         log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3333 }
3334
3335 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3336         assert(c);
3337
3338         switch (fd_index) {
3339         case STDIN_FILENO:
3340                 if (c->std_input != EXEC_INPUT_NAMED_FD)
3341                         return NULL;
3342                 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3343         case STDOUT_FILENO:
3344                 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3345                         return NULL;
3346                 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3347         case STDERR_FILENO:
3348                 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3349                         return NULL;
3350                 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3351         default:
3352                 return NULL;
3353         }
3354 }
3355
3356 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3357         unsigned i, targets;
3358         const char* stdio_fdname[3];
3359         unsigned n_fds;
3360
3361         assert(c);
3362         assert(p);
3363
3364         targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3365                   (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3366                   (c->std_error == EXEC_OUTPUT_NAMED_FD);
3367
3368         for (i = 0; i < 3; i++)
3369                 stdio_fdname[i] = exec_context_fdname(c, i);
3370
3371         n_fds = p->n_storage_fds + p->n_socket_fds;
3372
3373         for (i = 0; i < n_fds  && targets > 0; i++)
3374                 if (named_iofds[STDIN_FILENO] < 0 &&
3375                     c->std_input == EXEC_INPUT_NAMED_FD &&
3376                     stdio_fdname[STDIN_FILENO] &&
3377                     streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3378
3379                         named_iofds[STDIN_FILENO] = p->fds[i];
3380                         targets--;
3381
3382                 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3383                            c->std_output == EXEC_OUTPUT_NAMED_FD &&
3384                            stdio_fdname[STDOUT_FILENO] &&
3385                            streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3386
3387                         named_iofds[STDOUT_FILENO] = p->fds[i];
3388                         targets--;
3389
3390                 } else if (named_iofds[STDERR_FILENO] < 0 &&
3391                            c->std_error == EXEC_OUTPUT_NAMED_FD &&
3392                            stdio_fdname[STDERR_FILENO] &&
3393                            streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3394
3395                         named_iofds[STDERR_FILENO] = p->fds[i];
3396                         targets--;
3397                 }
3398
3399         return targets == 0 ? 0 : -ENOENT;
3400 }
3401
3402 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3403         char **i, **r = NULL;
3404
3405         assert(c);
3406         assert(l);
3407
3408         STRV_FOREACH(i, c->environment_files) {
3409                 char *fn;
3410                 int k;
3411                 unsigned n;
3412                 bool ignore = false;
3413                 char **p;
3414                 _cleanup_globfree_ glob_t pglob = {};
3415
3416                 fn = *i;
3417
3418                 if (fn[0] == '-') {
3419                         ignore = true;
3420                         fn++;
3421                 }
3422
3423                 if (!path_is_absolute(fn)) {
3424                         if (ignore)
3425                                 continue;
3426
3427                         strv_free(r);
3428                         return -EINVAL;
3429                 }
3430
3431                 /* Filename supports globbing, take all matching files */
3432                 k = safe_glob(fn, 0, &pglob);
3433                 if (k < 0) {
3434                         if (ignore)
3435                                 continue;
3436
3437                         strv_free(r);
3438                         return k;
3439                 }
3440
3441                 /* When we don't match anything, -ENOENT should be returned */
3442                 assert(pglob.gl_pathc > 0);
3443
3444                 for (n = 0; n < pglob.gl_pathc; n++) {
3445                         k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3446                         if (k < 0) {
3447                                 if (ignore)
3448                                         continue;
3449
3450                                 strv_free(r);
3451                                 return k;
3452                         }
3453                         /* Log invalid environment variables with filename */
3454                         if (p) {
3455                                 InvalidEnvInfo info = {
3456                                         .unit = unit,
3457                                         .path = pglob.gl_pathv[n]
3458                                 };
3459
3460                                 p = strv_env_clean_with_callback(p, invalid_env, &info);
3461                         }
3462
3463                         if (r == NULL)
3464                                 r = p;
3465                         else {
3466                                 char **m;
3467
3468                                 m = strv_env_merge(2, r, p);
3469                                 strv_free(r);
3470                                 strv_free(p);
3471                                 if (!m)
3472                                         return -ENOMEM;
3473
3474                                 r = m;
3475                         }
3476                 }
3477         }
3478
3479         *l = r;
3480
3481         return 0;
3482 }
3483
3484 static bool tty_may_match_dev_console(const char *tty) {
3485         _cleanup_free_ char *active = NULL;
3486         char *console;
3487
3488         if (!tty)
3489                 return true;
3490
3491         tty = skip_dev_prefix(tty);
3492
3493         /* trivial identity? */
3494         if (streq(tty, "console"))
3495                 return true;
3496
3497         console = resolve_dev_console(&active);
3498         /* if we could not resolve, assume it may */
3499         if (!console)
3500                 return true;
3501
3502         /* "tty0" means the active VC, so it may be the same sometimes */
3503         return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3504 }
3505
3506 bool exec_context_may_touch_console(ExecContext *ec) {
3507
3508         return (ec->tty_reset ||
3509                 ec->tty_vhangup ||
3510                 ec->tty_vt_disallocate ||
3511                 is_terminal_input(ec->std_input) ||
3512                 is_terminal_output(ec->std_output) ||
3513                 is_terminal_output(ec->std_error)) &&
3514                tty_may_match_dev_console(exec_context_tty_path(ec));
3515 }
3516
/* Writes each string of the NULL-terminated vector l to f, each one
 * preceded by a single space. A NULL vector results in no output. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        if (!l)
                return;

        for (; *l; l++)
                fprintf(f, " %s", *l);
}
3525
3526 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3527         char **e, **d;
3528         unsigned i;
3529         ExecDirectoryType dt;
3530         int r;
3531
3532         assert(c);
3533         assert(f);
3534
3535         prefix = strempty(prefix);
3536
3537         fprintf(f,
3538                 "%sUMask: %04o\n"
3539                 "%sWorkingDirectory: %s\n"
3540                 "%sRootDirectory: %s\n"
3541                 "%sNonBlocking: %s\n"
3542                 "%sPrivateTmp: %s\n"
3543                 "%sPrivateDevices: %s\n"
3544                 "%sProtectKernelTunables: %s\n"
3545                 "%sProtectKernelModules: %s\n"
3546                 "%sProtectControlGroups: %s\n"
3547                 "%sPrivateNetwork: %s\n"
3548                 "%sPrivateUsers: %s\n"
3549                 "%sProtectHome: %s\n"
3550                 "%sProtectSystem: %s\n"
3551                 "%sMountAPIVFS: %s\n"
3552                 "%sIgnoreSIGPIPE: %s\n"
3553                 "%sMemoryDenyWriteExecute: %s\n"
3554                 "%sRestrictRealtime: %s\n",
3555                 prefix, c->umask,
3556                 prefix, c->working_directory ? c->working_directory : "/",
3557                 prefix, c->root_directory ? c->root_directory : "/",
3558                 prefix, yes_no(c->non_blocking),
3559                 prefix, yes_no(c->private_tmp),
3560                 prefix, yes_no(c->private_devices),
3561                 prefix, yes_no(c->protect_kernel_tunables),
3562                 prefix, yes_no(c->protect_kernel_modules),
3563                 prefix, yes_no(c->protect_control_groups),
3564                 prefix, yes_no(c->private_network),
3565                 prefix, yes_no(c->private_users),
3566                 prefix, protect_home_to_string(c->protect_home),
3567                 prefix, protect_system_to_string(c->protect_system),
3568                 prefix, yes_no(c->mount_apivfs),
3569                 prefix, yes_no(c->ignore_sigpipe),
3570                 prefix, yes_no(c->memory_deny_write_execute),
3571                 prefix, yes_no(c->restrict_realtime));
3572
3573         if (c->root_image)
3574                 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3575
3576         STRV_FOREACH(e, c->environment)
3577                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3578
3579         STRV_FOREACH(e, c->environment_files)
3580                 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3581
3582         STRV_FOREACH(e, c->pass_environment)
3583                 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3584
3585         fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3586
3587         for (dt = 0; dt < _EXEC_DIRECTORY_MAX; dt++) {
3588                 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3589
3590                 STRV_FOREACH(d, c->directories[dt].paths)
3591                         fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3592         }
3593
3594         if (c->nice_set)
3595                 fprintf(f,
3596                         "%sNice: %i\n",
3597                         prefix, c->nice);
3598
3599         if (c->oom_score_adjust_set)
3600                 fprintf(f,
3601                         "%sOOMScoreAdjust: %i\n",
3602                         prefix, c->oom_score_adjust);
3603
3604         for (i = 0; i < RLIM_NLIMITS; i++)
3605                 if (c->rlimit[i]) {
3606                         fprintf(f, "%s%s: " RLIM_FMT "\n",
3607                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3608                         fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3609                                 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3610                 }
3611
3612         if (c->ioprio_set) {
3613                 _cleanup_free_ char *class_str = NULL;
3614
3615                 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3616                 if (r >= 0)
3617                         fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3618
3619                 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
3620         }
3621
3622         if (c->cpu_sched_set) {
3623                 _cleanup_free_ char *policy_str = NULL;
3624
3625                 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3626                 if (r >= 0)
3627                         fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
3628
3629                 fprintf(f,
3630                         "%sCPUSchedulingPriority: %i\n"
3631                         "%sCPUSchedulingResetOnFork: %s\n",
3632                         prefix, c->cpu_sched_priority,
3633                         prefix, yes_no(c->cpu_sched_reset_on_fork));
3634         }
3635
3636         if (c->cpuset) {
3637                 fprintf(f, "%sCPUAffinity:", prefix);
3638                 for (i = 0; i < c->cpuset_ncpus; i++)
3639                         if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
3640                                 fprintf(f, " %u", i);
3641                 fputs("\n", f);
3642         }
3643
3644         if (c->timer_slack_nsec != NSEC_INFINITY)
3645                 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
3646
3647         fprintf(f,
3648                 "%sStandardInput: %s\n"
3649                 "%sStandardOutput: %s\n"
3650                 "%sStandardError: %s\n",
3651                 prefix, exec_input_to_string(c->std_input),
3652                 prefix, exec_output_to_string(c->std_output),
3653                 prefix, exec_output_to_string(c->std_error));
3654
3655         if (c->tty_path)
3656                 fprintf(f,
3657                         "%sTTYPath: %s\n"
3658                         "%sTTYReset: %s\n"
3659                         "%sTTYVHangup: %s\n"
3660                         "%sTTYVTDisallocate: %s\n",
3661                         prefix, c->tty_path,
3662                         prefix, yes_no(c->tty_reset),
3663                         prefix, yes_no(c->tty_vhangup),
3664                         prefix, yes_no(c->tty_vt_disallocate));
3665
3666         if (IN_SET(c->std_output,
3667                    EXEC_OUTPUT_SYSLOG,
3668                    EXEC_OUTPUT_KMSG,
3669                    EXEC_OUTPUT_JOURNAL,
3670                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3671                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3672                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
3673             IN_SET(c->std_error,
3674                    EXEC_OUTPUT_SYSLOG,
3675                    EXEC_OUTPUT_KMSG,
3676                    EXEC_OUTPUT_JOURNAL,
3677                    EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
3678                    EXEC_OUTPUT_KMSG_AND_CONSOLE,
3679                    EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
3680
3681                 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
3682
3683                 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
3684                 if (r >= 0)
3685                         fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
3686
3687                 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
3688                 if (r >= 0)
3689                         fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
3690         }
3691
3692         if (c->secure_bits) {
3693                 _cleanup_free_ char *str = NULL;
3694
3695                 r = secure_bits_to_string_alloc(c->secure_bits, &str);
3696                 if (r >= 0)
3697                         fprintf(f, "%sSecure Bits: %s\n", prefix, str);
3698         }
3699
3700         if (c->capability_bounding_set != CAP_ALL) {
3701                 _cleanup_free_ char *str = NULL;
3702
3703                 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
3704                 if (r >= 0)
3705                         fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
3706         }
3707
3708         if (c->capability_ambient_set != 0) {
3709                 _cleanup_free_ char *str = NULL;
3710
3711                 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
3712                 if (r >= 0)
3713                         fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
3714         }
3715
3716         if (c->user)
3717                 fprintf(f, "%sUser: %s\n", prefix, c->user);
3718         if (c->group)
3719                 fprintf(f, "%sGroup: %s\n", prefix, c->group);
3720
3721         fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
3722
3723         if (strv_length(c->supplementary_groups) > 0) {
3724                 fprintf(f, "%sSupplementaryGroups:", prefix);
3725                 strv_fprintf(f, c->supplementary_groups);
3726                 fputs("\n", f);
3727         }
3728
3729         if (c->pam_name)
3730                 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
3731
3732         if (strv_length(c->read_write_paths) > 0) {
3733                 fprintf(f, "%sReadWritePaths:", prefix);
3734                 strv_fprintf(f, c->read_write_paths);
3735                 fputs("\n", f);
3736         }
3737
3738         if (strv_length(c->read_only_paths) > 0) {
3739                 fprintf(f, "%sReadOnlyPaths:", prefix);
3740                 strv_fprintf(f, c->read_only_paths);
3741                 fputs("\n", f);
3742         }
3743
3744         if (strv_length(c->inaccessible_paths) > 0) {
3745                 fprintf(f, "%sInaccessiblePaths:", prefix);
3746                 strv_fprintf(f, c->inaccessible_paths);
3747                 fputs("\n", f);
3748         }
3749
3750         if (c->n_bind_mounts > 0)
3751                 for (i = 0; i < c->n_bind_mounts; i++) {
3752                         fprintf(f, "%s%s: %s:%s:%s\n", prefix,
3753                                 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
3754                                 c->bind_mounts[i].source,
3755                                 c->bind_mounts[i].destination,
3756                                 c->bind_mounts[i].recursive ? "rbind" : "norbind");
3757                 }
3758
3759         if (c->utmp_id)
3760                 fprintf(f,
3761                         "%sUtmpIdentifier: %s\n",
3762                         prefix, c->utmp_id);
3763
3764         if (c->selinux_context)
3765                 fprintf(f,
3766                         "%sSELinuxContext: %s%s\n",
3767                         prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
3768
3769         if (c->apparmor_profile)
3770                 fprintf(f,
3771                         "%sAppArmorProfile: %s%s\n",
3772                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3773
3774         if (c->smack_process_label)
3775                 fprintf(f,
3776                         "%sSmackProcessLabel: %s%s\n",
3777                         prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
3778
3779         if (c->personality != PERSONALITY_INVALID)
3780                 fprintf(f,
3781                         "%sPersonality: %s\n",
3782                         prefix, strna(personality_to_string(c->personality)));
3783
3784         fprintf(f,
3785                 "%sLockPersonality: %s\n",
3786                 prefix, yes_no(c->lock_personality));
3787
3788         if (c->syscall_filter) {
3789 #ifdef HAVE_SECCOMP
3790                 Iterator j;
3791                 void *id;
3792                 bool first = true;
3793 #endif
3794
3795                 fprintf(f,
3796                         "%sSystemCallFilter: ",
3797                         prefix);
3798
3799                 if (!c->syscall_whitelist)
3800                         fputc('~', f);
3801
3802 #ifdef HAVE_SECCOMP
3803                 SET_FOREACH(id, c->syscall_filter, j) {
3804                         _cleanup_free_ char *name = NULL;
3805
3806                         if (first)
3807                                 first = false;
3808                         else
3809                                 fputc(' ', f);
3810
3811                         name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
3812                         fputs(strna(name), f);
3813                 }
3814 #endif
3815
3816                 fputc('\n', f);
3817         }
3818
3819         if (c->syscall_archs) {
3820 #ifdef HAVE_SECCOMP
3821                 Iterator j;
3822                 void *id;
3823 #endif
3824
3825                 fprintf(f,
3826                         "%sSystemCallArchitectures:",
3827                         prefix);
3828
3829 #ifdef HAVE_SECCOMP
3830                 SET_FOREACH(id, c->syscall_archs, j)
3831                         fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
3832 #endif
3833                 fputc('\n', f);
3834         }
3835
3836         if (exec_context_restrict_namespaces_set(c)) {
3837                 _cleanup_free_ char *s = NULL;
3838
3839                 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
3840                 if (r >= 0)
3841                         fprintf(f, "%sRestrictNamespaces: %s\n",
3842                                 prefix, s);
3843         }
3844
3845         if (c->syscall_errno > 0)
3846                 fprintf(f,
3847                         "%sSystemCallErrorNumber: %s\n",
3848                         prefix, strna(errno_to_name(c->syscall_errno)));
3849
3850         if (c->apparmor_profile)
3851                 fprintf(f,
3852                         "%sAppArmorProfile: %s%s\n",
3853                         prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
3854 }
3855
3856 bool exec_context_maintains_privileges(ExecContext *c) {
3857         assert(c);
3858
3859         /* Returns true if the process forked off would run under
3860          * an unchanged UID or as root. */
3861
3862         if (!c->user)
3863                 return true;
3864
3865         if (streq(c->user, "root") || streq(c->user, "0"))
3866                 return true;
3867
3868         return false;
3869 }
3870
3871 int exec_context_get_effective_ioprio(ExecContext *c) {
3872         int p;
3873
3874         assert(c);
3875
3876         if (c->ioprio_set)
3877                 return c->ioprio;
3878
3879         p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
3880         if (p < 0)
3881                 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
3882
3883         return p;
3884 }
3885
3886 void exec_status_start(ExecStatus *s, pid_t pid) {
3887         assert(s);
3888
3889         zero(*s);
3890         s->pid = pid;
3891         dual_timestamp_get(&s->start_timestamp);
3892 }
3893
3894 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
3895         assert(s);
3896
3897         if (s->pid && s->pid != pid)
3898                 zero(*s);
3899
3900         s->pid = pid;
3901         dual_timestamp_get(&s->exit_timestamp);
3902
3903         s->code = code;
3904         s->status = status;
3905
3906         if (context) {
3907                 if (context->utmp_id)
3908                         utmp_put_dead_process(context->utmp_id, pid, code, status);
3909
3910                 exec_context_tty_reset(context, NULL);
3911         }
3912 }
3913
3914 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
3915         char buf[FORMAT_TIMESTAMP_MAX];
3916
3917         assert(s);
3918         assert(f);
3919
3920         if (s->pid <= 0)
3921                 return;
3922
3923         prefix = strempty(prefix);
3924
3925         fprintf(f,
3926                 "%sPID: "PID_FMT"\n",
3927                 prefix, s->pid);
3928
3929         if (dual_timestamp_is_set(&s->start_timestamp))
3930                 fprintf(f,
3931                         "%sStart Timestamp: %s\n",
3932                         prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
3933
3934         if (dual_timestamp_is_set(&s->exit_timestamp))
3935                 fprintf(f,
3936                         "%sExit Timestamp: %s\n"
3937                         "%sExit Code: %s\n"
3938                         "%sExit Status: %i\n",
3939                         prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
3940                         prefix, sigchld_code_to_string(s->code),
3941                         prefix, s->status);
3942 }
3943
3944 char *exec_command_line(char **argv) {
3945         size_t k;
3946         char *n, *p, **a;
3947         bool first = true;
3948
3949         assert(argv);
3950
3951         k = 1;
3952         STRV_FOREACH(a, argv)
3953                 k += strlen(*a)+3;
3954
3955         n = new(char, k);
3956         if (!n)
3957                 return NULL;
3958
3959         p = n;
3960         STRV_FOREACH(a, argv) {
3961
3962                 if (!first)
3963                         *(p++) = ' ';
3964                 else
3965                         first = false;
3966
3967                 if (strpbrk(*a, WHITESPACE)) {
3968                         *(p++) = '\'';
3969                         p = stpcpy(p, *a);
3970                         *(p++) = '\'';
3971                 } else
3972                         p = stpcpy(p, *a);
3973
3974         }
3975
3976         *p = 0;
3977
3978         /* FIXME: this doesn't really handle arguments that have
3979          * spaces and ticks in them */
3980
3981         return n;
3982 }
3983
3984 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
3985         _cleanup_free_ char *cmd = NULL;
3986         const char *prefix2;
3987
3988         assert(c);
3989         assert(f);
3990
3991         prefix = strempty(prefix);
3992         prefix2 = strjoina(prefix, "\t");
3993
3994         cmd = exec_command_line(c->argv);
3995         fprintf(f,
3996                 "%sCommand Line: %s\n",
3997                 prefix, cmd ? cmd : strerror(ENOMEM));
3998
3999         exec_status_dump(&c->exec_status, f, prefix2);
4000 }
4001
4002 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4003         assert(f);
4004
4005         prefix = strempty(prefix);
4006
4007         LIST_FOREACH(command, c, c)
4008                 exec_command_dump(c, f, prefix);
4009 }
4010
4011 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4012         ExecCommand *end;
4013
4014         assert(l);
4015         assert(e);
4016
4017         if (*l) {
4018                 /* It's kind of important, that we keep the order here */
4019                 LIST_FIND_TAIL(command, *l, end);
4020                 LIST_INSERT_AFTER(command, *l, end, e);
4021         } else
4022               *l = e;
4023 }
4024
4025 int exec_command_set(ExecCommand *c, const char *path, ...) {
4026         va_list ap;
4027         char **l, *p;
4028
4029         assert(c);
4030         assert(path);
4031
4032         va_start(ap, path);
4033         l = strv_new_ap(path, ap);
4034         va_end(ap);
4035
4036         if (!l)
4037                 return -ENOMEM;
4038
4039         p = strdup(path);
4040         if (!p) {
4041                 strv_free(l);
4042                 return -ENOMEM;
4043         }
4044
4045         free(c->path);
4046         c->path = p;
4047
4048         strv_free(c->argv);
4049         c->argv = l;
4050
4051         return 0;
4052 }
4053
4054 int exec_command_append(ExecCommand *c, const char *path, ...) {
4055         _cleanup_strv_free_ char **l = NULL;
4056         va_list ap;
4057         int r;
4058
4059         assert(c);
4060         assert(path);
4061
4062         va_start(ap, path);
4063         l = strv_new_ap(path, ap);
4064         va_end(ap);
4065
4066         if (!l)
4067                 return -ENOMEM;
4068
4069         r = strv_extend_strv(&c->argv, l, false);
4070         if (r < 0)
4071                 return r;
4072
4073         return 0;
4074 }
4075
4076
4077 static int exec_runtime_allocate(ExecRuntime **rt) {
4078
4079         if (*rt)
4080                 return 0;
4081
4082         *rt = new0(ExecRuntime, 1);
4083         if (!*rt)
4084                 return -ENOMEM;
4085
4086         (*rt)->n_ref = 1;
4087         (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4088
4089         return 0;
4090 }
4091
4092 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4093         int r;
4094
4095         assert(rt);
4096         assert(c);
4097         assert(id);
4098
4099         if (*rt)
4100                 return 1;
4101
4102         if (!c->private_network && !c->private_tmp)
4103                 return 0;
4104
4105         r = exec_runtime_allocate(rt);
4106         if (r < 0)
4107                 return r;
4108
4109         if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4110                 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4111                         return -errno;
4112         }
4113
4114         if (c->private_tmp && !(*rt)->tmp_dir) {
4115                 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4116                 if (r < 0)
4117                         return r;
4118         }
4119
4120         return 1;
4121 }
4122
4123 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4124         assert(r);
4125         assert(r->n_ref > 0);
4126
4127         r->n_ref++;
4128         return r;
4129 }
4130
4131 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4132
4133         if (!r)
4134                 return NULL;
4135
4136         assert(r->n_ref > 0);
4137
4138         r->n_ref--;
4139         if (r->n_ref > 0)
4140                 return NULL;
4141
4142         free(r->tmp_dir);
4143         free(r->var_tmp_dir);
4144         safe_close_pair(r->netns_storage_socket);
4145         return mfree(r);
4146 }
4147
4148 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4149         assert(u);
4150         assert(f);
4151         assert(fds);
4152
4153         if (!rt)
4154                 return 0;
4155
4156         if (rt->tmp_dir)
4157                 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4158
4159         if (rt->var_tmp_dir)
4160                 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4161
4162         if (rt->netns_storage_socket[0] >= 0) {
4163                 int copy;
4164
4165                 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4166                 if (copy < 0)
4167                         return copy;
4168
4169                 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4170         }
4171
4172         if (rt->netns_storage_socket[1] >= 0) {
4173                 int copy;
4174
4175                 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4176                 if (copy < 0)
4177                         return copy;
4178
4179                 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4180         }
4181
4182         return 0;
4183 }
4184
4185 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4186         int r;
4187
4188         assert(rt);
4189         assert(key);
4190         assert(value);
4191
4192         if (streq(key, "tmp-dir")) {
4193                 char *copy;
4194
4195                 r = exec_runtime_allocate(rt);
4196                 if (r < 0)
4197                         return log_oom();
4198
4199                 copy = strdup(value);
4200                 if (!copy)
4201                         return log_oom();
4202
4203                 free((*rt)->tmp_dir);
4204                 (*rt)->tmp_dir = copy;
4205
4206         } else if (streq(key, "var-tmp-dir")) {
4207                 char *copy;
4208
4209                 r = exec_runtime_allocate(rt);
4210                 if (r < 0)
4211                         return log_oom();
4212
4213                 copy = strdup(value);
4214                 if (!copy)
4215                         return log_oom();
4216
4217                 free((*rt)->var_tmp_dir);
4218                 (*rt)->var_tmp_dir = copy;
4219
4220         } else if (streq(key, "netns-socket-0")) {
4221                 int fd;
4222
4223                 r = exec_runtime_allocate(rt);
4224                 if (r < 0)
4225                         return log_oom();
4226
4227                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4228                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4229                 else {
4230                         safe_close((*rt)->netns_storage_socket[0]);
4231                         (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4232                 }
4233         } else if (streq(key, "netns-socket-1")) {
4234                 int fd;
4235
4236                 r = exec_runtime_allocate(rt);
4237                 if (r < 0)
4238                         return log_oom();
4239
4240                 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4241                         log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4242                 else {
4243                         safe_close((*rt)->netns_storage_socket[1]);
4244                         (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4245                 }
4246         } else
4247                 return 0;
4248
4249         return 1;
4250 }
4251
4252 static void *remove_tmpdir_thread(void *p) {
4253         _cleanup_free_ char *path = p;
4254
4255         (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4256         return NULL;
4257 }
4258
4259 void exec_runtime_destroy(ExecRuntime *rt) {
4260         int r;
4261
4262         if (!rt)
4263                 return;
4264
4265         /* If there are multiple users of this, let's leave the stuff around */
4266         if (rt->n_ref > 1)
4267                 return;
4268
4269         if (rt->tmp_dir) {
4270                 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4271
4272                 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4273                 if (r < 0) {
4274                         log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4275                         free(rt->tmp_dir);
4276                 }
4277
4278                 rt->tmp_dir = NULL;
4279         }
4280
4281         if (rt->var_tmp_dir) {
4282                 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4283
4284                 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4285                 if (r < 0) {
4286                         log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4287                         free(rt->var_tmp_dir);
4288                 }
4289
4290                 rt->var_tmp_dir = NULL;
4291         }
4292
4293         safe_close_pair(rt->netns_storage_socket);
4294 }
4295
4296 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4297         [EXEC_INPUT_NULL] = "null",
4298         [EXEC_INPUT_TTY] = "tty",
4299         [EXEC_INPUT_TTY_FORCE] = "tty-force",
4300         [EXEC_INPUT_TTY_FAIL] = "tty-fail",
4301         [EXEC_INPUT_SOCKET] = "socket",
4302         [EXEC_INPUT_NAMED_FD] = "fd",
4303 };
4304
4305 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4306
4307 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
4308         [EXEC_OUTPUT_INHERIT] = "inherit",
4309         [EXEC_OUTPUT_NULL] = "null",
4310         [EXEC_OUTPUT_TTY] = "tty",
4311         [EXEC_OUTPUT_SYSLOG] = "syslog",
4312         [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
4313         [EXEC_OUTPUT_KMSG] = "kmsg",
4314         [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
4315         [EXEC_OUTPUT_JOURNAL] = "journal",
4316         [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
4317         [EXEC_OUTPUT_SOCKET] = "socket",
4318         [EXEC_OUTPUT_NAMED_FD] = "fd",
4319 };
4320
4321 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4322
4323 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4324         [EXEC_UTMP_INIT] = "init",
4325         [EXEC_UTMP_LOGIN] = "login",
4326         [EXEC_UTMP_USER] = "user",
4327 };
4328
4329 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4330
4331 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4332         [EXEC_PRESERVE_NO] = "no",
4333         [EXEC_PRESERVE_YES] = "yes",
4334         [EXEC_PRESERVE_RESTART] = "restart",
4335 };
4336
4337 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4338
4339 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_MAX] = {
4340         [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4341         [EXEC_DIRECTORY_STATE] = "StateDirectory",
4342         [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4343         [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4344         [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
4345 };
4346
4347 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);