]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
cgroup-util: merge cg_set_tasks_access() and cg-set_group_access() into one
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19 ***/
20
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <glob.h>
24 #include <grp.h>
25 #include <poll.h>
26 #include <signal.h>
27 #include <string.h>
28 #include <sys/capability.h>
29 #include <sys/eventfd.h>
30 #include <sys/mman.h>
31 #include <sys/personality.h>
32 #include <sys/prctl.h>
33 #include <sys/shm.h>
34 #include <sys/socket.h>
35 #include <sys/stat.h>
36 #include <sys/types.h>
37 #include <sys/un.h>
38 #include <unistd.h>
39 #include <utmpx.h>
40
41 #if HAVE_PAM
42 #include <security/pam_appl.h>
43 #endif
44
45 #if HAVE_SELINUX
46 #include <selinux/selinux.h>
47 #endif
48
49 #if HAVE_SECCOMP
50 #include <seccomp.h>
51 #endif
52
53 #if HAVE_APPARMOR
54 #include <sys/apparmor.h>
55 #endif
56
57 #include "sd-messages.h"
58
59 #include "af-list.h"
60 #include "alloc-util.h"
61 #if HAVE_APPARMOR
62 #include "apparmor-util.h"
63 #endif
64 #include "async.h"
65 #include "barrier.h"
66 #include "cap-list.h"
67 #include "capability-util.h"
68 #include "chown-recursive.h"
69 #include "def.h"
70 #include "env-util.h"
71 #include "errno-list.h"
72 #include "execute.h"
73 #include "exit-status.h"
74 #include "fd-util.h"
75 #include "fileio.h"
76 #include "format-util.h"
77 #include "fs-util.h"
78 #include "glob-util.h"
79 #include "io-util.h"
80 #include "ioprio.h"
81 #include "label.h"
82 #include "log.h"
83 #include "macro.h"
84 #include "missing.h"
85 #include "mkdir.h"
86 #include "namespace.h"
87 #include "parse-util.h"
88 #include "path-util.h"
89 #include "process-util.h"
90 #include "rlimit-util.h"
91 #include "rm-rf.h"
92 #if HAVE_SECCOMP
93 #include "seccomp-util.h"
94 #endif
95 #include "securebits.h"
96 #include "securebits-util.h"
97 #include "selinux-util.h"
98 #include "signal-util.h"
99 #include "smack-util.h"
100 #include "special.h"
101 #include "string-table.h"
102 #include "string-util.h"
103 #include "strv.h"
104 #include "syslog-util.h"
105 #include "terminal-util.h"
106 #include "unit.h"
107 #include "user-util.h"
108 #include "util.h"
109 #include "utmp-wtmp.h"
110
/* Idle timeouts, expressed as multiples of USEC_PER_SEC */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Access mode enforced on terminals by chown_terminal(). This assumes there is a 'tty' group */
#define TTY_MODE 0620

/* Send buffer size requested for the journal stream sockets opened by connect_logger_as() */
#define SNDBUF_SIZE (8 * 1024 * 1024)
118
/* Renumbers the passed file descriptors so that entry i of the array ends up being fd number i+3.
 *
 * Note that this modifies the fds array in place: every descriptor that had to be moved is duplicated to
 * its new number, the old one is closed and the array slot is updated.
 *
 * Returns 0 on success, or a negative errno if duplicating a descriptor fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int idx, retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        /* Nothing to do if this one already sits at its final number */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free number >= wanted. If that is not the
                         * number we asked for, the slot is still occupied; remember the first
                         * such index so that another pass can start from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
163
/* Adjusts the file descriptor flags of all fds we are going to pass on.
 *
 * The first n_socket_fds entries of the array get O_NONBLOCK set or cleared according to "nonblock"; the
 * flag is left alone for the remaining entries. FD_CLOEXEC is dropped from every entry.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total, idx;
        int r;

        total = n_storage_fds + n_socket_fds;
        if (total == 0)
                return 0;

        assert(fds);

        for (idx = 0; idx < total; idx++) {
                const bool is_socket = idx < n_socket_fds;

                /* O_NONBLOCK only applies to socket activation */
                if (is_socket) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally, since after all we want to pass these fds
                 * on to our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
196
197 static const char *exec_context_tty_path(const ExecContext *context) {
198 assert(context);
199
200 if (context->stdio_as_fds)
201 return NULL;
202
203 if (context->tty_path)
204 return context->tty_path;
205
206 return "/dev/console";
207 }
208
209 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
210 const char *path;
211
212 assert(context);
213
214 path = exec_context_tty_path(context);
215
216 if (context->tty_vhangup) {
217 if (p && p->stdin_fd >= 0)
218 (void) terminal_vhangup_fd(p->stdin_fd);
219 else if (path)
220 (void) terminal_vhangup(path);
221 }
222
223 if (context->tty_reset) {
224 if (p && p->stdin_fd >= 0)
225 (void) reset_terminal_fd(p->stdin_fd, true);
226 else if (path)
227 (void) reset_terminal(path);
228 }
229
230 if (context->tty_vt_disallocate && path)
231 (void) vt_disallocate(path);
232 }
233
234 static bool is_terminal_input(ExecInput i) {
235 return IN_SET(i,
236 EXEC_INPUT_TTY,
237 EXEC_INPUT_TTY_FORCE,
238 EXEC_INPUT_TTY_FAIL);
239 }
240
241 static bool is_terminal_output(ExecOutput o) {
242 return IN_SET(o,
243 EXEC_OUTPUT_TTY,
244 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
245 EXEC_OUTPUT_KMSG_AND_CONSOLE,
246 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
247 }
248
249 static bool is_syslog_output(ExecOutput o) {
250 return IN_SET(o,
251 EXEC_OUTPUT_SYSLOG,
252 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
253 }
254
255 static bool is_kmsg_output(ExecOutput o) {
256 return IN_SET(o,
257 EXEC_OUTPUT_KMSG,
258 EXEC_OUTPUT_KMSG_AND_CONSOLE);
259 }
260
261 static bool exec_context_needs_term(const ExecContext *c) {
262 assert(c);
263
264 /* Return true if the execution context suggests we should set $TERM to something useful. */
265
266 if (is_terminal_input(c->std_input))
267 return true;
268
269 if (is_terminal_output(c->std_output))
270 return true;
271
272 if (is_terminal_output(c->std_error))
273 return true;
274
275 return !!c->tty_path;
276 }
277
278 static int open_null_as(int flags, int nfd) {
279 int fd;
280
281 assert(nfd >= 0);
282
283 fd = open("/dev/null", flags|O_NOCTTY);
284 if (fd < 0)
285 return -errno;
286
287 return move_fd(fd, nfd, false);
288 }
289
290 static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
291 static const union sockaddr_union sa = {
292 .un.sun_family = AF_UNIX,
293 .un.sun_path = "/run/systemd/journal/stdout",
294 };
295 uid_t olduid = UID_INVALID;
296 gid_t oldgid = GID_INVALID;
297 int r;
298
299 if (gid_is_valid(gid)) {
300 oldgid = getgid();
301
302 if (setegid(gid) < 0)
303 return -errno;
304 }
305
306 if (uid_is_valid(uid)) {
307 olduid = getuid();
308
309 if (seteuid(uid) < 0) {
310 r = -errno;
311 goto restore_gid;
312 }
313 }
314
315 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
316
317 /* If we fail to restore the uid or gid, things will likely
318 fail later on. This should only happen if an LSM interferes. */
319
320 if (uid_is_valid(uid))
321 (void) seteuid(olduid);
322
323 restore_gid:
324 if (gid_is_valid(gid))
325 (void) setegid(oldgid);
326
327 return r;
328 }
329
330 static int connect_logger_as(
331 Unit *unit,
332 const ExecContext *context,
333 const ExecParameters *params,
334 ExecOutput output,
335 const char *ident,
336 int nfd,
337 uid_t uid,
338 gid_t gid) {
339
340 int fd, r;
341
342 assert(context);
343 assert(params);
344 assert(output < _EXEC_OUTPUT_MAX);
345 assert(ident);
346 assert(nfd >= 0);
347
348 fd = socket(AF_UNIX, SOCK_STREAM, 0);
349 if (fd < 0)
350 return -errno;
351
352 r = connect_journal_socket(fd, uid, gid);
353 if (r < 0)
354 return r;
355
356 if (shutdown(fd, SHUT_RD) < 0) {
357 safe_close(fd);
358 return -errno;
359 }
360
361 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
362
363 dprintf(fd,
364 "%s\n"
365 "%s\n"
366 "%i\n"
367 "%i\n"
368 "%i\n"
369 "%i\n"
370 "%i\n",
371 context->syslog_identifier ?: ident,
372 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
373 context->syslog_priority,
374 !!context->syslog_level_prefix,
375 is_syslog_output(output),
376 is_kmsg_output(output),
377 is_terminal_output(output));
378
379 return move_fd(fd, nfd, false);
380 }
381 static int open_terminal_as(const char *path, int flags, int nfd) {
382 int fd;
383
384 assert(path);
385 assert(nfd >= 0);
386
387 fd = open_terminal(path, flags | O_NOCTTY);
388 if (fd < 0)
389 return fd;
390
391 return move_fd(fd, nfd, false);
392 }
393
394 static int acquire_path(const char *path, int flags, mode_t mode) {
395 union sockaddr_union sa = {
396 .sa.sa_family = AF_UNIX,
397 };
398 int fd, r;
399
400 assert(path);
401
402 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
403 flags |= O_CREAT;
404
405 fd = open(path, flags|O_NOCTTY, mode);
406 if (fd >= 0)
407 return fd;
408
409 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
410 return -errno;
411 if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
412 return -ENXIO;
413
414 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
415
416 fd = socket(AF_UNIX, SOCK_STREAM, 0);
417 if (fd < 0)
418 return -errno;
419
420 strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
421 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
422 safe_close(fd);
423 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
424 * indication that his wasn't an AF_UNIX socket after all */
425 }
426
427 if ((flags & O_ACCMODE) == O_RDONLY)
428 r = shutdown(fd, SHUT_WR);
429 else if ((flags & O_ACCMODE) == O_WRONLY)
430 r = shutdown(fd, SHUT_RD);
431 else
432 return fd;
433 if (r < 0) {
434 safe_close(fd);
435 return -errno;
436 }
437
438 return fd;
439 }
440
441 static int fixup_input(
442 const ExecContext *context,
443 int socket_fd,
444 bool apply_tty_stdin) {
445
446 ExecInput std_input;
447
448 assert(context);
449
450 std_input = context->std_input;
451
452 if (is_terminal_input(std_input) && !apply_tty_stdin)
453 return EXEC_INPUT_NULL;
454
455 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
456 return EXEC_INPUT_NULL;
457
458 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
459 return EXEC_INPUT_NULL;
460
461 return std_input;
462 }
463
464 static int fixup_output(ExecOutput std_output, int socket_fd) {
465
466 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
467 return EXEC_OUTPUT_INHERIT;
468
469 return std_output;
470 }
471
472 static int setup_input(
473 const ExecContext *context,
474 const ExecParameters *params,
475 int socket_fd,
476 int named_iofds[3]) {
477
478 ExecInput i;
479
480 assert(context);
481 assert(params);
482
483 if (params->stdin_fd >= 0) {
484 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
485 return -errno;
486
487 /* Try to make this the controlling tty, if it is a tty, and reset it */
488 if (isatty(STDIN_FILENO)) {
489 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
490 (void) reset_terminal_fd(STDIN_FILENO, true);
491 }
492
493 return STDIN_FILENO;
494 }
495
496 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
497
498 switch (i) {
499
500 case EXEC_INPUT_NULL:
501 return open_null_as(O_RDONLY, STDIN_FILENO);
502
503 case EXEC_INPUT_TTY:
504 case EXEC_INPUT_TTY_FORCE:
505 case EXEC_INPUT_TTY_FAIL: {
506 int fd;
507
508 fd = acquire_terminal(exec_context_tty_path(context),
509 i == EXEC_INPUT_TTY_FAIL,
510 i == EXEC_INPUT_TTY_FORCE,
511 false,
512 USEC_INFINITY);
513 if (fd < 0)
514 return fd;
515
516 return move_fd(fd, STDIN_FILENO, false);
517 }
518
519 case EXEC_INPUT_SOCKET:
520 assert(socket_fd >= 0);
521
522 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
523
524 case EXEC_INPUT_NAMED_FD:
525 assert(named_iofds[STDIN_FILENO] >= 0);
526
527 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
528 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
529
530 case EXEC_INPUT_DATA: {
531 int fd;
532
533 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
534 if (fd < 0)
535 return fd;
536
537 return move_fd(fd, STDIN_FILENO, false);
538 }
539
540 case EXEC_INPUT_FILE: {
541 bool rw;
542 int fd;
543
544 assert(context->stdio_file[STDIN_FILENO]);
545
546 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
547 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
548
549 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
550 if (fd < 0)
551 return fd;
552
553 return move_fd(fd, STDIN_FILENO, false);
554 }
555
556 default:
557 assert_not_reached("Unknown input type");
558 }
559 }
560
561 static int setup_output(
562 Unit *unit,
563 const ExecContext *context,
564 const ExecParameters *params,
565 int fileno,
566 int socket_fd,
567 int named_iofds[3],
568 const char *ident,
569 uid_t uid,
570 gid_t gid,
571 dev_t *journal_stream_dev,
572 ino_t *journal_stream_ino) {
573
574 ExecOutput o;
575 ExecInput i;
576 int r;
577
578 assert(unit);
579 assert(context);
580 assert(params);
581 assert(ident);
582 assert(journal_stream_dev);
583 assert(journal_stream_ino);
584
585 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
586
587 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
588 return -errno;
589
590 return STDOUT_FILENO;
591 }
592
593 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
594 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
595 return -errno;
596
597 return STDERR_FILENO;
598 }
599
600 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
601 o = fixup_output(context->std_output, socket_fd);
602
603 if (fileno == STDERR_FILENO) {
604 ExecOutput e;
605 e = fixup_output(context->std_error, socket_fd);
606
607 /* This expects the input and output are already set up */
608
609 /* Don't change the stderr file descriptor if we inherit all
610 * the way and are not on a tty */
611 if (e == EXEC_OUTPUT_INHERIT &&
612 o == EXEC_OUTPUT_INHERIT &&
613 i == EXEC_INPUT_NULL &&
614 !is_terminal_input(context->std_input) &&
615 getppid () != 1)
616 return fileno;
617
618 /* Duplicate from stdout if possible */
619 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
620 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
621
622 o = e;
623
624 } else if (o == EXEC_OUTPUT_INHERIT) {
625 /* If input got downgraded, inherit the original value */
626 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
627 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
628
629 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
630 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
631 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
632
633 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
634 if (getppid() != 1)
635 return fileno;
636
637 /* We need to open /dev/null here anew, to get the right access mode. */
638 return open_null_as(O_WRONLY, fileno);
639 }
640
641 switch (o) {
642
643 case EXEC_OUTPUT_NULL:
644 return open_null_as(O_WRONLY, fileno);
645
646 case EXEC_OUTPUT_TTY:
647 if (is_terminal_input(i))
648 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
649
650 /* We don't reset the terminal if this is just about output */
651 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
652
653 case EXEC_OUTPUT_SYSLOG:
654 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
655 case EXEC_OUTPUT_KMSG:
656 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
657 case EXEC_OUTPUT_JOURNAL:
658 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
659 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
660 if (r < 0) {
661 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
662 r = open_null_as(O_WRONLY, fileno);
663 } else {
664 struct stat st;
665
666 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
667 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
668 * services to detect whether they are connected to the journal or not.
669 *
670 * If both stdout and stderr are connected to a stream then let's make sure to store the data
671 * about STDERR as that's usually the best way to do logging. */
672
673 if (fstat(fileno, &st) >= 0 &&
674 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
675 *journal_stream_dev = st.st_dev;
676 *journal_stream_ino = st.st_ino;
677 }
678 }
679 return r;
680
681 case EXEC_OUTPUT_SOCKET:
682 assert(socket_fd >= 0);
683
684 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
685
686 case EXEC_OUTPUT_NAMED_FD:
687 assert(named_iofds[fileno] >= 0);
688
689 (void) fd_nonblock(named_iofds[fileno], false);
690 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
691
692 case EXEC_OUTPUT_FILE: {
693 bool rw;
694 int fd;
695
696 assert(context->stdio_file[fileno]);
697
698 rw = context->std_input == EXEC_INPUT_FILE &&
699 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
700
701 if (rw)
702 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
703
704 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
705 if (fd < 0)
706 return fd;
707
708 return move_fd(fd, fileno, false);
709 }
710
711 default:
712 assert_not_reached("Unknown error type");
713 }
714 }
715
716 static int chown_terminal(int fd, uid_t uid) {
717 struct stat st;
718
719 assert(fd >= 0);
720
721 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
722 if (isatty(fd) < 1)
723 return 0;
724
725 /* This might fail. What matters are the results. */
726 (void) fchown(fd, uid, -1);
727 (void) fchmod(fd, TTY_MODE);
728
729 if (fstat(fd, &st) < 0)
730 return -errno;
731
732 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
733 return -EPERM;
734
735 return 0;
736 }
737
738 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
739 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
740 int r;
741
742 assert(_saved_stdin);
743 assert(_saved_stdout);
744
745 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
746 if (saved_stdin < 0)
747 return -errno;
748
749 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
750 if (saved_stdout < 0)
751 return -errno;
752
753 fd = acquire_terminal(vc, false, false, false, DEFAULT_CONFIRM_USEC);
754 if (fd < 0)
755 return fd;
756
757 r = chown_terminal(fd, getuid());
758 if (r < 0)
759 return r;
760
761 r = reset_terminal_fd(fd, true);
762 if (r < 0)
763 return r;
764
765 if (dup2(fd, STDIN_FILENO) < 0)
766 return -errno;
767
768 if (dup2(fd, STDOUT_FILENO) < 0)
769 return -errno;
770
771 if (fd >= 2)
772 safe_close(fd);
773 fd = -1;
774
775 *_saved_stdin = saved_stdin;
776 *_saved_stdout = saved_stdout;
777
778 saved_stdin = saved_stdout = -1;
779
780 return 0;
781 }
782
783 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
784 assert(err < 0);
785
786 if (err == -ETIMEDOUT)
787 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
788 else {
789 errno = -err;
790 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
791 }
792 }
793
794 static void write_confirm_error(int err, const char *vc, const Unit *u) {
795 _cleanup_close_ int fd = -1;
796
797 assert(vc);
798
799 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
800 if (fd < 0)
801 return;
802
803 write_confirm_error_fd(err, fd, u);
804 }
805
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved descriptors back in place as
 * stdin/stdout and closes the saved copies.
 *
 * Both descriptors are always processed. Returns 0 on success, otherwise the negative errno of the last
 * dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        /* The copies are not needed anymore, regardless of whether restoring worked */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
827
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* 'f': don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* 's': don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* 'y': execute the command */
};
833
834 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
835 int saved_stdout = -1, saved_stdin = -1, r;
836 _cleanup_free_ char *e = NULL;
837 char c;
838
839 /* For any internal errors, assume a positive response. */
840 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
841 if (r < 0) {
842 write_confirm_error(r, vc, u);
843 return CONFIRM_EXECUTE;
844 }
845
846 /* confirm_spawn might have been disabled while we were sleeping. */
847 if (manager_is_confirm_spawn_disabled(u->manager)) {
848 r = 1;
849 goto restore_stdio;
850 }
851
852 e = ellipsize(cmdline, 60, 100);
853 if (!e) {
854 log_oom();
855 r = CONFIRM_EXECUTE;
856 goto restore_stdio;
857 }
858
859 for (;;) {
860 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
861 if (r < 0) {
862 write_confirm_error_fd(r, STDOUT_FILENO, u);
863 r = CONFIRM_EXECUTE;
864 goto restore_stdio;
865 }
866
867 switch (c) {
868 case 'c':
869 printf("Resuming normal execution.\n");
870 manager_disable_confirm_spawn();
871 r = 1;
872 break;
873 case 'D':
874 unit_dump(u, stdout, " ");
875 continue; /* ask again */
876 case 'f':
877 printf("Failing execution.\n");
878 r = CONFIRM_PRETEND_FAILURE;
879 break;
880 case 'h':
881 printf(" c - continue, proceed without asking anymore\n"
882 " D - dump, show the state of the unit\n"
883 " f - fail, don't execute the command and pretend it failed\n"
884 " h - help\n"
885 " i - info, show a short summary of the unit\n"
886 " j - jobs, show jobs that are in progress\n"
887 " s - skip, don't execute the command and pretend it succeeded\n"
888 " y - yes, execute the command\n");
889 continue; /* ask again */
890 case 'i':
891 printf(" Description: %s\n"
892 " Unit: %s\n"
893 " Command: %s\n",
894 u->id, u->description, cmdline);
895 continue; /* ask again */
896 case 'j':
897 manager_dump_jobs(u->manager, stdout, " ");
898 continue; /* ask again */
899 case 'n':
900 /* 'n' was removed in favor of 'f'. */
901 printf("Didn't understand 'n', did you mean 'f'?\n");
902 continue; /* ask again */
903 case 's':
904 printf("Skipping execution.\n");
905 r = CONFIRM_PRETEND_SUCCESS;
906 break;
907 case 'y':
908 r = CONFIRM_EXECUTE;
909 break;
910 default:
911 assert_not_reached("Unhandled choice");
912 }
913 break;
914 }
915
916 restore_stdio:
917 restore_confirm_stdio(&saved_stdin, &saved_stdout);
918 return r;
919 }
920
921 static int get_fixed_user(const ExecContext *c, const char **user,
922 uid_t *uid, gid_t *gid,
923 const char **home, const char **shell) {
924 int r;
925 const char *name;
926
927 assert(c);
928
929 if (!c->user)
930 return 0;
931
932 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
933 * (i.e. are "/" or "/bin/nologin"). */
934
935 name = c->user;
936 r = get_user_creds_clean(&name, uid, gid, home, shell);
937 if (r < 0)
938 return r;
939
940 *user = name;
941 return 0;
942 }
943
944 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
945 int r;
946 const char *name;
947
948 assert(c);
949
950 if (!c->group)
951 return 0;
952
953 name = c->group;
954 r = get_group_creds(&name, gid);
955 if (r < 0)
956 return r;
957
958 *group = name;
959 return 0;
960 }
961
962 static int get_supplementary_groups(const ExecContext *c, const char *user,
963 const char *group, gid_t gid,
964 gid_t **supplementary_gids, int *ngids) {
965 char **i;
966 int r, k = 0;
967 int ngroups_max;
968 bool keep_groups = false;
969 gid_t *groups = NULL;
970 _cleanup_free_ gid_t *l_gids = NULL;
971
972 assert(c);
973
974 /*
975 * If user is given, then lookup GID and supplementary groups list.
976 * We avoid NSS lookups for gid=0. Also we have to initialize groups
977 * here and as early as possible so we keep the list of supplementary
978 * groups of the caller.
979 */
980 if (user && gid_is_valid(gid) && gid != 0) {
981 /* First step, initialize groups from /etc/groups */
982 if (initgroups(user, gid) < 0)
983 return -errno;
984
985 keep_groups = true;
986 }
987
988 if (strv_isempty(c->supplementary_groups))
989 return 0;
990
991 /*
992 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
993 * be positive, otherwise fail.
994 */
995 errno = 0;
996 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
997 if (ngroups_max <= 0) {
998 if (errno > 0)
999 return -errno;
1000 else
1001 return -EOPNOTSUPP; /* For all other values */
1002 }
1003
1004 l_gids = new(gid_t, ngroups_max);
1005 if (!l_gids)
1006 return -ENOMEM;
1007
1008 if (keep_groups) {
1009 /*
1010 * Lookup the list of groups that the user belongs to, we
1011 * avoid NSS lookups here too for gid=0.
1012 */
1013 k = ngroups_max;
1014 if (getgrouplist(user, gid, l_gids, &k) < 0)
1015 return -EINVAL;
1016 } else
1017 k = 0;
1018
1019 STRV_FOREACH(i, c->supplementary_groups) {
1020 const char *g;
1021
1022 if (k >= ngroups_max)
1023 return -E2BIG;
1024
1025 g = *i;
1026 r = get_group_creds(&g, l_gids+k);
1027 if (r < 0)
1028 return r;
1029
1030 k++;
1031 }
1032
1033 /*
1034 * Sets ngids to zero to drop all supplementary groups, happens
1035 * when we are under root and SupplementaryGroups= is empty.
1036 */
1037 if (k == 0) {
1038 *ngids = 0;
1039 return 0;
1040 }
1041
1042 /* Otherwise get the final list of supplementary groups */
1043 groups = memdup(l_gids, sizeof(gid_t) * k);
1044 if (!groups)
1045 return -ENOMEM;
1046
1047 *supplementary_gids = groups;
1048 *ngids = k;
1049
1050 groups = NULL;
1051
1052 return 0;
1053 }
1054
/* Applies the group credentials to the calling process: first the supplementary groups (if there are
 * any), then the real, effective and saved gid (if the gid is valid).
 *
 * Returns 0 on success, or a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, gid_t *supplementary_gids, int ngids) {
        int r;

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1073
1074 static int enforce_user(const ExecContext *context, uid_t uid) {
1075 assert(context);
1076
1077 if (!uid_is_valid(uid))
1078 return 0;
1079
1080 /* Sets (but doesn't look up) the uid and make sure we keep the
1081 * capabilities while doing so. */
1082
1083 if (context->capability_ambient_set != 0) {
1084
1085 /* First step: If we need to keep capabilities but
1086 * drop privileges we need to make sure we keep our
1087 * caps, while we drop privileges. */
1088 if (uid != 0) {
1089 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
1090
1091 if (prctl(PR_GET_SECUREBITS) != sb)
1092 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1093 return -errno;
1094 }
1095 }
1096
1097 /* Second step: actually set the uids */
1098 if (setresuid(uid, uid, uid) < 0)
1099 return -errno;
1100
1101 /* At this point we should have all necessary capabilities but
1102 are otherwise a normal user. However, the caps might got
1103 corrupted due to the setresuid() so we need clean them up
1104 later. This is done outside of this call. */
1105
1106 return 0;
1107 }
1108
1109 #if HAVE_PAM
1110
1111 static int null_conv(
1112 int num_msg,
1113 const struct pam_message **msg,
1114 struct pam_response **resp,
1115 void *appdata_ptr) {
1116
1117 /* We don't support conversations */
1118
1119 return PAM_CONV_ERR;
1120 }
1121
1122 #endif
1123
1124 static int setup_pam(
1125 const char *name,
1126 const char *user,
1127 uid_t uid,
1128 gid_t gid,
1129 const char *tty,
1130 char ***env,
1131 int fds[], unsigned n_fds) {
1132
1133 #if HAVE_PAM
1134
1135 static const struct pam_conv conv = {
1136 .conv = null_conv,
1137 .appdata_ptr = NULL
1138 };
1139
1140 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1141 pam_handle_t *handle = NULL;
1142 sigset_t old_ss;
1143 int pam_code = PAM_SUCCESS, r;
1144 char **nv, **e = NULL;
1145 bool close_session = false;
1146 pid_t pam_pid = 0, parent_pid;
1147 int flags = 0;
1148
1149 assert(name);
1150 assert(user);
1151 assert(env);
1152
1153 /* We set up PAM in the parent process, then fork. The child
1154 * will then stay around until killed via PR_GET_PDEATHSIG or
1155 * systemd via the cgroup logic. It will then remove the PAM
1156 * session again. The parent process will exec() the actual
1157 * daemon. We do things this way to ensure that the main PID
1158 * of the daemon is the one we initially fork()ed. */
1159
1160 r = barrier_create(&barrier);
1161 if (r < 0)
1162 goto fail;
1163
1164 if (log_get_max_level() < LOG_DEBUG)
1165 flags |= PAM_SILENT;
1166
1167 pam_code = pam_start(name, user, &conv, &handle);
1168 if (pam_code != PAM_SUCCESS) {
1169 handle = NULL;
1170 goto fail;
1171 }
1172
1173 if (tty) {
1174 pam_code = pam_set_item(handle, PAM_TTY, tty);
1175 if (pam_code != PAM_SUCCESS)
1176 goto fail;
1177 }
1178
1179 STRV_FOREACH(nv, *env) {
1180 pam_code = pam_putenv(handle, *nv);
1181 if (pam_code != PAM_SUCCESS)
1182 goto fail;
1183 }
1184
1185 pam_code = pam_acct_mgmt(handle, flags);
1186 if (pam_code != PAM_SUCCESS)
1187 goto fail;
1188
1189 pam_code = pam_open_session(handle, flags);
1190 if (pam_code != PAM_SUCCESS)
1191 goto fail;
1192
1193 close_session = true;
1194
1195 e = pam_getenvlist(handle);
1196 if (!e) {
1197 pam_code = PAM_BUF_ERR;
1198 goto fail;
1199 }
1200
1201 /* Block SIGTERM, so that we know that it won't get lost in
1202 * the child */
1203
1204 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1205
1206 parent_pid = getpid_cached();
1207
1208 pam_pid = fork();
1209 if (pam_pid < 0) {
1210 r = -errno;
1211 goto fail;
1212 }
1213
1214 if (pam_pid == 0) {
1215 int sig, ret = EXIT_PAM;
1216
1217 /* The child's job is to reset the PAM session on
1218 * termination */
1219 barrier_set_role(&barrier, BARRIER_CHILD);
1220
1221 /* This string must fit in 10 chars (i.e. the length
1222 * of "/sbin/init"), to look pretty in /bin/ps */
1223 rename_process("(sd-pam)");
1224
1225 /* Make sure we don't keep open the passed fds in this
1226 child. We assume that otherwise only those fds are
1227 open here that have been opened by PAM. */
1228 close_many(fds, n_fds);
1229
1230 /* Drop privileges - we don't need any to pam_close_session
1231 * and this will make PR_SET_PDEATHSIG work in most cases.
1232 * If this fails, ignore the error - but expect sd-pam threads
1233 * to fail to exit normally */
1234
1235 r = maybe_setgroups(0, NULL);
1236 if (r < 0)
1237 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1238 if (setresgid(gid, gid, gid) < 0)
1239 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1240 if (setresuid(uid, uid, uid) < 0)
1241 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1242
1243 (void) ignore_signals(SIGPIPE, -1);
1244
1245 /* Wait until our parent died. This will only work if
1246 * the above setresuid() succeeds, otherwise the kernel
1247 * will not allow unprivileged parents kill their privileged
1248 * children this way. We rely on the control groups kill logic
1249 * to do the rest for us. */
1250 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1251 goto child_finish;
1252
1253 /* Tell the parent that our setup is done. This is especially
1254 * important regarding dropping privileges. Otherwise, unit
1255 * setup might race against our setresuid(2) call.
1256 *
1257 * If the parent aborted, we'll detect this below, hence ignore
1258 * return failure here. */
1259 (void) barrier_place(&barrier);
1260
1261 /* Check if our parent process might already have died? */
1262 if (getppid() == parent_pid) {
1263 sigset_t ss;
1264
1265 assert_se(sigemptyset(&ss) >= 0);
1266 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1267
1268 for (;;) {
1269 if (sigwait(&ss, &sig) < 0) {
1270 if (errno == EINTR)
1271 continue;
1272
1273 goto child_finish;
1274 }
1275
1276 assert(sig == SIGTERM);
1277 break;
1278 }
1279 }
1280
1281 /* If our parent died we'll end the session */
1282 if (getppid() != parent_pid) {
1283 pam_code = pam_close_session(handle, flags);
1284 if (pam_code != PAM_SUCCESS)
1285 goto child_finish;
1286 }
1287
1288 ret = 0;
1289
1290 child_finish:
1291 pam_end(handle, pam_code | flags);
1292 _exit(ret);
1293 }
1294
1295 barrier_set_role(&barrier, BARRIER_PARENT);
1296
1297 /* If the child was forked off successfully it will do all the
1298 * cleanups, so forget about the handle here. */
1299 handle = NULL;
1300
1301 /* Unblock SIGTERM again in the parent */
1302 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1303
1304 /* We close the log explicitly here, since the PAM modules
1305 * might have opened it, but we don't want this fd around. */
1306 closelog();
1307
1308 /* Synchronously wait for the child to initialize. We don't care for
1309 * errors as we cannot recover. However, warn loudly if it happens. */
1310 if (!barrier_place_and_sync(&barrier))
1311 log_error("PAM initialization failed");
1312
1313 strv_free(*env);
1314 *env = e;
1315
1316 return 0;
1317
1318 fail:
1319 if (pam_code != PAM_SUCCESS) {
1320 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1321 r = -EPERM; /* PAM errors do not map to errno */
1322 } else
1323 log_error_errno(r, "PAM failed: %m");
1324
1325 if (handle) {
1326 if (close_session)
1327 pam_code = pam_close_session(handle, flags);
1328
1329 pam_end(handle, pam_code | flags);
1330 }
1331
1332 strv_free(e);
1333 closelog();
1334
1335 return r;
1336 #else
1337 return 0;
1338 #endif
1339 }
1340
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of the
 * specified path. If that component is empty, the fixed name "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *name;
        size_t len;
        char *w;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
         * /bin/ps: two parentheses plus up to 8 characters of the name, plus the trailing NUL. */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* Keep the tail rather than the head: the end of the name is usually the more
                 * interesting part, since the beginning might just be "systemd-". */
                name += len - 8;
                len = 8;
        }

        w = buf;
        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1371
1372 static bool context_has_address_families(const ExecContext *c) {
1373 assert(c);
1374
1375 return c->address_families_whitelist ||
1376 !set_isempty(c->address_families);
1377 }
1378
1379 static bool context_has_syscall_filters(const ExecContext *c) {
1380 assert(c);
1381
1382 return c->syscall_whitelist ||
1383 !hashmap_isempty(c->syscall_filter);
1384 }
1385
/* Decides whether "no new privileges" (NNP) is needed for this context. That is the case if it is
 * requested explicitly, or if we lack CAP_SYS_ADMIN and any of the seccomp-backed settings is in use. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        /* Explicitly requested */
        if (c->no_new_privileges)
                return true;

        /* If we are privileged, we don't need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* We are unprivileged: any form of seccomp requires NNP. Check each setting in turn. */
        if (context_has_address_families(c) ||
            c->memory_deny_write_execute ||
            c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c))
                return true;

        if (c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c) ||
            !set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1407
1408 #if HAVE_SECCOMP
1409
1410 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1411
1412 if (is_seccomp_available())
1413 return false;
1414
1415 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1416 return true;
1417 }
1418
1419 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1420 uint32_t negative_action, default_action, action;
1421 int r;
1422
1423 assert(u);
1424 assert(c);
1425
1426 if (!context_has_syscall_filters(c))
1427 return 0;
1428
1429 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1430 return 0;
1431
1432 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
1433
1434 if (c->syscall_whitelist) {
1435 default_action = negative_action;
1436 action = SCMP_ACT_ALLOW;
1437 } else {
1438 default_action = SCMP_ACT_ALLOW;
1439 action = negative_action;
1440 }
1441
1442 if (needs_ambient_hack) {
1443 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1444 if (r < 0)
1445 return r;
1446 }
1447
1448 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
1449 }
1450
1451 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1452 assert(u);
1453 assert(c);
1454
1455 if (set_isempty(c->syscall_archs))
1456 return 0;
1457
1458 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1459 return 0;
1460
1461 return seccomp_restrict_archs(c->syscall_archs);
1462 }
1463
1464 static int apply_address_families(const Unit* u, const ExecContext *c) {
1465 assert(u);
1466 assert(c);
1467
1468 if (!context_has_address_families(c))
1469 return 0;
1470
1471 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1472 return 0;
1473
1474 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
1475 }
1476
1477 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1478 assert(u);
1479 assert(c);
1480
1481 if (!c->memory_deny_write_execute)
1482 return 0;
1483
1484 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1485 return 0;
1486
1487 return seccomp_memory_deny_write_execute();
1488 }
1489
1490 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1491 assert(u);
1492 assert(c);
1493
1494 if (!c->restrict_realtime)
1495 return 0;
1496
1497 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1498 return 0;
1499
1500 return seccomp_restrict_realtime();
1501 }
1502
1503 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1504 assert(u);
1505 assert(c);
1506
1507 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1508 * let's protect even those systems where this is left on in the kernel. */
1509
1510 if (!c->protect_kernel_tunables)
1511 return 0;
1512
1513 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1514 return 0;
1515
1516 return seccomp_protect_sysctl();
1517 }
1518
1519 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1520 assert(u);
1521 assert(c);
1522
1523 /* Turn off module syscalls on ProtectKernelModules=yes */
1524
1525 if (!c->protect_kernel_modules)
1526 return 0;
1527
1528 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1529 return 0;
1530
1531 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
1532 }
1533
1534 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1535 assert(u);
1536 assert(c);
1537
1538 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1539
1540 if (!c->private_devices)
1541 return 0;
1542
1543 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1544 return 0;
1545
1546 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
1547 }
1548
1549 static int apply_restrict_namespaces(Unit *u, const ExecContext *c) {
1550 assert(u);
1551 assert(c);
1552
1553 if (!exec_context_restrict_namespaces_set(c))
1554 return 0;
1555
1556 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1557 return 0;
1558
1559 return seccomp_restrict_namespaces(c->restrict_namespaces);
1560 }
1561
1562 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1563 unsigned long personality;
1564 int r;
1565
1566 assert(u);
1567 assert(c);
1568
1569 if (!c->lock_personality)
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "LockPersonality="))
1573 return 0;
1574
1575 personality = c->personality;
1576
1577 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1578 if (personality == PERSONALITY_INVALID) {
1579
1580 r = opinionated_personality(&personality);
1581 if (r < 0)
1582 return r;
1583 }
1584
1585 return seccomp_lock_personality(personality);
1586 }
1587
1588 #endif
1589
1590 static void do_idle_pipe_dance(int idle_pipe[4]) {
1591 assert(idle_pipe);
1592
1593 idle_pipe[1] = safe_close(idle_pipe[1]);
1594 idle_pipe[2] = safe_close(idle_pipe[2]);
1595
1596 if (idle_pipe[0] >= 0) {
1597 int r;
1598
1599 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1600
1601 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1602 ssize_t n;
1603
1604 /* Signal systemd that we are bored and want to continue. */
1605 n = write(idle_pipe[3], "x", 1);
1606 if (n > 0)
1607 /* Wait for systemd to react to the signal above. */
1608 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1609 }
1610
1611 idle_pipe[0] = safe_close(idle_pipe[0]);
1612
1613 }
1614
1615 idle_pipe[3] = safe_close(idle_pipe[3]);
1616 }
1617
1618 static int build_environment(
1619 Unit *u,
1620 const ExecContext *c,
1621 const ExecParameters *p,
1622 unsigned n_fds,
1623 const char *home,
1624 const char *username,
1625 const char *shell,
1626 dev_t journal_stream_dev,
1627 ino_t journal_stream_ino,
1628 char ***ret) {
1629
1630 _cleanup_strv_free_ char **our_env = NULL;
1631 unsigned n_env = 0;
1632 char *x;
1633
1634 assert(u);
1635 assert(c);
1636 assert(ret);
1637
1638 our_env = new0(char*, 14);
1639 if (!our_env)
1640 return -ENOMEM;
1641
1642 if (n_fds > 0) {
1643 _cleanup_free_ char *joined = NULL;
1644
1645 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1646 return -ENOMEM;
1647 our_env[n_env++] = x;
1648
1649 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1650 return -ENOMEM;
1651 our_env[n_env++] = x;
1652
1653 joined = strv_join(p->fd_names, ":");
1654 if (!joined)
1655 return -ENOMEM;
1656
1657 x = strjoin("LISTEN_FDNAMES=", joined);
1658 if (!x)
1659 return -ENOMEM;
1660 our_env[n_env++] = x;
1661 }
1662
1663 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1664 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1665 return -ENOMEM;
1666 our_env[n_env++] = x;
1667
1668 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1669 return -ENOMEM;
1670 our_env[n_env++] = x;
1671 }
1672
1673 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1674 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1675 * check the database directly. */
1676 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1677 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1678 if (!x)
1679 return -ENOMEM;
1680 our_env[n_env++] = x;
1681 }
1682
1683 if (home) {
1684 x = strappend("HOME=", home);
1685 if (!x)
1686 return -ENOMEM;
1687 our_env[n_env++] = x;
1688 }
1689
1690 if (username) {
1691 x = strappend("LOGNAME=", username);
1692 if (!x)
1693 return -ENOMEM;
1694 our_env[n_env++] = x;
1695
1696 x = strappend("USER=", username);
1697 if (!x)
1698 return -ENOMEM;
1699 our_env[n_env++] = x;
1700 }
1701
1702 if (shell) {
1703 x = strappend("SHELL=", shell);
1704 if (!x)
1705 return -ENOMEM;
1706 our_env[n_env++] = x;
1707 }
1708
1709 if (!sd_id128_is_null(u->invocation_id)) {
1710 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1711 return -ENOMEM;
1712
1713 our_env[n_env++] = x;
1714 }
1715
1716 if (exec_context_needs_term(c)) {
1717 const char *tty_path, *term = NULL;
1718
1719 tty_path = exec_context_tty_path(c);
1720
1721 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1722 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1723 * passes to PID 1 ends up all the way in the console login shown. */
1724
1725 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1726 term = getenv("TERM");
1727 if (!term)
1728 term = default_term_for_tty(tty_path);
1729
1730 x = strappend("TERM=", term);
1731 if (!x)
1732 return -ENOMEM;
1733 our_env[n_env++] = x;
1734 }
1735
1736 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1737 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1738 return -ENOMEM;
1739
1740 our_env[n_env++] = x;
1741 }
1742
1743 our_env[n_env++] = NULL;
1744 assert(n_env <= 12);
1745
1746 *ret = our_env;
1747 our_env = NULL;
1748
1749 return 0;
1750 }
1751
1752 static int build_pass_environment(const ExecContext *c, char ***ret) {
1753 _cleanup_strv_free_ char **pass_env = NULL;
1754 size_t n_env = 0, n_bufsize = 0;
1755 char **i;
1756
1757 STRV_FOREACH(i, c->pass_environment) {
1758 _cleanup_free_ char *x = NULL;
1759 char *v;
1760
1761 v = getenv(*i);
1762 if (!v)
1763 continue;
1764 x = strjoin(*i, "=", v);
1765 if (!x)
1766 return -ENOMEM;
1767
1768 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1769 return -ENOMEM;
1770
1771 pass_env[n_env++] = x;
1772 pass_env[n_env] = NULL;
1773 x = NULL;
1774 }
1775
1776 *ret = pass_env;
1777 pass_env = NULL;
1778
1779 return 0;
1780 }
1781
1782 static bool exec_needs_mount_namespace(
1783 const ExecContext *context,
1784 const ExecParameters *params,
1785 ExecRuntime *runtime) {
1786
1787 assert(context);
1788 assert(params);
1789
1790 if (context->root_image)
1791 return true;
1792
1793 if (!strv_isempty(context->read_write_paths) ||
1794 !strv_isempty(context->read_only_paths) ||
1795 !strv_isempty(context->inaccessible_paths))
1796 return true;
1797
1798 if (context->n_bind_mounts > 0 ||
1799 !strv_isempty(context->directories[EXEC_DIRECTORY_RUNTIME].paths) ||
1800 !strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
1801 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1802 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths) ||
1803 !strv_isempty(context->directories[EXEC_DIRECTORY_CONFIGURATION].paths))
1804 return true;
1805
1806 if (context->mount_flags != 0)
1807 return true;
1808
1809 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1810 return true;
1811
1812 if (context->private_devices ||
1813 context->protect_system != PROTECT_SYSTEM_NO ||
1814 context->protect_home != PROTECT_HOME_NO ||
1815 context->protect_kernel_tunables ||
1816 context->protect_kernel_modules ||
1817 context->protect_control_groups)
1818 return true;
1819
1820 if (context->mount_apivfs && (context->root_image || context->root_directory))
1821 return true;
1822
1823 return false;
1824 }
1825
1826 static int setup_private_users(uid_t uid, gid_t gid) {
1827 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1828 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1829 _cleanup_close_ int unshare_ready_fd = -1;
1830 _cleanup_(sigkill_waitp) pid_t pid = 0;
1831 uint64_t c = 1;
1832 siginfo_t si;
1833 ssize_t n;
1834 int r;
1835
1836 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1837 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1838 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1839 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1840 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1841 * continues execution normally. */
1842
1843 if (uid != 0 && uid_is_valid(uid)) {
1844 r = asprintf(&uid_map,
1845 "0 0 1\n" /* Map root → root */
1846 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1847 uid, uid);
1848 if (r < 0)
1849 return -ENOMEM;
1850 } else {
1851 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1852 if (!uid_map)
1853 return -ENOMEM;
1854 }
1855
1856 if (gid != 0 && gid_is_valid(gid)) {
1857 r = asprintf(&gid_map,
1858 "0 0 1\n" /* Map root → root */
1859 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1860 gid, gid);
1861 if (r < 0)
1862 return -ENOMEM;
1863 } else {
1864 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
1865 if (!gid_map)
1866 return -ENOMEM;
1867 }
1868
1869 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1870 * namespace. */
1871 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1872 if (unshare_ready_fd < 0)
1873 return -errno;
1874
1875 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1876 * failed. */
1877 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1878 return -errno;
1879
1880 pid = fork();
1881 if (pid < 0)
1882 return -errno;
1883
1884 if (pid == 0) {
1885 _cleanup_close_ int fd = -1;
1886 const char *a;
1887 pid_t ppid;
1888
1889 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1890 * here, after the parent opened its own user namespace. */
1891
1892 ppid = getppid();
1893 errno_pipe[0] = safe_close(errno_pipe[0]);
1894
1895 /* Wait until the parent unshared the user namespace */
1896 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1897 r = -errno;
1898 goto child_fail;
1899 }
1900
1901 /* Disable the setgroups() system call in the child user namespace, for good. */
1902 a = procfs_file_alloca(ppid, "setgroups");
1903 fd = open(a, O_WRONLY|O_CLOEXEC);
1904 if (fd < 0) {
1905 if (errno != ENOENT) {
1906 r = -errno;
1907 goto child_fail;
1908 }
1909
1910 /* If the file is missing the kernel is too old, let's continue anyway. */
1911 } else {
1912 if (write(fd, "deny\n", 5) < 0) {
1913 r = -errno;
1914 goto child_fail;
1915 }
1916
1917 fd = safe_close(fd);
1918 }
1919
1920 /* First write the GID map */
1921 a = procfs_file_alloca(ppid, "gid_map");
1922 fd = open(a, O_WRONLY|O_CLOEXEC);
1923 if (fd < 0) {
1924 r = -errno;
1925 goto child_fail;
1926 }
1927 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1928 r = -errno;
1929 goto child_fail;
1930 }
1931 fd = safe_close(fd);
1932
1933 /* The write the UID map */
1934 a = procfs_file_alloca(ppid, "uid_map");
1935 fd = open(a, O_WRONLY|O_CLOEXEC);
1936 if (fd < 0) {
1937 r = -errno;
1938 goto child_fail;
1939 }
1940 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1941 r = -errno;
1942 goto child_fail;
1943 }
1944
1945 _exit(EXIT_SUCCESS);
1946
1947 child_fail:
1948 (void) write(errno_pipe[1], &r, sizeof(r));
1949 _exit(EXIT_FAILURE);
1950 }
1951
1952 errno_pipe[1] = safe_close(errno_pipe[1]);
1953
1954 if (unshare(CLONE_NEWUSER) < 0)
1955 return -errno;
1956
1957 /* Let the child know that the namespace is ready now */
1958 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1959 return -errno;
1960
1961 /* Try to read an error code from the child */
1962 n = read(errno_pipe[0], &r, sizeof(r));
1963 if (n < 0)
1964 return -errno;
1965 if (n == sizeof(r)) { /* an error code was sent to us */
1966 if (r < 0)
1967 return r;
1968 return -EIO;
1969 }
1970 if (n != 0) /* on success we should have read 0 bytes */
1971 return -EIO;
1972
1973 r = wait_for_terminate(pid, &si);
1974 if (r < 0)
1975 return r;
1976 pid = 0;
1977
1978 /* If something strange happened with the child, let's consider this fatal, too */
1979 if (si.si_code != CLD_EXITED || si.si_status != 0)
1980 return -EIO;
1981
1982 return 0;
1983 }
1984
1985 static int setup_exec_directory(
1986 const ExecContext *context,
1987 const ExecParameters *params,
1988 uid_t uid,
1989 gid_t gid,
1990 ExecDirectoryType type,
1991 int *exit_status) {
1992
1993 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
1994 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1995 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1996 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1997 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1998 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1999 };
2000 char **rt;
2001 int r;
2002
2003 assert(context);
2004 assert(params);
2005 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2006 assert(exit_status);
2007
2008 if (!params->prefix[type])
2009 return 0;
2010
2011 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2012 if (!uid_is_valid(uid))
2013 uid = 0;
2014 if (!gid_is_valid(gid))
2015 gid = 0;
2016 }
2017
2018 STRV_FOREACH(rt, context->directories[type].paths) {
2019 _cleanup_free_ char *p = NULL, *pp = NULL;
2020 const char *effective;
2021
2022 p = strjoin(params->prefix[type], "/", *rt);
2023 if (!p) {
2024 r = -ENOMEM;
2025 goto fail;
2026 }
2027
2028 r = mkdir_parents_label(p, 0755);
2029 if (r < 0)
2030 goto fail;
2031
2032 if (context->dynamic_user &&
2033 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2034 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2035
2036 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2037 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2038 * whose UID is later on reused. To lock this down we use the same trick used by container
2039 * managers to prohibit host users to get access to files of the same UID in containers: we
2040 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2041 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2042 * to make this directory permeable for the service itself.
2043 *
2044 * Specifically: for a service which wants a special directory "foo/" we first create a
2045 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2046 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2047 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2048 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2049 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2050 * disabling the access boundary for the service and making sure it only gets access to the
2051 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2052 *
2053 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
2054 * owned by the service itself.
2055 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2056 * files or sockets with other services. */
2057
2058 private_root = strjoin(params->prefix[type], "/private");
2059 if (!private_root) {
2060 r = -ENOMEM;
2061 goto fail;
2062 }
2063
2064 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2065 r = mkdir_safe_label(private_root, 0700, 0, 0, false);
2066 if (r < 0)
2067 goto fail;
2068
2069 pp = strjoin(private_root, "/", *rt);
2070 if (!pp) {
2071 r = -ENOMEM;
2072 goto fail;
2073 }
2074
2075 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2076 r = mkdir_parents_label(pp, 0755);
2077 if (r < 0)
2078 goto fail;
2079
2080 /* Finally, create the actual directory for the service */
2081 r = mkdir_label(pp, context->directories[type].mode);
2082 if (r < 0 && r != -EEXIST)
2083 goto fail;
2084
2085 parent = dirname_malloc(p);
2086 if (!parent) {
2087 r = -ENOMEM;
2088 goto fail;
2089 }
2090
2091 r = path_make_relative(parent, pp, &relative);
2092 if (r < 0)
2093 goto fail;
2094
2095 /* And link it up from the original place */
2096 r = symlink_idempotent(relative, p);
2097 if (r < 0)
2098 goto fail;
2099
2100 effective = pp;
2101
2102 } else {
2103 r = mkdir_label(p, context->directories[type].mode);
2104 if (r < 0 && r != -EEXIST)
2105 goto fail;
2106
2107 effective = p;
2108 }
2109
2110 /* First lock down the access mode */
2111 if (chmod(effective, context->directories[type].mode) < 0) {
2112 r = -errno;
2113 goto fail;
2114 }
2115
2116 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2117 * a service, and shall not be writable. */
2118 if (type == EXEC_DIRECTORY_CONFIGURATION)
2119 continue;
2120
2121 /* Then, change the ownership of the whole tree, if necessary */
2122 r = path_chown_recursive(effective, uid, gid);
2123 if (r < 0)
2124 goto fail;
2125 }
2126
2127 return 0;
2128
2129 fail:
2130 *exit_status = exit_status_table[type];
2131 return r;
2132 }
2133
2134 static int setup_smack(
2135 const ExecContext *context,
2136 const ExecCommand *command) {
2137
2138 int r;
2139
2140 assert(context);
2141 assert(command);
2142
2143 if (context->smack_process_label) {
2144 r = mac_smack_apply_pid(0, context->smack_process_label);
2145 if (r < 0)
2146 return r;
2147 }
2148 #ifdef SMACK_DEFAULT_PROCESS_LABEL
2149 else {
2150 _cleanup_free_ char *exec_label = NULL;
2151
2152 r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
2153 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
2154 return r;
2155
2156 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
2157 if (r < 0)
2158 return r;
2159 }
2160 #endif
2161
2162 return 0;
2163 }
2164
2165 static int compile_bind_mounts(
2166 const ExecContext *context,
2167 const ExecParameters *params,
2168 BindMount **ret_bind_mounts,
2169 unsigned *ret_n_bind_mounts,
2170 char ***ret_empty_directories) {
2171
2172 _cleanup_strv_free_ char **empty_directories = NULL;
2173 BindMount *bind_mounts;
2174 unsigned n, h = 0, i;
2175 ExecDirectoryType t;
2176 int r;
2177
2178 assert(context);
2179 assert(params);
2180 assert(ret_bind_mounts);
2181 assert(ret_n_bind_mounts);
2182 assert(ret_empty_directories);
2183
2184 n = context->n_bind_mounts;
2185 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2186 if (!params->prefix[t])
2187 continue;
2188
2189 n += strv_length(context->directories[t].paths);
2190 }
2191
2192 if (n <= 0) {
2193 *ret_bind_mounts = NULL;
2194 *ret_n_bind_mounts = 0;
2195 *ret_empty_directories = NULL;
2196 return 0;
2197 }
2198
2199 bind_mounts = new(BindMount, n);
2200 if (!bind_mounts)
2201 return -ENOMEM;
2202
2203 for (i = 0; i < context->n_bind_mounts; i++) {
2204 BindMount *item = context->bind_mounts + i;
2205 char *s, *d;
2206
2207 s = strdup(item->source);
2208 if (!s) {
2209 r = -ENOMEM;
2210 goto finish;
2211 }
2212
2213 d = strdup(item->destination);
2214 if (!d) {
2215 free(s);
2216 r = -ENOMEM;
2217 goto finish;
2218 }
2219
2220 bind_mounts[h++] = (BindMount) {
2221 .source = s,
2222 .destination = d,
2223 .read_only = item->read_only,
2224 .recursive = item->recursive,
2225 .ignore_enoent = item->ignore_enoent,
2226 };
2227 }
2228
2229 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2230 char **suffix;
2231
2232 if (!params->prefix[t])
2233 continue;
2234
2235 if (strv_isempty(context->directories[t].paths))
2236 continue;
2237
2238 if (context->dynamic_user &&
2239 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
2240 char *private_root;
2241
2242 /* So this is for a dynamic user, and we need to make sure the process can access its own
2243 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2244 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2245
2246 private_root = strjoin(params->prefix[t], "/private");
2247 if (!private_root) {
2248 r = -ENOMEM;
2249 goto finish;
2250 }
2251
2252 r = strv_consume(&empty_directories, private_root);
2253 if (r < 0) {
2254 r = -ENOMEM;
2255 goto finish;
2256 }
2257 }
2258
2259 STRV_FOREACH(suffix, context->directories[t].paths) {
2260 char *s, *d;
2261
2262 if (context->dynamic_user &&
2263 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
2264 s = strjoin(params->prefix[t], "/private/", *suffix);
2265 else
2266 s = strjoin(params->prefix[t], "/", *suffix);
2267 if (!s) {
2268 r = -ENOMEM;
2269 goto finish;
2270 }
2271
2272 d = strdup(s);
2273 if (!d) {
2274 free(s);
2275 r = -ENOMEM;
2276 goto finish;
2277 }
2278
2279 bind_mounts[h++] = (BindMount) {
2280 .source = s,
2281 .destination = d,
2282 .read_only = false,
2283 .recursive = true,
2284 .ignore_enoent = false,
2285 };
2286 }
2287 }
2288
2289 assert(h == n);
2290
2291 *ret_bind_mounts = bind_mounts;
2292 *ret_n_bind_mounts = n;
2293 *ret_empty_directories = empty_directories;
2294
2295 empty_directories = NULL;
2296
2297 return (int) n;
2298
2299 finish:
2300 bind_mount_free_many(bind_mounts, h);
2301 return r;
2302 }
2303
2304 static int apply_mount_namespace(
2305 Unit *u,
2306 ExecCommand *command,
2307 const ExecContext *context,
2308 const ExecParameters *params,
2309 ExecRuntime *runtime) {
2310
2311 _cleanup_strv_free_ char **empty_directories = NULL;
2312 char *tmp = NULL, *var = NULL;
2313 const char *root_dir = NULL, *root_image = NULL;
2314 NamespaceInfo ns_info = {
2315 .ignore_protect_paths = false,
2316 .private_dev = context->private_devices,
2317 .protect_control_groups = context->protect_control_groups,
2318 .protect_kernel_tunables = context->protect_kernel_tunables,
2319 .protect_kernel_modules = context->protect_kernel_modules,
2320 .mount_apivfs = context->mount_apivfs,
2321 };
2322 bool needs_sandboxing;
2323 BindMount *bind_mounts = NULL;
2324 unsigned n_bind_mounts = 0;
2325 int r;
2326
2327 assert(context);
2328
2329 /* The runtime struct only contains the parent of the private /tmp,
2330 * which is non-accessible to world users. Inside of it there's a /tmp
2331 * that is sticky, and that's the one we want to use here. */
2332
2333 if (context->private_tmp && runtime) {
2334 if (runtime->tmp_dir)
2335 tmp = strjoina(runtime->tmp_dir, "/tmp");
2336 if (runtime->var_tmp_dir)
2337 var = strjoina(runtime->var_tmp_dir, "/tmp");
2338 }
2339
2340 if (params->flags & EXEC_APPLY_CHROOT) {
2341 root_image = context->root_image;
2342
2343 if (!root_image)
2344 root_dir = context->root_directory;
2345 }
2346
2347 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2348 if (r < 0)
2349 return r;
2350
2351 /*
2352 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2353 * sandbox info, otherwise enforce it, don't ignore protected paths and
2354 * fail if we are enable to apply the sandbox inside the mount namespace.
2355 */
2356 if (!context->dynamic_user && root_dir)
2357 ns_info.ignore_protect_paths = true;
2358
2359 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
2360
2361 r = setup_namespace(root_dir, root_image,
2362 &ns_info, context->read_write_paths,
2363 needs_sandboxing ? context->read_only_paths : NULL,
2364 needs_sandboxing ? context->inaccessible_paths : NULL,
2365 empty_directories,
2366 bind_mounts,
2367 n_bind_mounts,
2368 tmp,
2369 var,
2370 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2371 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
2372 context->mount_flags,
2373 DISSECT_IMAGE_DISCARD_ON_LOOP);
2374
2375 bind_mount_free_many(bind_mounts, n_bind_mounts);
2376
2377 /* If we couldn't set up the namespace this is probably due to a
2378 * missing capability. In this case, silently proceed. */
2379 if (IN_SET(r, -EPERM, -EACCES)) {
2380 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
2381 return 0;
2382 }
2383
2384 return r;
2385 }
2386
2387 static int apply_working_directory(
2388 const ExecContext *context,
2389 const ExecParameters *params,
2390 const char *home,
2391 const bool needs_mount_ns,
2392 int *exit_status) {
2393
2394 const char *d, *wd;
2395
2396 assert(context);
2397 assert(exit_status);
2398
2399 if (context->working_directory_home) {
2400
2401 if (!home) {
2402 *exit_status = EXIT_CHDIR;
2403 return -ENXIO;
2404 }
2405
2406 wd = home;
2407
2408 } else if (context->working_directory)
2409 wd = context->working_directory;
2410 else
2411 wd = "/";
2412
2413 if (params->flags & EXEC_APPLY_CHROOT) {
2414 if (!needs_mount_ns && context->root_directory)
2415 if (chroot(context->root_directory) < 0) {
2416 *exit_status = EXIT_CHROOT;
2417 return -errno;
2418 }
2419
2420 d = wd;
2421 } else
2422 d = prefix_roota(context->root_directory, wd);
2423
2424 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2425 *exit_status = EXIT_CHDIR;
2426 return -errno;
2427 }
2428
2429 return 0;
2430 }
2431
2432 static int setup_keyring(
2433 Unit *u,
2434 const ExecContext *context,
2435 const ExecParameters *p,
2436 uid_t uid, gid_t gid) {
2437
2438 key_serial_t keyring;
2439 int r;
2440
2441 assert(u);
2442 assert(context);
2443 assert(p);
2444
2445 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2446 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2447 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2448 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2449 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2450 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2451
2452 if (!(p->flags & EXEC_NEW_KEYRING))
2453 return 0;
2454
2455 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2456 return 0;
2457
2458 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2459 if (keyring == -1) {
2460 if (errno == ENOSYS)
2461 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
2462 else if (IN_SET(errno, EACCES, EPERM))
2463 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
2464 else if (errno == EDQUOT)
2465 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
2466 else
2467 return log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
2468
2469 return 0;
2470 }
2471
2472 /* Populate they keyring with the invocation ID by default. */
2473 if (!sd_id128_is_null(u->invocation_id)) {
2474 key_serial_t key;
2475
2476 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2477 if (key == -1)
2478 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
2479 else {
2480 if (keyctl(KEYCTL_SETPERM, key,
2481 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2482 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
2483 return log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
2484 }
2485 }
2486
2487 /* And now, make the keyring owned by the service's user */
2488 if (uid_is_valid(uid) || gid_is_valid(gid))
2489 if (keyctl(KEYCTL_CHOWN, keyring, uid, gid, 0) < 0)
2490 return log_unit_error_errno(u, errno, "Failed to change ownership of session keyring: %m");
2491
2492 /* When requested link the user keyring into the session keyring. */
2493 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2494 uid_t saved_uid;
2495 gid_t saved_gid;
2496
2497 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things
2498 * set up properly by the kernel. If we don't do that then we can't create it atomically, and that
2499 * sucks for parallel execution. This mimics what pam_keyinit does, too.*/
2500
2501 saved_uid = getuid();
2502 saved_gid = getgid();
2503
2504 if (gid_is_valid(gid) && gid != saved_gid) {
2505 if (setregid(gid, -1) < 0)
2506 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2507 }
2508
2509 if (uid_is_valid(uid) && uid != saved_uid) {
2510 if (setreuid(uid, -1) < 0) {
2511 (void) setregid(saved_gid, -1);
2512 return log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2513 }
2514 }
2515
2516 if (keyctl(KEYCTL_LINK,
2517 KEY_SPEC_USER_KEYRING,
2518 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2519
2520 r = -errno;
2521
2522 (void) setreuid(saved_uid, -1);
2523 (void) setregid(saved_gid, -1);
2524
2525 return log_unit_error_errno(u, r, "Failed to link user keyring into session keyring: %m");
2526 }
2527
2528 if (uid_is_valid(uid) && uid != saved_uid) {
2529 if (setreuid(saved_uid, -1) < 0) {
2530 (void) setregid(saved_gid, -1);
2531 return log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2532 }
2533 }
2534
2535 if (gid_is_valid(gid) && gid != saved_gid) {
2536 if (setregid(saved_gid, -1) < 0)
2537 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2538 }
2539 }
2540
2541 return 0;
2542 }
2543
/* Appends each valid (i.e. non-negative) fd of the specified pair to array[], advancing the element counter
 * *n for every fd stored. A NULL pair is silently ignored. The caller must make sure array[] has room for up
 * to two more entries. */
static void append_socket_pair(int *array, unsigned *n, int pair[2]) {
        unsigned k;

        assert(array);
        assert(n);

        if (pair == NULL)
                return;

        for (k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
2556
2557 static int close_remaining_fds(
2558 const ExecParameters *params,
2559 ExecRuntime *runtime,
2560 DynamicCreds *dcreds,
2561 int user_lookup_fd,
2562 int socket_fd,
2563 int *fds, unsigned n_fds) {
2564
2565 unsigned n_dont_close = 0;
2566 int dont_close[n_fds + 12];
2567
2568 assert(params);
2569
2570 if (params->stdin_fd >= 0)
2571 dont_close[n_dont_close++] = params->stdin_fd;
2572 if (params->stdout_fd >= 0)
2573 dont_close[n_dont_close++] = params->stdout_fd;
2574 if (params->stderr_fd >= 0)
2575 dont_close[n_dont_close++] = params->stderr_fd;
2576
2577 if (socket_fd >= 0)
2578 dont_close[n_dont_close++] = socket_fd;
2579 if (n_fds > 0) {
2580 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2581 n_dont_close += n_fds;
2582 }
2583
2584 if (runtime)
2585 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2586
2587 if (dcreds) {
2588 if (dcreds->user)
2589 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2590 if (dcreds->group)
2591 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
2592 }
2593
2594 if (user_lookup_fd >= 0)
2595 dont_close[n_dont_close++] = user_lookup_fd;
2596
2597 return close_all_fds(dont_close, n_dont_close);
2598 }
2599
2600 static int send_user_lookup(
2601 Unit *unit,
2602 int user_lookup_fd,
2603 uid_t uid,
2604 gid_t gid) {
2605
2606 assert(unit);
2607
2608 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2609 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2610 * specified. */
2611
2612 if (user_lookup_fd < 0)
2613 return 0;
2614
2615 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2616 return 0;
2617
2618 if (writev(user_lookup_fd,
2619 (struct iovec[]) {
2620 IOVEC_INIT(&uid, sizeof(uid)),
2621 IOVEC_INIT(&gid, sizeof(gid)),
2622 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
2623 return -errno;
2624
2625 return 0;
2626 }
2627
2628 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2629 int r;
2630
2631 assert(c);
2632 assert(home);
2633 assert(buf);
2634
2635 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2636
2637 if (*home)
2638 return 0;
2639
2640 if (!c->working_directory_home)
2641 return 0;
2642
2643 if (uid == 0) {
2644 /* Hardcode /root as home directory for UID 0 */
2645 *home = "/root";
2646 return 1;
2647 }
2648
2649 r = get_home_dir(buf);
2650 if (r < 0)
2651 return r;
2652
2653 *home = *buf;
2654 return 1;
2655 }
2656
2657 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2658 _cleanup_strv_free_ char ** list = NULL;
2659 ExecDirectoryType t;
2660 int r;
2661
2662 assert(c);
2663 assert(p);
2664 assert(ret);
2665
2666 assert(c->dynamic_user);
2667
2668 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2669 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2670 * directories. */
2671
2672 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2673 char **i;
2674
2675 if (t == EXEC_DIRECTORY_CONFIGURATION)
2676 continue;
2677
2678 if (!p->prefix[t])
2679 continue;
2680
2681 STRV_FOREACH(i, c->directories[t].paths) {
2682 char *e;
2683
2684 if (t == EXEC_DIRECTORY_RUNTIME)
2685 e = strjoin(p->prefix[t], "/", *i);
2686 else
2687 e = strjoin(p->prefix[t], "/private/", *i);
2688 if (!e)
2689 return -ENOMEM;
2690
2691 r = strv_consume(&list, e);
2692 if (r < 0)
2693 return r;
2694 }
2695 }
2696
2697 *ret = list;
2698 list = NULL;
2699
2700 return 0;
2701 }
2702
2703 static int exec_child(
2704 Unit *unit,
2705 ExecCommand *command,
2706 const ExecContext *context,
2707 const ExecParameters *params,
2708 ExecRuntime *runtime,
2709 DynamicCreds *dcreds,
2710 char **argv,
2711 int socket_fd,
2712 int named_iofds[3],
2713 int *fds,
2714 unsigned n_storage_fds,
2715 unsigned n_socket_fds,
2716 char **files_env,
2717 int user_lookup_fd,
2718 int *exit_status) {
2719
2720 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
2721 _cleanup_free_ char *mac_selinux_context_net = NULL, *home_buffer = NULL;
2722 _cleanup_free_ gid_t *supplementary_gids = NULL;
2723 const char *username = NULL, *groupname = NULL;
2724 const char *home = NULL, *shell = NULL;
2725 dev_t journal_stream_dev = 0;
2726 ino_t journal_stream_ino = 0;
2727 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2728 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2729 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2730 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
2731 #if HAVE_SELINUX
2732 bool use_selinux = false;
2733 #endif
2734 #if ENABLE_SMACK
2735 bool use_smack = false;
2736 #endif
2737 #if HAVE_APPARMOR
2738 bool use_apparmor = false;
2739 #endif
2740 uid_t uid = UID_INVALID;
2741 gid_t gid = GID_INVALID;
2742 int i, r, ngids = 0;
2743 unsigned n_fds;
2744 ExecDirectoryType dt;
2745 int secure_bits;
2746
2747 assert(unit);
2748 assert(command);
2749 assert(context);
2750 assert(params);
2751 assert(exit_status);
2752
2753 rename_process_from_path(command->path);
2754
2755 /* We reset exactly these signals, since they are the
2756 * only ones we set to SIG_IGN in the main daemon. All
2757 * others we leave untouched because we set them to
2758 * SIG_DFL or a valid handler initially, both of which
2759 * will be demoted to SIG_DFL. */
2760 (void) default_signals(SIGNALS_CRASH_HANDLER,
2761 SIGNALS_IGNORE, -1);
2762
2763 if (context->ignore_sigpipe)
2764 (void) ignore_signals(SIGPIPE, -1);
2765
2766 r = reset_signal_mask();
2767 if (r < 0) {
2768 *exit_status = EXIT_SIGNAL_MASK;
2769 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
2770 }
2771
2772 if (params->idle_pipe)
2773 do_idle_pipe_dance(params->idle_pipe);
2774
2775 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2776 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2777 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2778 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
2779
2780 log_forget_fds();
2781 log_set_open_when_needed(true);
2782
2783 /* In case anything used libc syslog(), close this here, too */
2784 closelog();
2785
2786 n_fds = n_storage_fds + n_socket_fds;
2787 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
2788 if (r < 0) {
2789 *exit_status = EXIT_FDS;
2790 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
2791 }
2792
2793 if (!context->same_pgrp)
2794 if (setsid() < 0) {
2795 *exit_status = EXIT_SETSID;
2796 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
2797 }
2798
2799 exec_context_tty_reset(context, params);
2800
2801 if (unit_shall_confirm_spawn(unit)) {
2802 const char *vc = params->confirm_spawn;
2803 _cleanup_free_ char *cmdline = NULL;
2804
2805 cmdline = exec_command_line(argv);
2806 if (!cmdline) {
2807 *exit_status = EXIT_MEMORY;
2808 return log_oom();
2809 }
2810
2811 r = ask_for_confirmation(vc, unit, cmdline);
2812 if (r != CONFIRM_EXECUTE) {
2813 if (r == CONFIRM_PRETEND_SUCCESS) {
2814 *exit_status = EXIT_SUCCESS;
2815 return 0;
2816 }
2817 *exit_status = EXIT_CONFIRM;
2818 log_unit_error(unit, "Execution cancelled by the user");
2819 return -ECANCELED;
2820 }
2821 }
2822
2823 if (context->dynamic_user && dcreds) {
2824 _cleanup_strv_free_ char **suggested_paths = NULL;
2825
2826 /* Make sure we bypass our own NSS module for any NSS checks */
2827 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2828 *exit_status = EXIT_USER;
2829 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
2830 }
2831
2832 r = compile_suggested_paths(context, params, &suggested_paths);
2833 if (r < 0) {
2834 *exit_status = EXIT_MEMORY;
2835 return log_oom();
2836 }
2837
2838 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
2839 if (r < 0) {
2840 *exit_status = EXIT_USER;
2841 if (r == -EILSEQ) {
2842 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2843 return -EOPNOTSUPP;
2844 }
2845 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
2846 }
2847
2848 if (!uid_is_valid(uid)) {
2849 *exit_status = EXIT_USER;
2850 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
2851 return -ESRCH;
2852 }
2853
2854 if (!gid_is_valid(gid)) {
2855 *exit_status = EXIT_USER;
2856 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
2857 return -ESRCH;
2858 }
2859
2860 if (dcreds->user)
2861 username = dcreds->user->name;
2862
2863 } else {
2864 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2865 if (r < 0) {
2866 *exit_status = EXIT_USER;
2867 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
2868 }
2869
2870 r = get_fixed_group(context, &groupname, &gid);
2871 if (r < 0) {
2872 *exit_status = EXIT_GROUP;
2873 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
2874 }
2875 }
2876
2877 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2878 r = get_supplementary_groups(context, username, groupname, gid,
2879 &supplementary_gids, &ngids);
2880 if (r < 0) {
2881 *exit_status = EXIT_GROUP;
2882 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
2883 }
2884
2885 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2886 if (r < 0) {
2887 *exit_status = EXIT_USER;
2888 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
2889 }
2890
2891 user_lookup_fd = safe_close(user_lookup_fd);
2892
2893 r = acquire_home(context, uid, &home, &home_buffer);
2894 if (r < 0) {
2895 *exit_status = EXIT_CHDIR;
2896 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
2897 }
2898
2899 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2900 * must make sure to drop O_NONBLOCK */
2901 if (socket_fd >= 0)
2902 (void) fd_nonblock(socket_fd, false);
2903
2904 r = setup_input(context, params, socket_fd, named_iofds);
2905 if (r < 0) {
2906 *exit_status = EXIT_STDIN;
2907 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
2908 }
2909
2910 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2911 if (r < 0) {
2912 *exit_status = EXIT_STDOUT;
2913 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
2914 }
2915
2916 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
2917 if (r < 0) {
2918 *exit_status = EXIT_STDERR;
2919 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
2920 }
2921
2922 if (params->cgroup_path) {
2923 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2924 if (r < 0) {
2925 *exit_status = EXIT_CGROUP;
2926 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
2927 }
2928 }
2929
2930 if (context->oom_score_adjust_set) {
2931 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
2932
2933 /* When we can't make this change due to EPERM, then
2934 * let's silently skip over it. User namespaces
2935 * prohibit write access to this file, and we
2936 * shouldn't trip up over that. */
2937
2938 sprintf(t, "%i", context->oom_score_adjust);
2939 r = write_string_file("/proc/self/oom_score_adj", t, 0);
2940 if (IN_SET(r, -EPERM, -EACCES))
2941 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
2942 else if (r < 0) {
2943 *exit_status = EXIT_OOM_ADJUST;
2944 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
2945 }
2946 }
2947
2948 if (context->nice_set)
2949 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
2950 *exit_status = EXIT_NICE;
2951 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
2952 }
2953
2954 if (context->cpu_sched_set) {
2955 struct sched_param param = {
2956 .sched_priority = context->cpu_sched_priority,
2957 };
2958
2959 r = sched_setscheduler(0,
2960 context->cpu_sched_policy |
2961 (context->cpu_sched_reset_on_fork ?
2962 SCHED_RESET_ON_FORK : 0),
2963 &param);
2964 if (r < 0) {
2965 *exit_status = EXIT_SETSCHEDULER;
2966 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
2967 }
2968 }
2969
2970 if (context->cpuset)
2971 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
2972 *exit_status = EXIT_CPUAFFINITY;
2973 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
2974 }
2975
2976 if (context->ioprio_set)
2977 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
2978 *exit_status = EXIT_IOPRIO;
2979 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
2980 }
2981
2982 if (context->timer_slack_nsec != NSEC_INFINITY)
2983 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
2984 *exit_status = EXIT_TIMERSLACK;
2985 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
2986 }
2987
2988 if (context->personality != PERSONALITY_INVALID) {
2989 r = safe_personality(context->personality);
2990 if (r < 0) {
2991 *exit_status = EXIT_PERSONALITY;
2992 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
2993 }
2994 }
2995
2996 if (context->utmp_id)
2997 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
2998 context->tty_path,
2999 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3000 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3001 USER_PROCESS,
3002 username);
3003
3004 if (context->user) {
3005 r = chown_terminal(STDIN_FILENO, uid);
3006 if (r < 0) {
3007 *exit_status = EXIT_STDIN;
3008 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
3009 }
3010 }
3011
3012 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3013 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3014 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3015 * touch a single hierarchy too. */
3016 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
3017 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
3018 if (r < 0) {
3019 *exit_status = EXIT_CGROUP;
3020 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
3021 }
3022 }
3023
3024 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3025 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
3026 if (r < 0)
3027 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
3028 }
3029
3030 r = build_environment(
3031 unit,
3032 context,
3033 params,
3034 n_fds,
3035 home,
3036 username,
3037 shell,
3038 journal_stream_dev,
3039 journal_stream_ino,
3040 &our_env);
3041 if (r < 0) {
3042 *exit_status = EXIT_MEMORY;
3043 return log_oom();
3044 }
3045
3046 r = build_pass_environment(context, &pass_env);
3047 if (r < 0) {
3048 *exit_status = EXIT_MEMORY;
3049 return log_oom();
3050 }
3051
3052 accum_env = strv_env_merge(5,
3053 params->environment,
3054 our_env,
3055 pass_env,
3056 context->environment,
3057 files_env,
3058 NULL);
3059 if (!accum_env) {
3060 *exit_status = EXIT_MEMORY;
3061 return log_oom();
3062 }
3063 accum_env = strv_env_clean(accum_env);
3064
3065 (void) umask(context->umask);
3066
3067 r = setup_keyring(unit, context, params, uid, gid);
3068 if (r < 0) {
3069 *exit_status = EXIT_KEYRING;
3070 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
3071 }
3072
3073 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
3074 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3075
3076 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3077 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
3078
3079 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3080 if (needs_ambient_hack)
3081 needs_setuid = false;
3082 else
3083 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3084
3085 if (needs_sandboxing) {
3086 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3087 * present. The actual MAC context application will happen later, as late as possible, to avoid
3088 * impacting our own code paths. */
3089
3090 #if HAVE_SELINUX
3091 use_selinux = mac_selinux_use();
3092 #endif
3093 #if ENABLE_SMACK
3094 use_smack = mac_smack_use();
3095 #endif
3096 #if HAVE_APPARMOR
3097 use_apparmor = mac_apparmor_use();
3098 #endif
3099 }
3100
3101 if (needs_setuid) {
3102 if (context->pam_name && username) {
3103 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3104 if (r < 0) {
3105 *exit_status = EXIT_PAM;
3106 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
3107 }
3108 }
3109 }
3110
3111 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
3112 if (ns_type_supported(NAMESPACE_NET)) {
3113 r = setup_netns(runtime->netns_storage_socket);
3114 if (r < 0) {
3115 *exit_status = EXIT_NETWORK;
3116 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3117 }
3118 } else
3119 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
3120 }
3121
3122 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
3123 if (needs_mount_namespace) {
3124 r = apply_mount_namespace(unit, command, context, params, runtime);
3125 if (r < 0) {
3126 *exit_status = EXIT_NAMESPACE;
3127 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3128 }
3129 }
3130
3131 /* Apply just after mount namespace setup */
3132 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
3133 if (r < 0)
3134 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3135
3136 /* Drop groups as early as possible */
3137 if (needs_setuid) {
3138 r = enforce_groups(gid, supplementary_gids, ngids);
3139 if (r < 0) {
3140 *exit_status = EXIT_GROUP;
3141 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
3142 }
3143 }
3144
3145 if (needs_sandboxing) {
3146 #if HAVE_SELINUX
3147 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
3148 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3149 if (r < 0) {
3150 *exit_status = EXIT_SELINUX_CONTEXT;
3151 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
3152 }
3153 }
3154 #endif
3155
3156 if (context->private_users) {
3157 r = setup_private_users(uid, gid);
3158 if (r < 0) {
3159 *exit_status = EXIT_USER;
3160 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
3161 }
3162 }
3163 }
3164
3165 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3166 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3167 * was needed to upload the policy and can now be closed as well. */
3168 r = close_all_fds(fds, n_fds);
3169 if (r >= 0)
3170 r = shift_fds(fds, n_fds);
3171 if (r >= 0)
3172 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
3173 if (r < 0) {
3174 *exit_status = EXIT_FDS;
3175 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
3176 }
3177
3178 secure_bits = context->secure_bits;
3179
3180 if (needs_sandboxing) {
3181 uint64_t bset;
3182
3183 for (i = 0; i < _RLIMIT_MAX; i++) {
3184
3185 if (!context->rlimit[i])
3186 continue;
3187
3188 r = setrlimit_closest(i, context->rlimit[i]);
3189 if (r < 0) {
3190 *exit_status = EXIT_LIMITS;
3191 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
3192 }
3193 }
3194
3195 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3196 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3197 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3198 *exit_status = EXIT_LIMITS;
3199 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
3200 }
3201 }
3202
3203 #if ENABLE_SMACK
3204 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3205 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3206 if (use_smack) {
3207 r = setup_smack(context, command);
3208 if (r < 0) {
3209 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3210 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3211 }
3212 }
3213 #endif
3214
3215 bset = context->capability_bounding_set;
3216 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3217 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3218 * instead of us doing that */
3219 if (needs_ambient_hack)
3220 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3221 (UINT64_C(1) << CAP_SETUID) |
3222 (UINT64_C(1) << CAP_SETGID);
3223
3224 if (!cap_test_all(bset)) {
3225 r = capability_bounding_set_drop(bset, false);
3226 if (r < 0) {
3227 *exit_status = EXIT_CAPABILITIES;
3228 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3229 }
3230 }
3231
3232 /* This is done before enforce_user, but ambient set
3233 * does not survive over setresuid() if keep_caps is not set. */
3234 if (!needs_ambient_hack &&
3235 context->capability_ambient_set != 0) {
3236 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3237 if (r < 0) {
3238 *exit_status = EXIT_CAPABILITIES;
3239 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
3240 }
3241 }
3242 }
3243
3244 if (needs_setuid) {
3245 if (context->user) {
3246 r = enforce_user(context, uid);
3247 if (r < 0) {
3248 *exit_status = EXIT_USER;
3249 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
3250 }
3251
3252 if (!needs_ambient_hack &&
3253 context->capability_ambient_set != 0) {
3254
3255 /* Fix the ambient capabilities after user change. */
3256 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3257 if (r < 0) {
3258 *exit_status = EXIT_CAPABILITIES;
3259 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
3260 }
3261
3262 /* If we were asked to change user and ambient capabilities
3263 * were requested, we had to add keep-caps to the securebits
3264 * so that we would maintain the inherited capability set
3265 * through the setresuid(). Make sure that the bit is added
3266 * also to the context secure_bits so that we don't try to
3267 * drop the bit away next. */
3268
3269 secure_bits |= 1<<SECURE_KEEP_CAPS;
3270 }
3271 }
3272 }
3273
3274 if (needs_sandboxing) {
3275 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
3276 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3277 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3278 * are restricted. */
3279
3280 #if HAVE_SELINUX
3281 if (use_selinux) {
3282 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3283
3284 if (exec_context) {
3285 r = setexeccon(exec_context);
3286 if (r < 0) {
3287 *exit_status = EXIT_SELINUX_CONTEXT;
3288 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
3289 }
3290 }
3291 }
3292 #endif
3293
3294 #if HAVE_APPARMOR
3295 if (use_apparmor && context->apparmor_profile) {
3296 r = aa_change_onexec(context->apparmor_profile);
3297 if (r < 0 && !context->apparmor_profile_ignore) {
3298 *exit_status = EXIT_APPARMOR_PROFILE;
3299 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
3300 }
3301 }
3302 #endif
3303
3304 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3305 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
3306 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3307 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
3308 *exit_status = EXIT_SECUREBITS;
3309 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
3310 }
3311
3312 if (context_has_no_new_privileges(context))
3313 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
3314 *exit_status = EXIT_NO_NEW_PRIVILEGES;
3315 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
3316 }
3317
3318 #if HAVE_SECCOMP
3319 r = apply_address_families(unit, context);
3320 if (r < 0) {
3321 *exit_status = EXIT_ADDRESS_FAMILIES;
3322 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
3323 }
3324
3325 r = apply_memory_deny_write_execute(unit, context);
3326 if (r < 0) {
3327 *exit_status = EXIT_SECCOMP;
3328 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
3329 }
3330
3331 r = apply_restrict_realtime(unit, context);
3332 if (r < 0) {
3333 *exit_status = EXIT_SECCOMP;
3334 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
3335 }
3336
3337 r = apply_restrict_namespaces(unit, context);
3338 if (r < 0) {
3339 *exit_status = EXIT_SECCOMP;
3340 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
3341 }
3342
3343 r = apply_protect_sysctl(unit, context);
3344 if (r < 0) {
3345 *exit_status = EXIT_SECCOMP;
3346 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
3347 }
3348
3349 r = apply_protect_kernel_modules(unit, context);
3350 if (r < 0) {
3351 *exit_status = EXIT_SECCOMP;
3352 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
3353 }
3354
3355 r = apply_private_devices(unit, context);
3356 if (r < 0) {
3357 *exit_status = EXIT_SECCOMP;
3358 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
3359 }
3360
3361 r = apply_syscall_archs(unit, context);
3362 if (r < 0) {
3363 *exit_status = EXIT_SECCOMP;
3364 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
3365 }
3366
3367 r = apply_lock_personality(unit, context);
3368 if (r < 0) {
3369 *exit_status = EXIT_SECCOMP;
3370 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
3371 }
3372
3373 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3374 * by the filter as little as possible. */
3375 r = apply_syscall_filter(unit, context, needs_ambient_hack);
3376 if (r < 0) {
3377 *exit_status = EXIT_SECCOMP;
3378 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
3379 }
3380 #endif
3381 }
3382
3383 if (!strv_isempty(context->unset_environment)) {
3384 char **ee = NULL;
3385
3386 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3387 if (!ee) {
3388 *exit_status = EXIT_MEMORY;
3389 return log_oom();
3390 }
3391
3392 strv_free(accum_env);
3393 accum_env = ee;
3394 }
3395
3396 final_argv = replace_env_argv(argv, accum_env);
3397 if (!final_argv) {
3398 *exit_status = EXIT_MEMORY;
3399 return log_oom();
3400 }
3401
3402 if (_unlikely_(log_get_max_level() >= LOG_DEBUG)) {
3403 _cleanup_free_ char *line;
3404
3405 line = exec_command_line(final_argv);
3406 if (line) {
3407 log_struct(LOG_DEBUG,
3408 "EXECUTABLE=%s", command->path,
3409 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
3410 LOG_UNIT_ID(unit),
3411 LOG_UNIT_INVOCATION_ID(unit),
3412 NULL);
3413 }
3414 }
3415
3416 execve(command->path, final_argv, accum_env);
3417
3418 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3419
3420 log_struct_errno(LOG_INFO, errno,
3421 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3422 LOG_UNIT_ID(unit),
3423 LOG_UNIT_INVOCATION_ID(unit),
3424 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3425 command->path),
3426 "EXECUTABLE=%s", command->path,
3427 NULL);
3428
3429 return 0;
3430 }
3431
3432 *exit_status = EXIT_EXEC;
3433 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
3434 }
3435
3436 int exec_spawn(Unit *unit,
3437 ExecCommand *command,
3438 const ExecContext *context,
3439 const ExecParameters *params,
3440 ExecRuntime *runtime,
3441 DynamicCreds *dcreds,
3442 pid_t *ret) {
3443
3444 _cleanup_strv_free_ char **files_env = NULL;
3445 int *fds = NULL;
3446 unsigned n_storage_fds = 0, n_socket_fds = 0;
3447 _cleanup_free_ char *line = NULL;
3448 int socket_fd, r;
3449 int named_iofds[3] = { -1, -1, -1 };
3450 char **argv;
3451 pid_t pid;
3452
3453 assert(unit);
3454 assert(command);
3455 assert(context);
3456 assert(ret);
3457 assert(params);
3458 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
3459
3460 if (context->std_input == EXEC_INPUT_SOCKET ||
3461 context->std_output == EXEC_OUTPUT_SOCKET ||
3462 context->std_error == EXEC_OUTPUT_SOCKET) {
3463
3464 if (params->n_socket_fds > 1) {
3465 log_unit_error(unit, "Got more than one socket.");
3466 return -EINVAL;
3467 }
3468
3469 if (params->n_socket_fds == 0) {
3470 log_unit_error(unit, "Got no socket.");
3471 return -EINVAL;
3472 }
3473
3474 socket_fd = params->fds[0];
3475 } else {
3476 socket_fd = -1;
3477 fds = params->fds;
3478 n_storage_fds = params->n_storage_fds;
3479 n_socket_fds = params->n_socket_fds;
3480 }
3481
3482 r = exec_context_named_iofds(unit, context, params, named_iofds);
3483 if (r < 0)
3484 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3485
3486 r = exec_context_load_environment(unit, context, &files_env);
3487 if (r < 0)
3488 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
3489
3490 argv = params->argv ?: command->argv;
3491 line = exec_command_line(argv);
3492 if (!line)
3493 return log_oom();
3494
3495 log_struct(LOG_DEBUG,
3496 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3497 "EXECUTABLE=%s", command->path,
3498 LOG_UNIT_ID(unit),
3499 LOG_UNIT_INVOCATION_ID(unit),
3500 NULL);
3501
3502 pid = fork();
3503 if (pid < 0)
3504 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
3505
3506 if (pid == 0) {
3507 int exit_status = EXIT_SUCCESS;
3508
3509 r = exec_child(unit,
3510 command,
3511 context,
3512 params,
3513 runtime,
3514 dcreds,
3515 argv,
3516 socket_fd,
3517 named_iofds,
3518 fds,
3519 n_storage_fds,
3520 n_socket_fds,
3521 files_env,
3522 unit->manager->user_lookup_fds[1],
3523 &exit_status);
3524
3525 if (r < 0) {
3526 log_struct_errno(LOG_ERR, r,
3527 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3528 LOG_UNIT_ID(unit),
3529 LOG_UNIT_INVOCATION_ID(unit),
3530 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3531 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3532 command->path),
3533 "EXECUTABLE=%s", command->path,
3534 NULL);
3535 }
3536
3537 _exit(exit_status);
3538 }
3539
3540 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
3541
3542 /* We add the new process to the cgroup both in the child (so
3543 * that we can be sure that no user code is ever executed
3544 * outside of the cgroup) and in the parent (so that we can be
3545 * sure that when we kill the cgroup the process will be
3546 * killed too). */
3547 if (params->cgroup_path)
3548 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
3549
3550 exec_status_start(&command->exec_status, pid);
3551
3552 *ret = pid;
3553 return 0;
3554 }
3555
3556 void exec_context_init(ExecContext *c) {
3557 ExecDirectoryType i;
3558
3559 assert(c);
3560
3561 c->umask = 0022;
3562 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
3563 c->cpu_sched_policy = SCHED_OTHER;
3564 c->syslog_priority = LOG_DAEMON|LOG_INFO;
3565 c->syslog_level_prefix = true;
3566 c->ignore_sigpipe = true;
3567 c->timer_slack_nsec = NSEC_INFINITY;
3568 c->personality = PERSONALITY_INVALID;
3569 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3570 c->directories[i].mode = 0755;
3571 c->capability_bounding_set = CAP_ALL;
3572 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
3573 c->log_level_max = -1;
3574 }
3575
3576 void exec_context_done(ExecContext *c) {
3577 ExecDirectoryType i;
3578 size_t l;
3579
3580 assert(c);
3581
3582 c->environment = strv_free(c->environment);
3583 c->environment_files = strv_free(c->environment_files);
3584 c->pass_environment = strv_free(c->pass_environment);
3585 c->unset_environment = strv_free(c->unset_environment);
3586
3587 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
3588 c->rlimit[l] = mfree(c->rlimit[l]);
3589
3590 for (l = 0; l < 3; l++) {
3591 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
3592 c->stdio_file[l] = mfree(c->stdio_file[l]);
3593 }
3594
3595 c->working_directory = mfree(c->working_directory);
3596 c->root_directory = mfree(c->root_directory);
3597 c->root_image = mfree(c->root_image);
3598 c->tty_path = mfree(c->tty_path);
3599 c->syslog_identifier = mfree(c->syslog_identifier);
3600 c->user = mfree(c->user);
3601 c->group = mfree(c->group);
3602
3603 c->supplementary_groups = strv_free(c->supplementary_groups);
3604
3605 c->pam_name = mfree(c->pam_name);
3606
3607 c->read_only_paths = strv_free(c->read_only_paths);
3608 c->read_write_paths = strv_free(c->read_write_paths);
3609 c->inaccessible_paths = strv_free(c->inaccessible_paths);
3610
3611 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
3612
3613 if (c->cpuset)
3614 CPU_FREE(c->cpuset);
3615
3616 c->utmp_id = mfree(c->utmp_id);
3617 c->selinux_context = mfree(c->selinux_context);
3618 c->apparmor_profile = mfree(c->apparmor_profile);
3619 c->smack_process_label = mfree(c->smack_process_label);
3620
3621 c->syscall_filter = hashmap_free(c->syscall_filter);
3622 c->syscall_archs = set_free(c->syscall_archs);
3623 c->address_families = set_free(c->address_families);
3624
3625 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3626 c->directories[i].paths = strv_free(c->directories[i].paths);
3627
3628 c->log_level_max = -1;
3629
3630 exec_context_free_log_extra_fields(c);
3631
3632 c->stdin_data = mfree(c->stdin_data);
3633 c->stdin_data_size = 0;
3634 }
3635
3636 int exec_context_destroy_runtime_directory(ExecContext *c, const char *runtime_prefix) {
3637 char **i;
3638
3639 assert(c);
3640
3641 if (!runtime_prefix)
3642 return 0;
3643
3644 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
3645 _cleanup_free_ char *p;
3646
3647 p = strjoin(runtime_prefix, "/", *i);
3648 if (!p)
3649 return -ENOMEM;
3650
3651 /* We execute this synchronously, since we need to be sure this is gone when we start the service
3652 * next. */
3653 (void) rm_rf(p, REMOVE_ROOT);
3654 }
3655
3656 return 0;
3657 }
3658
3659 void exec_command_done(ExecCommand *c) {
3660 assert(c);
3661
3662 c->path = mfree(c->path);
3663
3664 c->argv = strv_free(c->argv);
3665 }
3666
3667 void exec_command_done_array(ExecCommand *c, unsigned n) {
3668 unsigned i;
3669
3670 for (i = 0; i < n; i++)
3671 exec_command_done(c+i);
3672 }
3673
3674 ExecCommand* exec_command_free_list(ExecCommand *c) {
3675 ExecCommand *i;
3676
3677 while ((i = c)) {
3678 LIST_REMOVE(command, c, i);
3679 exec_command_done(i);
3680 free(i);
3681 }
3682
3683 return NULL;
3684 }
3685
3686 void exec_command_free_array(ExecCommand **c, unsigned n) {
3687 unsigned i;
3688
3689 for (i = 0; i < n; i++)
3690 c[i] = exec_command_free_list(c[i]);
3691 }
3692
3693 typedef struct InvalidEnvInfo {
3694 Unit *unit;
3695 const char *path;
3696 } InvalidEnvInfo;
3697
3698 static void invalid_env(const char *p, void *userdata) {
3699 InvalidEnvInfo *info = userdata;
3700
3701 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
3702 }
3703
3704 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3705 assert(c);
3706
3707 switch (fd_index) {
3708
3709 case STDIN_FILENO:
3710 if (c->std_input != EXEC_INPUT_NAMED_FD)
3711 return NULL;
3712
3713 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
3714
3715 case STDOUT_FILENO:
3716 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3717 return NULL;
3718
3719 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
3720
3721 case STDERR_FILENO:
3722 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3723 return NULL;
3724
3725 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
3726
3727 default:
3728 return NULL;
3729 }
3730 }
3731
3732 int exec_context_named_iofds(Unit *unit, const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
3733 unsigned i, targets;
3734 const char* stdio_fdname[3];
3735 unsigned n_fds;
3736
3737 assert(c);
3738 assert(p);
3739
3740 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3741 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3742 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3743
3744 for (i = 0; i < 3; i++)
3745 stdio_fdname[i] = exec_context_fdname(c, i);
3746
3747 n_fds = p->n_storage_fds + p->n_socket_fds;
3748
3749 for (i = 0; i < n_fds && targets > 0; i++)
3750 if (named_iofds[STDIN_FILENO] < 0 &&
3751 c->std_input == EXEC_INPUT_NAMED_FD &&
3752 stdio_fdname[STDIN_FILENO] &&
3753 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3754
3755 named_iofds[STDIN_FILENO] = p->fds[i];
3756 targets--;
3757
3758 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3759 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3760 stdio_fdname[STDOUT_FILENO] &&
3761 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3762
3763 named_iofds[STDOUT_FILENO] = p->fds[i];
3764 targets--;
3765
3766 } else if (named_iofds[STDERR_FILENO] < 0 &&
3767 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3768 stdio_fdname[STDERR_FILENO] &&
3769 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3770
3771 named_iofds[STDERR_FILENO] = p->fds[i];
3772 targets--;
3773 }
3774
3775 return targets == 0 ? 0 : -ENOENT;
3776 }
3777
3778 int exec_context_load_environment(Unit *unit, const ExecContext *c, char ***l) {
3779 char **i, **r = NULL;
3780
3781 assert(c);
3782 assert(l);
3783
3784 STRV_FOREACH(i, c->environment_files) {
3785 char *fn;
3786 int k;
3787 unsigned n;
3788 bool ignore = false;
3789 char **p;
3790 _cleanup_globfree_ glob_t pglob = {};
3791
3792 fn = *i;
3793
3794 if (fn[0] == '-') {
3795 ignore = true;
3796 fn++;
3797 }
3798
3799 if (!path_is_absolute(fn)) {
3800 if (ignore)
3801 continue;
3802
3803 strv_free(r);
3804 return -EINVAL;
3805 }
3806
3807 /* Filename supports globbing, take all matching files */
3808 k = safe_glob(fn, 0, &pglob);
3809 if (k < 0) {
3810 if (ignore)
3811 continue;
3812
3813 strv_free(r);
3814 return k;
3815 }
3816
3817 /* When we don't match anything, -ENOENT should be returned */
3818 assert(pglob.gl_pathc > 0);
3819
3820 for (n = 0; n < pglob.gl_pathc; n++) {
3821 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
3822 if (k < 0) {
3823 if (ignore)
3824 continue;
3825
3826 strv_free(r);
3827 return k;
3828 }
3829 /* Log invalid environment variables with filename */
3830 if (p) {
3831 InvalidEnvInfo info = {
3832 .unit = unit,
3833 .path = pglob.gl_pathv[n]
3834 };
3835
3836 p = strv_env_clean_with_callback(p, invalid_env, &info);
3837 }
3838
3839 if (r == NULL)
3840 r = p;
3841 else {
3842 char **m;
3843
3844 m = strv_env_merge(2, r, p);
3845 strv_free(r);
3846 strv_free(p);
3847 if (!m)
3848 return -ENOMEM;
3849
3850 r = m;
3851 }
3852 }
3853 }
3854
3855 *l = r;
3856
3857 return 0;
3858 }
3859
3860 static bool tty_may_match_dev_console(const char *tty) {
3861 _cleanup_free_ char *active = NULL;
3862 char *console;
3863
3864 if (!tty)
3865 return true;
3866
3867 tty = skip_dev_prefix(tty);
3868
3869 /* trivial identity? */
3870 if (streq(tty, "console"))
3871 return true;
3872
3873 console = resolve_dev_console(&active);
3874 /* if we could not resolve, assume it may */
3875 if (!console)
3876 return true;
3877
3878 /* "tty0" means the active VC, so it may be the same sometimes */
3879 return streq(console, tty) || (streq(console, "tty0") && tty_is_vc(tty));
3880 }
3881
3882 bool exec_context_may_touch_console(ExecContext *ec) {
3883
3884 return (ec->tty_reset ||
3885 ec->tty_vhangup ||
3886 ec->tty_vt_disallocate ||
3887 is_terminal_input(ec->std_input) ||
3888 is_terminal_output(ec->std_output) ||
3889 is_terminal_output(ec->std_error)) &&
3890 tty_may_match_dev_console(exec_context_tty_path(ec));
3891 }
3892
3893 static void strv_fprintf(FILE *f, char **l) {
3894 char **g;
3895
3896 assert(f);
3897
3898 STRV_FOREACH(g, l)
3899 fprintf(f, " %s", *g);
3900 }
3901
3902 void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
3903 ExecDirectoryType dt;
3904 char **e, **d;
3905 unsigned i;
3906 int r;
3907
3908 assert(c);
3909 assert(f);
3910
3911 prefix = strempty(prefix);
3912
3913 fprintf(f,
3914 "%sUMask: %04o\n"
3915 "%sWorkingDirectory: %s\n"
3916 "%sRootDirectory: %s\n"
3917 "%sNonBlocking: %s\n"
3918 "%sPrivateTmp: %s\n"
3919 "%sPrivateDevices: %s\n"
3920 "%sProtectKernelTunables: %s\n"
3921 "%sProtectKernelModules: %s\n"
3922 "%sProtectControlGroups: %s\n"
3923 "%sPrivateNetwork: %s\n"
3924 "%sPrivateUsers: %s\n"
3925 "%sProtectHome: %s\n"
3926 "%sProtectSystem: %s\n"
3927 "%sMountAPIVFS: %s\n"
3928 "%sIgnoreSIGPIPE: %s\n"
3929 "%sMemoryDenyWriteExecute: %s\n"
3930 "%sRestrictRealtime: %s\n"
3931 "%sKeyringMode: %s\n",
3932 prefix, c->umask,
3933 prefix, c->working_directory ? c->working_directory : "/",
3934 prefix, c->root_directory ? c->root_directory : "/",
3935 prefix, yes_no(c->non_blocking),
3936 prefix, yes_no(c->private_tmp),
3937 prefix, yes_no(c->private_devices),
3938 prefix, yes_no(c->protect_kernel_tunables),
3939 prefix, yes_no(c->protect_kernel_modules),
3940 prefix, yes_no(c->protect_control_groups),
3941 prefix, yes_no(c->private_network),
3942 prefix, yes_no(c->private_users),
3943 prefix, protect_home_to_string(c->protect_home),
3944 prefix, protect_system_to_string(c->protect_system),
3945 prefix, yes_no(c->mount_apivfs),
3946 prefix, yes_no(c->ignore_sigpipe),
3947 prefix, yes_no(c->memory_deny_write_execute),
3948 prefix, yes_no(c->restrict_realtime),
3949 prefix, exec_keyring_mode_to_string(c->keyring_mode));
3950
3951 if (c->root_image)
3952 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3953
3954 STRV_FOREACH(e, c->environment)
3955 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3956
3957 STRV_FOREACH(e, c->environment_files)
3958 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
3959
3960 STRV_FOREACH(e, c->pass_environment)
3961 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3962
3963 STRV_FOREACH(e, c->unset_environment)
3964 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3965
3966 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3967
3968 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3969 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3970
3971 STRV_FOREACH(d, c->directories[dt].paths)
3972 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3973 }
3974
3975 if (c->nice_set)
3976 fprintf(f,
3977 "%sNice: %i\n",
3978 prefix, c->nice);
3979
3980 if (c->oom_score_adjust_set)
3981 fprintf(f,
3982 "%sOOMScoreAdjust: %i\n",
3983 prefix, c->oom_score_adjust);
3984
3985 for (i = 0; i < RLIM_NLIMITS; i++)
3986 if (c->rlimit[i]) {
3987 fprintf(f, "%s%s: " RLIM_FMT "\n",
3988 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3989 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3990 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3991 }
3992
3993 if (c->ioprio_set) {
3994 _cleanup_free_ char *class_str = NULL;
3995
3996 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3997 if (r >= 0)
3998 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3999
4000 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
4001 }
4002
4003 if (c->cpu_sched_set) {
4004 _cleanup_free_ char *policy_str = NULL;
4005
4006 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4007 if (r >= 0)
4008 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4009
4010 fprintf(f,
4011 "%sCPUSchedulingPriority: %i\n"
4012 "%sCPUSchedulingResetOnFork: %s\n",
4013 prefix, c->cpu_sched_priority,
4014 prefix, yes_no(c->cpu_sched_reset_on_fork));
4015 }
4016
4017 if (c->cpuset) {
4018 fprintf(f, "%sCPUAffinity:", prefix);
4019 for (i = 0; i < c->cpuset_ncpus; i++)
4020 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
4021 fprintf(f, " %u", i);
4022 fputs("\n", f);
4023 }
4024
4025 if (c->timer_slack_nsec != NSEC_INFINITY)
4026 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
4027
4028 fprintf(f,
4029 "%sStandardInput: %s\n"
4030 "%sStandardOutput: %s\n"
4031 "%sStandardError: %s\n",
4032 prefix, exec_input_to_string(c->std_input),
4033 prefix, exec_output_to_string(c->std_output),
4034 prefix, exec_output_to_string(c->std_error));
4035
4036 if (c->std_input == EXEC_INPUT_NAMED_FD)
4037 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4038 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4039 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4040 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4041 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4042
4043 if (c->std_input == EXEC_INPUT_FILE)
4044 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4045 if (c->std_output == EXEC_OUTPUT_FILE)
4046 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4047 if (c->std_error == EXEC_OUTPUT_FILE)
4048 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4049
4050 if (c->tty_path)
4051 fprintf(f,
4052 "%sTTYPath: %s\n"
4053 "%sTTYReset: %s\n"
4054 "%sTTYVHangup: %s\n"
4055 "%sTTYVTDisallocate: %s\n",
4056 prefix, c->tty_path,
4057 prefix, yes_no(c->tty_reset),
4058 prefix, yes_no(c->tty_vhangup),
4059 prefix, yes_no(c->tty_vt_disallocate));
4060
4061 if (IN_SET(c->std_output,
4062 EXEC_OUTPUT_SYSLOG,
4063 EXEC_OUTPUT_KMSG,
4064 EXEC_OUTPUT_JOURNAL,
4065 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4066 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4067 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4068 IN_SET(c->std_error,
4069 EXEC_OUTPUT_SYSLOG,
4070 EXEC_OUTPUT_KMSG,
4071 EXEC_OUTPUT_JOURNAL,
4072 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4073 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4074 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
4075
4076 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
4077
4078 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4079 if (r >= 0)
4080 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
4081
4082 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4083 if (r >= 0)
4084 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
4085 }
4086
4087 if (c->log_level_max >= 0) {
4088 _cleanup_free_ char *t = NULL;
4089
4090 (void) log_level_to_string_alloc(c->log_level_max, &t);
4091
4092 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4093 }
4094
4095 if (c->n_log_extra_fields > 0) {
4096 size_t j;
4097
4098 for (j = 0; j < c->n_log_extra_fields; j++) {
4099 fprintf(f, "%sLogExtraFields: ", prefix);
4100 fwrite(c->log_extra_fields[j].iov_base,
4101 1, c->log_extra_fields[j].iov_len,
4102 f);
4103 fputc('\n', f);
4104 }
4105 }
4106
4107 if (c->secure_bits) {
4108 _cleanup_free_ char *str = NULL;
4109
4110 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4111 if (r >= 0)
4112 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4113 }
4114
4115 if (c->capability_bounding_set != CAP_ALL) {
4116 _cleanup_free_ char *str = NULL;
4117
4118 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4119 if (r >= 0)
4120 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
4121 }
4122
4123 if (c->capability_ambient_set != 0) {
4124 _cleanup_free_ char *str = NULL;
4125
4126 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4127 if (r >= 0)
4128 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
4129 }
4130
4131 if (c->user)
4132 fprintf(f, "%sUser: %s\n", prefix, c->user);
4133 if (c->group)
4134 fprintf(f, "%sGroup: %s\n", prefix, c->group);
4135
4136 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4137
4138 if (!strv_isempty(c->supplementary_groups)) {
4139 fprintf(f, "%sSupplementaryGroups:", prefix);
4140 strv_fprintf(f, c->supplementary_groups);
4141 fputs("\n", f);
4142 }
4143
4144 if (c->pam_name)
4145 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
4146
4147 if (strv_length(c->read_write_paths) > 0) {
4148 fprintf(f, "%sReadWritePaths:", prefix);
4149 strv_fprintf(f, c->read_write_paths);
4150 fputs("\n", f);
4151 }
4152
4153 if (strv_length(c->read_only_paths) > 0) {
4154 fprintf(f, "%sReadOnlyPaths:", prefix);
4155 strv_fprintf(f, c->read_only_paths);
4156 fputs("\n", f);
4157 }
4158
4159 if (strv_length(c->inaccessible_paths) > 0) {
4160 fprintf(f, "%sInaccessiblePaths:", prefix);
4161 strv_fprintf(f, c->inaccessible_paths);
4162 fputs("\n", f);
4163 }
4164
4165 if (c->n_bind_mounts > 0)
4166 for (i = 0; i < c->n_bind_mounts; i++) {
4167 fprintf(f, "%s%s: %s:%s:%s\n", prefix,
4168 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4169 c->bind_mounts[i].source,
4170 c->bind_mounts[i].destination,
4171 c->bind_mounts[i].recursive ? "rbind" : "norbind");
4172 }
4173
4174 if (c->utmp_id)
4175 fprintf(f,
4176 "%sUtmpIdentifier: %s\n",
4177 prefix, c->utmp_id);
4178
4179 if (c->selinux_context)
4180 fprintf(f,
4181 "%sSELinuxContext: %s%s\n",
4182 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
4183
4184 if (c->apparmor_profile)
4185 fprintf(f,
4186 "%sAppArmorProfile: %s%s\n",
4187 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4188
4189 if (c->smack_process_label)
4190 fprintf(f,
4191 "%sSmackProcessLabel: %s%s\n",
4192 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4193
4194 if (c->personality != PERSONALITY_INVALID)
4195 fprintf(f,
4196 "%sPersonality: %s\n",
4197 prefix, strna(personality_to_string(c->personality)));
4198
4199 fprintf(f,
4200 "%sLockPersonality: %s\n",
4201 prefix, yes_no(c->lock_personality));
4202
4203 if (c->syscall_filter) {
4204 #if HAVE_SECCOMP
4205 Iterator j;
4206 void *id, *val;
4207 bool first = true;
4208 #endif
4209
4210 fprintf(f,
4211 "%sSystemCallFilter: ",
4212 prefix);
4213
4214 if (!c->syscall_whitelist)
4215 fputc('~', f);
4216
4217 #if HAVE_SECCOMP
4218 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
4219 _cleanup_free_ char *name = NULL;
4220 const char *errno_name = NULL;
4221 int num = PTR_TO_INT(val);
4222
4223 if (first)
4224 first = false;
4225 else
4226 fputc(' ', f);
4227
4228 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
4229 fputs(strna(name), f);
4230
4231 if (num >= 0) {
4232 errno_name = errno_to_name(num);
4233 if (errno_name)
4234 fprintf(f, ":%s", errno_name);
4235 else
4236 fprintf(f, ":%d", num);
4237 }
4238 }
4239 #endif
4240
4241 fputc('\n', f);
4242 }
4243
4244 if (c->syscall_archs) {
4245 #if HAVE_SECCOMP
4246 Iterator j;
4247 void *id;
4248 #endif
4249
4250 fprintf(f,
4251 "%sSystemCallArchitectures:",
4252 prefix);
4253
4254 #if HAVE_SECCOMP
4255 SET_FOREACH(id, c->syscall_archs, j)
4256 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4257 #endif
4258 fputc('\n', f);
4259 }
4260
4261 if (exec_context_restrict_namespaces_set(c)) {
4262 _cleanup_free_ char *s = NULL;
4263
4264 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4265 if (r >= 0)
4266 fprintf(f, "%sRestrictNamespaces: %s\n",
4267 prefix, s);
4268 }
4269
4270 if (c->syscall_errno > 0) {
4271 const char *errno_name;
4272
4273 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4274
4275 errno_name = errno_to_name(c->syscall_errno);
4276 if (errno_name)
4277 fprintf(f, "%s\n", errno_name);
4278 else
4279 fprintf(f, "%d\n", c->syscall_errno);
4280 }
4281
4282 if (c->apparmor_profile)
4283 fprintf(f,
4284 "%sAppArmorProfile: %s%s\n",
4285 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4286 }
4287
4288 bool exec_context_maintains_privileges(ExecContext *c) {
4289 assert(c);
4290
4291 /* Returns true if the process forked off would run under
4292 * an unchanged UID or as root. */
4293
4294 if (!c->user)
4295 return true;
4296
4297 if (streq(c->user, "root") || streq(c->user, "0"))
4298 return true;
4299
4300 return false;
4301 }
4302
4303 int exec_context_get_effective_ioprio(ExecContext *c) {
4304 int p;
4305
4306 assert(c);
4307
4308 if (c->ioprio_set)
4309 return c->ioprio;
4310
4311 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4312 if (p < 0)
4313 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4314
4315 return p;
4316 }
4317
4318 void exec_context_free_log_extra_fields(ExecContext *c) {
4319 size_t l;
4320
4321 assert(c);
4322
4323 for (l = 0; l < c->n_log_extra_fields; l++)
4324 free(c->log_extra_fields[l].iov_base);
4325 c->log_extra_fields = mfree(c->log_extra_fields);
4326 c->n_log_extra_fields = 0;
4327 }
4328
4329 void exec_status_start(ExecStatus *s, pid_t pid) {
4330 assert(s);
4331
4332 zero(*s);
4333 s->pid = pid;
4334 dual_timestamp_get(&s->start_timestamp);
4335 }
4336
4337 void exec_status_exit(ExecStatus *s, ExecContext *context, pid_t pid, int code, int status) {
4338 assert(s);
4339
4340 if (s->pid && s->pid != pid)
4341 zero(*s);
4342
4343 s->pid = pid;
4344 dual_timestamp_get(&s->exit_timestamp);
4345
4346 s->code = code;
4347 s->status = status;
4348
4349 if (context) {
4350 if (context->utmp_id)
4351 utmp_put_dead_process(context->utmp_id, pid, code, status);
4352
4353 exec_context_tty_reset(context, NULL);
4354 }
4355 }
4356
4357 void exec_status_dump(ExecStatus *s, FILE *f, const char *prefix) {
4358 char buf[FORMAT_TIMESTAMP_MAX];
4359
4360 assert(s);
4361 assert(f);
4362
4363 if (s->pid <= 0)
4364 return;
4365
4366 prefix = strempty(prefix);
4367
4368 fprintf(f,
4369 "%sPID: "PID_FMT"\n",
4370 prefix, s->pid);
4371
4372 if (dual_timestamp_is_set(&s->start_timestamp))
4373 fprintf(f,
4374 "%sStart Timestamp: %s\n",
4375 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
4376
4377 if (dual_timestamp_is_set(&s->exit_timestamp))
4378 fprintf(f,
4379 "%sExit Timestamp: %s\n"
4380 "%sExit Code: %s\n"
4381 "%sExit Status: %i\n",
4382 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
4383 prefix, sigchld_code_to_string(s->code),
4384 prefix, s->status);
4385 }
4386
4387 char *exec_command_line(char **argv) {
4388 size_t k;
4389 char *n, *p, **a;
4390 bool first = true;
4391
4392 assert(argv);
4393
4394 k = 1;
4395 STRV_FOREACH(a, argv)
4396 k += strlen(*a)+3;
4397
4398 n = new(char, k);
4399 if (!n)
4400 return NULL;
4401
4402 p = n;
4403 STRV_FOREACH(a, argv) {
4404
4405 if (!first)
4406 *(p++) = ' ';
4407 else
4408 first = false;
4409
4410 if (strpbrk(*a, WHITESPACE)) {
4411 *(p++) = '\'';
4412 p = stpcpy(p, *a);
4413 *(p++) = '\'';
4414 } else
4415 p = stpcpy(p, *a);
4416
4417 }
4418
4419 *p = 0;
4420
4421 /* FIXME: this doesn't really handle arguments that have
4422 * spaces and ticks in them */
4423
4424 return n;
4425 }
4426
4427 void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
4428 _cleanup_free_ char *cmd = NULL;
4429 const char *prefix2;
4430
4431 assert(c);
4432 assert(f);
4433
4434 prefix = strempty(prefix);
4435 prefix2 = strjoina(prefix, "\t");
4436
4437 cmd = exec_command_line(c->argv);
4438 fprintf(f,
4439 "%sCommand Line: %s\n",
4440 prefix, cmd ? cmd : strerror(ENOMEM));
4441
4442 exec_status_dump(&c->exec_status, f, prefix2);
4443 }
4444
4445 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4446 assert(f);
4447
4448 prefix = strempty(prefix);
4449
4450 LIST_FOREACH(command, c, c)
4451 exec_command_dump(c, f, prefix);
4452 }
4453
4454 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4455 ExecCommand *end;
4456
4457 assert(l);
4458 assert(e);
4459
4460 if (*l) {
4461 /* It's kind of important, that we keep the order here */
4462 LIST_FIND_TAIL(command, *l, end);
4463 LIST_INSERT_AFTER(command, *l, end, e);
4464 } else
4465 *l = e;
4466 }
4467
4468 int exec_command_set(ExecCommand *c, const char *path, ...) {
4469 va_list ap;
4470 char **l, *p;
4471
4472 assert(c);
4473 assert(path);
4474
4475 va_start(ap, path);
4476 l = strv_new_ap(path, ap);
4477 va_end(ap);
4478
4479 if (!l)
4480 return -ENOMEM;
4481
4482 p = strdup(path);
4483 if (!p) {
4484 strv_free(l);
4485 return -ENOMEM;
4486 }
4487
4488 free(c->path);
4489 c->path = p;
4490
4491 strv_free(c->argv);
4492 c->argv = l;
4493
4494 return 0;
4495 }
4496
4497 int exec_command_append(ExecCommand *c, const char *path, ...) {
4498 _cleanup_strv_free_ char **l = NULL;
4499 va_list ap;
4500 int r;
4501
4502 assert(c);
4503 assert(path);
4504
4505 va_start(ap, path);
4506 l = strv_new_ap(path, ap);
4507 va_end(ap);
4508
4509 if (!l)
4510 return -ENOMEM;
4511
4512 r = strv_extend_strv(&c->argv, l, false);
4513 if (r < 0)
4514 return r;
4515
4516 return 0;
4517 }
4518
4519
4520 static int exec_runtime_allocate(ExecRuntime **rt) {
4521
4522 if (*rt)
4523 return 0;
4524
4525 *rt = new0(ExecRuntime, 1);
4526 if (!*rt)
4527 return -ENOMEM;
4528
4529 (*rt)->n_ref = 1;
4530 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
4531
4532 return 0;
4533 }
4534
4535 int exec_runtime_make(ExecRuntime **rt, ExecContext *c, const char *id) {
4536 int r;
4537
4538 assert(rt);
4539 assert(c);
4540 assert(id);
4541
4542 if (*rt)
4543 return 1;
4544
4545 if (!c->private_network && !c->private_tmp)
4546 return 0;
4547
4548 r = exec_runtime_allocate(rt);
4549 if (r < 0)
4550 return r;
4551
4552 if (c->private_network && (*rt)->netns_storage_socket[0] < 0) {
4553 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, (*rt)->netns_storage_socket) < 0)
4554 return -errno;
4555 }
4556
4557 if (c->private_tmp && !(*rt)->tmp_dir) {
4558 r = setup_tmp_dirs(id, &(*rt)->tmp_dir, &(*rt)->var_tmp_dir);
4559 if (r < 0)
4560 return r;
4561 }
4562
4563 return 1;
4564 }
4565
4566 ExecRuntime *exec_runtime_ref(ExecRuntime *r) {
4567 assert(r);
4568 assert(r->n_ref > 0);
4569
4570 r->n_ref++;
4571 return r;
4572 }
4573
4574 ExecRuntime *exec_runtime_unref(ExecRuntime *r) {
4575
4576 if (!r)
4577 return NULL;
4578
4579 assert(r->n_ref > 0);
4580
4581 r->n_ref--;
4582 if (r->n_ref > 0)
4583 return NULL;
4584
4585 free(r->tmp_dir);
4586 free(r->var_tmp_dir);
4587 safe_close_pair(r->netns_storage_socket);
4588 return mfree(r);
4589 }
4590
4591 int exec_runtime_serialize(Unit *u, ExecRuntime *rt, FILE *f, FDSet *fds) {
4592 assert(u);
4593 assert(f);
4594 assert(fds);
4595
4596 if (!rt)
4597 return 0;
4598
4599 if (rt->tmp_dir)
4600 unit_serialize_item(u, f, "tmp-dir", rt->tmp_dir);
4601
4602 if (rt->var_tmp_dir)
4603 unit_serialize_item(u, f, "var-tmp-dir", rt->var_tmp_dir);
4604
4605 if (rt->netns_storage_socket[0] >= 0) {
4606 int copy;
4607
4608 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4609 if (copy < 0)
4610 return copy;
4611
4612 unit_serialize_item_format(u, f, "netns-socket-0", "%i", copy);
4613 }
4614
4615 if (rt->netns_storage_socket[1] >= 0) {
4616 int copy;
4617
4618 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4619 if (copy < 0)
4620 return copy;
4621
4622 unit_serialize_item_format(u, f, "netns-socket-1", "%i", copy);
4623 }
4624
4625 return 0;
4626 }
4627
4628 int exec_runtime_deserialize_item(Unit *u, ExecRuntime **rt, const char *key, const char *value, FDSet *fds) {
4629 int r;
4630
4631 assert(rt);
4632 assert(key);
4633 assert(value);
4634
4635 if (streq(key, "tmp-dir")) {
4636 char *copy;
4637
4638 r = exec_runtime_allocate(rt);
4639 if (r < 0)
4640 return log_oom();
4641
4642 copy = strdup(value);
4643 if (!copy)
4644 return log_oom();
4645
4646 free((*rt)->tmp_dir);
4647 (*rt)->tmp_dir = copy;
4648
4649 } else if (streq(key, "var-tmp-dir")) {
4650 char *copy;
4651
4652 r = exec_runtime_allocate(rt);
4653 if (r < 0)
4654 return log_oom();
4655
4656 copy = strdup(value);
4657 if (!copy)
4658 return log_oom();
4659
4660 free((*rt)->var_tmp_dir);
4661 (*rt)->var_tmp_dir = copy;
4662
4663 } else if (streq(key, "netns-socket-0")) {
4664 int fd;
4665
4666 r = exec_runtime_allocate(rt);
4667 if (r < 0)
4668 return log_oom();
4669
4670 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4671 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4672 else {
4673 safe_close((*rt)->netns_storage_socket[0]);
4674 (*rt)->netns_storage_socket[0] = fdset_remove(fds, fd);
4675 }
4676 } else if (streq(key, "netns-socket-1")) {
4677 int fd;
4678
4679 r = exec_runtime_allocate(rt);
4680 if (r < 0)
4681 return log_oom();
4682
4683 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd))
4684 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
4685 else {
4686 safe_close((*rt)->netns_storage_socket[1]);
4687 (*rt)->netns_storage_socket[1] = fdset_remove(fds, fd);
4688 }
4689 } else
4690 return 0;
4691
4692 return 1;
4693 }
4694
4695 static void *remove_tmpdir_thread(void *p) {
4696 _cleanup_free_ char *path = p;
4697
4698 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4699 return NULL;
4700 }
4701
4702 void exec_runtime_destroy(ExecRuntime *rt) {
4703 int r;
4704
4705 if (!rt)
4706 return;
4707
4708 /* If there are multiple users of this, let's leave the stuff around */
4709 if (rt->n_ref > 1)
4710 return;
4711
4712 if (rt->tmp_dir) {
4713 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4714
4715 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4716 if (r < 0) {
4717 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4718 free(rt->tmp_dir);
4719 }
4720
4721 rt->tmp_dir = NULL;
4722 }
4723
4724 if (rt->var_tmp_dir) {
4725 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4726
4727 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4728 if (r < 0) {
4729 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4730 free(rt->var_tmp_dir);
4731 }
4732
4733 rt->var_tmp_dir = NULL;
4734 }
4735
4736 safe_close_pair(rt->netns_storage_socket);
4737 }
4738
/* String names of the ExecInput enum values, indexed by enum value. The macro invocation below
 * presumably generates the string <-> enum lookup helpers for this table (macro is defined
 * elsewhere -- confirm the exact function names there). */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL] = "null",
        [EXEC_INPUT_TTY] = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL] = "tty-fail",
        [EXEC_INPUT_SOCKET] = "socket",
        [EXEC_INPUT_NAMED_FD] = "fd",
        [EXEC_INPUT_DATA] = "data",
        [EXEC_INPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4751
/* String names of the ExecOutput enum values, indexed by enum value. The "+console" names belong
 * to the *_AND_CONSOLE variants of the respective plain value. The macro invocation below
 * presumably generates the string <-> enum lookup helpers for this table (macro is defined
 * elsewhere -- confirm there). */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT] = "inherit",
        [EXEC_OUTPUT_NULL] = "null",
        [EXEC_OUTPUT_TTY] = "tty",
        [EXEC_OUTPUT_SYSLOG] = "syslog",
        [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
        [EXEC_OUTPUT_KMSG] = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL] = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET] = "socket",
        [EXEC_OUTPUT_NAMED_FD] = "fd",
        [EXEC_OUTPUT_FILE] = "file",
};

DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
4768
/* String names of the ExecUtmpMode enum values, indexed by enum value. The macro invocation below
 * presumably generates the string <-> enum lookup helpers for this table (macro is defined
 * elsewhere -- confirm there). */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT] = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER] = "user",
};

DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
4776
/* String names of the ExecPreserveMode enum values, indexed by enum value. Unlike the other tables
 * in this file the lookup is defined through the _WITH_BOOLEAN variant of the macro, with
 * EXEC_PRESERVE_YES as extra argument.
 * NOTE(review): presumably this makes the parser accept boolean strings as well, mapping "true"
 * values to EXEC_PRESERVE_YES -- confirm against the macro definition. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO] = "no",
        [EXEC_PRESERVE_YES] = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
4784
/* String names of the ExecDirectoryType enum values, indexed by enum value.
 * NOTE(review): the CamelCase strings look like the names of the matching unit file settings
 * rather than lower-case identifiers as in the other tables -- confirm with the users of the
 * generated lookup helpers before changing any of them. */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE] = "StateDirectory",
        [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
4794
/* String names of the ExecKeyringMode enum values, indexed by enum value. The macro invocation
 * below presumably generates the string <-> enum lookup helpers for this table (macro is defined
 * elsewhere -- confirm there). */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED] = "shared",
};

DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);