]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
tree-wide: add missing whitespace at the end of comments
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #if HAVE_PAM
19 #include <security/pam_appl.h>
20 #endif
21
22 #if HAVE_SELINUX
23 #include <selinux/selinux.h>
24 #endif
25
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29
30 #if HAVE_APPARMOR
31 #include <sys/apparmor.h>
32 #endif
33
34 #include "sd-messages.h"
35
36 #include "acl-util.h"
37 #include "af-list.h"
38 #include "alloc-util.h"
39 #if HAVE_APPARMOR
40 #include "apparmor-util.h"
41 #endif
42 #include "async.h"
43 #include "barrier.h"
44 #include "cap-list.h"
45 #include "capability-util.h"
46 #include "cgroup-setup.h"
47 #include "chown-recursive.h"
48 #include "cpu-set-util.h"
49 #include "def.h"
50 #include "env-file.h"
51 #include "env-util.h"
52 #include "errno-list.h"
53 #include "execute.h"
54 #include "exit-status.h"
55 #include "fd-util.h"
56 #include "fileio.h"
57 #include "format-util.h"
58 #include "fs-util.h"
59 #include "glob-util.h"
60 #include "hexdecoct.h"
61 #include "io-util.h"
62 #include "ioprio.h"
63 #include "label.h"
64 #include "log.h"
65 #include "macro.h"
66 #include "manager.h"
67 #include "manager-dump.h"
68 #include "memory-util.h"
69 #include "missing_fs.h"
70 #include "mkdir.h"
71 #include "mount-util.h"
72 #include "mountpoint-util.h"
73 #include "namespace.h"
74 #include "parse-util.h"
75 #include "path-util.h"
76 #include "process-util.h"
77 #include "random-util.h"
78 #include "rlimit-util.h"
79 #include "rm-rf.h"
80 #if HAVE_SECCOMP
81 #include "seccomp-util.h"
82 #endif
83 #include "securebits-util.h"
84 #include "selinux-util.h"
85 #include "signal-util.h"
86 #include "smack-util.h"
87 #include "socket-util.h"
88 #include "special.h"
89 #include "stat-util.h"
90 #include "string-table.h"
91 #include "string-util.h"
92 #include "strv.h"
93 #include "syslog-util.h"
94 #include "terminal-util.h"
95 #include "tmpfile-util.h"
96 #include "umask-util.h"
97 #include "unit-serialize.h"
98 #include "user-util.h"
99 #include "utmp-wtmp.h"
100
101 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
102 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
103
104 #define SNDBUF_SIZE (8*1024*1024)
105
/* Renumbers the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array occupies
 * the contiguous range starting right after stdin/stdout/stderr.
 *
 * The array is modified in place: every entry that is moved is replaced by its duplicate and the original
 * fd is closed.
 *
 * Returns 0 on success (also when n_fds is 0), or a negative errno if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its final slot */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= wanted, which is not necessarily
                         * 'wanted' itself */
                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The target slot was still occupied: remember the first index where that
                         * happened so that the next pass starts there */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                /* A full pass without a miss means everything is in place */
                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
145
/* Adjusts the file flags of the fds that are about to be passed to a child process.
 *
 * 'fds' holds n_socket_fds entries followed by n_storage_fds entries. O_NONBLOCK is set or cleared
 * (according to 'nonblock') on the first n_socket_fds entries only, while FD_CLOEXEC is dropped from every
 * entry.
 *
 * Returns 0 on success, or the negative error returned by the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                bool is_socket_fd = idx < n_socket_fds;

                /* O_NONBLOCK is only of interest for the socket activation fds */
                if (is_socket_fd) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to survive exec(), hence FD_CLOEXEC is always turned off */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
178
179 static const char *exec_context_tty_path(const ExecContext *context) {
180 assert(context);
181
182 if (context->stdio_as_fds)
183 return NULL;
184
185 if (context->tty_path)
186 return context->tty_path;
187
188 return "/dev/console";
189 }
190
191 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
192 const char *path;
193
194 assert(context);
195
196 path = exec_context_tty_path(context);
197
198 if (context->tty_vhangup) {
199 if (p && p->stdin_fd >= 0)
200 (void) terminal_vhangup_fd(p->stdin_fd);
201 else if (path)
202 (void) terminal_vhangup(path);
203 }
204
205 if (context->tty_reset) {
206 if (p && p->stdin_fd >= 0)
207 (void) reset_terminal_fd(p->stdin_fd, true);
208 else if (path)
209 (void) reset_terminal(path);
210 }
211
212 if (context->tty_vt_disallocate && path)
213 (void) vt_disallocate(path);
214 }
215
216 static bool is_terminal_input(ExecInput i) {
217 return IN_SET(i,
218 EXEC_INPUT_TTY,
219 EXEC_INPUT_TTY_FORCE,
220 EXEC_INPUT_TTY_FAIL);
221 }
222
223 static bool is_terminal_output(ExecOutput o) {
224 return IN_SET(o,
225 EXEC_OUTPUT_TTY,
226 EXEC_OUTPUT_KMSG_AND_CONSOLE,
227 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
228 }
229
230 static bool is_kmsg_output(ExecOutput o) {
231 return IN_SET(o,
232 EXEC_OUTPUT_KMSG,
233 EXEC_OUTPUT_KMSG_AND_CONSOLE);
234 }
235
236 static bool exec_context_needs_term(const ExecContext *c) {
237 assert(c);
238
239 /* Return true if the execution context suggests we should set $TERM to something useful. */
240
241 if (is_terminal_input(c->std_input))
242 return true;
243
244 if (is_terminal_output(c->std_output))
245 return true;
246
247 if (is_terminal_output(c->std_error))
248 return true;
249
250 return !!c->tty_path;
251 }
252
/* Opens /dev/null with the given open() flags (O_NOCTTY is always added) and moves the resulting fd to the
 * fd number 'nfd'. Returns -errno if the open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
264
265 static int connect_journal_socket(
266 int fd,
267 const char *log_namespace,
268 uid_t uid,
269 gid_t gid) {
270
271 union sockaddr_union sa;
272 socklen_t sa_len;
273 uid_t olduid = UID_INVALID;
274 gid_t oldgid = GID_INVALID;
275 const char *j;
276 int r;
277
278 j = log_namespace ?
279 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
280 "/run/systemd/journal/stdout";
281 r = sockaddr_un_set_path(&sa.un, j);
282 if (r < 0)
283 return r;
284 sa_len = r;
285
286 if (gid_is_valid(gid)) {
287 oldgid = getgid();
288
289 if (setegid(gid) < 0)
290 return -errno;
291 }
292
293 if (uid_is_valid(uid)) {
294 olduid = getuid();
295
296 if (seteuid(uid) < 0) {
297 r = -errno;
298 goto restore_gid;
299 }
300 }
301
302 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
303
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
306
307 if (uid_is_valid(uid))
308 (void) seteuid(olduid);
309
310 restore_gid:
311 if (gid_is_valid(gid))
312 (void) setegid(oldgid);
313
314 return r;
315 }
316
317 static int connect_logger_as(
318 const Unit *unit,
319 const ExecContext *context,
320 const ExecParameters *params,
321 ExecOutput output,
322 const char *ident,
323 int nfd,
324 uid_t uid,
325 gid_t gid) {
326
327 _cleanup_close_ int fd = -1;
328 int r;
329
330 assert(context);
331 assert(params);
332 assert(output < _EXEC_OUTPUT_MAX);
333 assert(ident);
334 assert(nfd >= 0);
335
336 fd = socket(AF_UNIX, SOCK_STREAM, 0);
337 if (fd < 0)
338 return -errno;
339
340 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
341 if (r < 0)
342 return r;
343
344 if (shutdown(fd, SHUT_RD) < 0)
345 return -errno;
346
347 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
348
349 if (dprintf(fd,
350 "%s\n"
351 "%s\n"
352 "%i\n"
353 "%i\n"
354 "%i\n"
355 "%i\n"
356 "%i\n",
357 context->syslog_identifier ?: ident,
358 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
359 context->syslog_priority,
360 !!context->syslog_level_prefix,
361 false,
362 is_kmsg_output(output),
363 is_terminal_output(output)) < 0)
364 return -errno;
365
366 return move_fd(TAKE_FD(fd), nfd, false);
367 }
368
/* Opens the terminal at 'path' with the given flags (O_NOCTTY is always added) and moves the resulting fd to
 * fd number 'nfd'. Returns the negative error from open_terminal() on failure, otherwise whatever move_fd()
 * returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
381
382 static int acquire_path(const char *path, int flags, mode_t mode) {
383 union sockaddr_union sa;
384 socklen_t sa_len;
385 _cleanup_close_ int fd = -1;
386 int r;
387
388 assert(path);
389
390 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
391 flags |= O_CREAT;
392
393 fd = open(path, flags|O_NOCTTY, mode);
394 if (fd >= 0)
395 return TAKE_FD(fd);
396
397 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
398 return -errno;
399
400 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
401
402 r = sockaddr_un_set_path(&sa.un, path);
403 if (r < 0)
404 return r == -EINVAL ? -ENXIO : r;
405 sa_len = r;
406
407 fd = socket(AF_UNIX, SOCK_STREAM, 0);
408 if (fd < 0)
409 return -errno;
410
411 if (connect(fd, &sa.sa, sa_len) < 0)
412 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
413 * indication that this wasn't an AF_UNIX socket after all */
414
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
420 r = 0;
421 if (r < 0)
422 return -errno;
423
424 return TAKE_FD(fd);
425 }
426
427 static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
440
441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
447 return std_input;
448 }
449
450 static int fixup_output(ExecOutput output, int socket_fd) {
451
452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
453 return EXEC_OUTPUT_INHERIT;
454
455 return output;
456 }
457
458 static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
461 int socket_fd,
462 const int named_iofds[static 3]) {
463
464 ExecInput i;
465
466 assert(context);
467 assert(params);
468 assert(named_iofds);
469
470 if (params->stdin_fd >= 0) {
471 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
472 return -errno;
473
474 /* Try to make this the controlling tty, if it is a tty, and reset it */
475 if (isatty(STDIN_FILENO)) {
476 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
477 (void) reset_terminal_fd(STDIN_FILENO, true);
478 }
479
480 return STDIN_FILENO;
481 }
482
483 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
484
485 switch (i) {
486
487 case EXEC_INPUT_NULL:
488 return open_null_as(O_RDONLY, STDIN_FILENO);
489
490 case EXEC_INPUT_TTY:
491 case EXEC_INPUT_TTY_FORCE:
492 case EXEC_INPUT_TTY_FAIL: {
493 int fd;
494
495 fd = acquire_terminal(exec_context_tty_path(context),
496 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
497 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
498 ACQUIRE_TERMINAL_WAIT,
499 USEC_INFINITY);
500 if (fd < 0)
501 return fd;
502
503 return move_fd(fd, STDIN_FILENO, false);
504 }
505
506 case EXEC_INPUT_SOCKET:
507 assert(socket_fd >= 0);
508
509 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
510
511 case EXEC_INPUT_NAMED_FD:
512 assert(named_iofds[STDIN_FILENO] >= 0);
513
514 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
515 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
516
517 case EXEC_INPUT_DATA: {
518 int fd;
519
520 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
521 if (fd < 0)
522 return fd;
523
524 return move_fd(fd, STDIN_FILENO, false);
525 }
526
527 case EXEC_INPUT_FILE: {
528 bool rw;
529 int fd;
530
531 assert(context->stdio_file[STDIN_FILENO]);
532
533 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
534 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
535
536 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
537 if (fd < 0)
538 return fd;
539
540 return move_fd(fd, STDIN_FILENO, false);
541 }
542
543 default:
544 assert_not_reached("Unknown input type");
545 }
546 }
547
548 static bool can_inherit_stderr_from_stdout(
549 const ExecContext *context,
550 ExecOutput o,
551 ExecOutput e) {
552
553 assert(context);
554
555 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
556 * stderr fd */
557
558 if (e == EXEC_OUTPUT_INHERIT)
559 return true;
560 if (e != o)
561 return false;
562
563 if (e == EXEC_OUTPUT_NAMED_FD)
564 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
565
566 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
567 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
568
569 return true;
570 }
571
572 static int setup_output(
573 const Unit *unit,
574 const ExecContext *context,
575 const ExecParameters *params,
576 int fileno,
577 int socket_fd,
578 const int named_iofds[static 3],
579 const char *ident,
580 uid_t uid,
581 gid_t gid,
582 dev_t *journal_stream_dev,
583 ino_t *journal_stream_ino) {
584
585 ExecOutput o;
586 ExecInput i;
587 int r;
588
589 assert(unit);
590 assert(context);
591 assert(params);
592 assert(ident);
593 assert(journal_stream_dev);
594 assert(journal_stream_ino);
595
596 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
597
598 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
599 return -errno;
600
601 return STDOUT_FILENO;
602 }
603
604 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
605 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
606 return -errno;
607
608 return STDERR_FILENO;
609 }
610
611 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
612 o = fixup_output(context->std_output, socket_fd);
613
614 if (fileno == STDERR_FILENO) {
615 ExecOutput e;
616 e = fixup_output(context->std_error, socket_fd);
617
618 /* This expects the input and output are already set up */
619
620 /* Don't change the stderr file descriptor if we inherit all
621 * the way and are not on a tty */
622 if (e == EXEC_OUTPUT_INHERIT &&
623 o == EXEC_OUTPUT_INHERIT &&
624 i == EXEC_INPUT_NULL &&
625 !is_terminal_input(context->std_input) &&
626 getppid() != 1)
627 return fileno;
628
629 /* Duplicate from stdout if possible */
630 if (can_inherit_stderr_from_stdout(context, o, e))
631 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
632
633 o = e;
634
635 } else if (o == EXEC_OUTPUT_INHERIT) {
636 /* If input got downgraded, inherit the original value */
637 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
638 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
639
640 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
641 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
642 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
643
644 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
645 if (getppid() != 1)
646 return fileno;
647
648 /* We need to open /dev/null here anew, to get the right access mode. */
649 return open_null_as(O_WRONLY, fileno);
650 }
651
652 switch (o) {
653
654 case EXEC_OUTPUT_NULL:
655 return open_null_as(O_WRONLY, fileno);
656
657 case EXEC_OUTPUT_TTY:
658 if (is_terminal_input(i))
659 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
660
661 /* We don't reset the terminal if this is just about output */
662 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
663
664 case EXEC_OUTPUT_KMSG:
665 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
666 case EXEC_OUTPUT_JOURNAL:
667 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
668 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
669 if (r < 0) {
670 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
671 fileno == STDOUT_FILENO ? "stdout" : "stderr");
672 r = open_null_as(O_WRONLY, fileno);
673 } else {
674 struct stat st;
675
676 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
677 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
678 * services to detect whether they are connected to the journal or not.
679 *
680 * If both stdout and stderr are connected to a stream then let's make sure to store the data
681 * about STDERR as that's usually the best way to do logging. */
682
683 if (fstat(fileno, &st) >= 0 &&
684 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
685 *journal_stream_dev = st.st_dev;
686 *journal_stream_ino = st.st_ino;
687 }
688 }
689 return r;
690
691 case EXEC_OUTPUT_SOCKET:
692 assert(socket_fd >= 0);
693
694 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
695
696 case EXEC_OUTPUT_NAMED_FD:
697 assert(named_iofds[fileno] >= 0);
698
699 (void) fd_nonblock(named_iofds[fileno], false);
700 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
701
702 case EXEC_OUTPUT_FILE:
703 case EXEC_OUTPUT_FILE_APPEND:
704 case EXEC_OUTPUT_FILE_TRUNCATE: {
705 bool rw;
706 int fd, flags;
707
708 assert(context->stdio_file[fileno]);
709
710 rw = context->std_input == EXEC_INPUT_FILE &&
711 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
712
713 if (rw)
714 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
715
716 flags = O_WRONLY;
717 if (o == EXEC_OUTPUT_FILE_APPEND)
718 flags |= O_APPEND;
719 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
720 flags |= O_TRUNC;
721
722 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
723 if (fd < 0)
724 return fd;
725
726 return move_fd(fd, fileno, 0);
727 }
728
729 default:
730 assert_not_reached("Unknown error type");
731 }
732 }
733
734 static int chown_terminal(int fd, uid_t uid) {
735 int r;
736
737 assert(fd >= 0);
738
739 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
740 if (isatty(fd) < 1) {
741 if (IN_SET(errno, EINVAL, ENOTTY))
742 return 0; /* not a tty */
743
744 return -errno;
745 }
746
747 /* This might fail. What matters are the results. */
748 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
749 if (r < 0)
750 return r;
751
752 return 1;
753 }
754
755 static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
756 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
757 int r;
758
759 assert(_saved_stdin);
760 assert(_saved_stdout);
761
762 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
763 if (saved_stdin < 0)
764 return -errno;
765
766 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
767 if (saved_stdout < 0)
768 return -errno;
769
770 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
771 if (fd < 0)
772 return fd;
773
774 r = chown_terminal(fd, getuid());
775 if (r < 0)
776 return r;
777
778 r = reset_terminal_fd(fd, true);
779 if (r < 0)
780 return r;
781
782 r = rearrange_stdio(fd, fd, STDERR_FILENO);
783 fd = -1;
784 if (r < 0)
785 return r;
786
787 *_saved_stdin = saved_stdin;
788 *_saved_stdout = saved_stdout;
789
790 saved_stdin = saved_stdout = -1;
791
792 return 0;
793 }
794
795 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
796 assert(err < 0);
797
798 if (err == -ETIMEDOUT)
799 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
800 else {
801 errno = -err;
802 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
803 }
804 }
805
806 static void write_confirm_error(int err, const char *vc, const Unit *u) {
807 _cleanup_close_ int fd = -1;
808
809 assert(vc);
810
811 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
812 if (fd < 0)
813 return;
814
815 write_confirm_error_fd(err, fd, u);
816 }
817
/* Undoes setup_confirm_stdio(): releases the terminal, duplicates the saved fds back onto stdin and stdout
 * (each only if valid), then closes the saved fds and invalidates the caller's variables.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
839
840 enum {
841 CONFIRM_PRETEND_FAILURE = -1,
842 CONFIRM_PRETEND_SUCCESS = 0,
843 CONFIRM_EXECUTE = 1,
844 };
845
846 static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
847 int saved_stdout = -1, saved_stdin = -1, r;
848 _cleanup_free_ char *e = NULL;
849 char c;
850
851 /* For any internal errors, assume a positive response. */
852 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
853 if (r < 0) {
854 write_confirm_error(r, vc, u);
855 return CONFIRM_EXECUTE;
856 }
857
858 /* confirm_spawn might have been disabled while we were sleeping. */
859 if (manager_is_confirm_spawn_disabled(u->manager)) {
860 r = 1;
861 goto restore_stdio;
862 }
863
864 e = ellipsize(cmdline, 60, 100);
865 if (!e) {
866 log_oom();
867 r = CONFIRM_EXECUTE;
868 goto restore_stdio;
869 }
870
871 for (;;) {
872 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
873 if (r < 0) {
874 write_confirm_error_fd(r, STDOUT_FILENO, u);
875 r = CONFIRM_EXECUTE;
876 goto restore_stdio;
877 }
878
879 switch (c) {
880 case 'c':
881 printf("Resuming normal execution.\n");
882 manager_disable_confirm_spawn();
883 r = 1;
884 break;
885 case 'D':
886 unit_dump(u, stdout, " ");
887 continue; /* ask again */
888 case 'f':
889 printf("Failing execution.\n");
890 r = CONFIRM_PRETEND_FAILURE;
891 break;
892 case 'h':
893 printf(" c - continue, proceed without asking anymore\n"
894 " D - dump, show the state of the unit\n"
895 " f - fail, don't execute the command and pretend it failed\n"
896 " h - help\n"
897 " i - info, show a short summary of the unit\n"
898 " j - jobs, show jobs that are in progress\n"
899 " s - skip, don't execute the command and pretend it succeeded\n"
900 " y - yes, execute the command\n");
901 continue; /* ask again */
902 case 'i':
903 printf(" Description: %s\n"
904 " Unit: %s\n"
905 " Command: %s\n",
906 u->id, u->description, cmdline);
907 continue; /* ask again */
908 case 'j':
909 manager_dump_jobs(u->manager, stdout, " ");
910 continue; /* ask again */
911 case 'n':
912 /* 'n' was removed in favor of 'f'. */
913 printf("Didn't understand 'n', did you mean 'f'?\n");
914 continue; /* ask again */
915 case 's':
916 printf("Skipping execution.\n");
917 r = CONFIRM_PRETEND_SUCCESS;
918 break;
919 case 'y':
920 r = CONFIRM_EXECUTE;
921 break;
922 default:
923 assert_not_reached("Unhandled choice");
924 }
925 break;
926 }
927
928 restore_stdio:
929 restore_confirm_stdio(&saved_stdin, &saved_stdout);
930 return r;
931 }
932
933 static int get_fixed_user(const ExecContext *c, const char **user,
934 uid_t *uid, gid_t *gid,
935 const char **home, const char **shell) {
936 int r;
937 const char *name;
938
939 assert(c);
940
941 if (!c->user)
942 return 0;
943
944 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
945 * (i.e. are "/" or "/bin/nologin"). */
946
947 name = c->user;
948 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
949 if (r < 0)
950 return r;
951
952 *user = name;
953 return 0;
954 }
955
956 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
957 int r;
958 const char *name;
959
960 assert(c);
961
962 if (!c->group)
963 return 0;
964
965 name = c->group;
966 r = get_group_creds(&name, gid, 0);
967 if (r < 0)
968 return r;
969
970 *group = name;
971 return 0;
972 }
973
974 static int get_supplementary_groups(const ExecContext *c, const char *user,
975 const char *group, gid_t gid,
976 gid_t **supplementary_gids, int *ngids) {
977 char **i;
978 int r, k = 0;
979 int ngroups_max;
980 bool keep_groups = false;
981 gid_t *groups = NULL;
982 _cleanup_free_ gid_t *l_gids = NULL;
983
984 assert(c);
985
986 /*
987 * If user is given, then lookup GID and supplementary groups list.
988 * We avoid NSS lookups for gid=0. Also we have to initialize groups
989 * here and as early as possible so we keep the list of supplementary
990 * groups of the caller.
991 */
992 if (user && gid_is_valid(gid) && gid != 0) {
993 /* First step, initialize groups from /etc/groups */
994 if (initgroups(user, gid) < 0)
995 return -errno;
996
997 keep_groups = true;
998 }
999
1000 if (strv_isempty(c->supplementary_groups))
1001 return 0;
1002
1003 /*
1004 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1005 * be positive, otherwise fail.
1006 */
1007 errno = 0;
1008 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1009 if (ngroups_max <= 0)
1010 return errno_or_else(EOPNOTSUPP);
1011
1012 l_gids = new(gid_t, ngroups_max);
1013 if (!l_gids)
1014 return -ENOMEM;
1015
1016 if (keep_groups) {
1017 /*
1018 * Lookup the list of groups that the user belongs to, we
1019 * avoid NSS lookups here too for gid=0.
1020 */
1021 k = ngroups_max;
1022 if (getgrouplist(user, gid, l_gids, &k) < 0)
1023 return -EINVAL;
1024 } else
1025 k = 0;
1026
1027 STRV_FOREACH(i, c->supplementary_groups) {
1028 const char *g;
1029
1030 if (k >= ngroups_max)
1031 return -E2BIG;
1032
1033 g = *i;
1034 r = get_group_creds(&g, l_gids+k, 0);
1035 if (r < 0)
1036 return r;
1037
1038 k++;
1039 }
1040
1041 /*
1042 * Sets ngids to zero to drop all supplementary groups, happens
1043 * when we are under root and SupplementaryGroups= is empty.
1044 */
1045 if (k == 0) {
1046 *ngids = 0;
1047 return 0;
1048 }
1049
1050 /* Otherwise get the final list of supplementary groups */
1051 groups = memdup(l_gids, sizeof(gid_t) * k);
1052 if (!groups)
1053 return -ENOMEM;
1054
1055 *supplementary_gids = groups;
1056 *ngids = k;
1057
1058 groups = NULL;
1059
1060 return 0;
1061 }
1062
/* Applies the group credentials to the calling process: first the supplementary groups (only if 'ngids' is
 * positive), then — if 'gid' is valid — the real, effective and saved GID.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Only touch the supplementary groups if there are any to set */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then switch the GIDs themselves */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1081
/* Updates the securebits of the calling process: the bits listed in 'mask' are cleared, then the bits
 * listed in 'bits' are set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written then), 1 if the securebits
 * were changed, or a negative errno if reading or writing them failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1095
1096 static int enforce_user(const ExecContext *context, uid_t uid) {
1097 assert(context);
1098 int r;
1099
1100 if (!uid_is_valid(uid))
1101 return 0;
1102
1103 /* Sets (but doesn't look up) the uid and make sure we keep the
1104 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1105 * required, so we also need keep-caps in this case.
1106 */
1107
1108 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1109
1110 /* First step: If we need to keep capabilities but
1111 * drop privileges we need to make sure we keep our
1112 * caps, while we drop privileges. */
1113 if (uid != 0) {
1114 /* Add KEEP_CAPS to the securebits */
1115 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1116 if (r < 0)
1117 return r;
1118 }
1119 }
1120
1121 /* Second step: actually set the uids */
1122 if (setresuid(uid, uid, uid) < 0)
1123 return -errno;
1124
1125 /* At this point we should have all necessary capabilities but
1126 are otherwise a normal user. However, the caps might got
1127 corrupted due to the setresuid() so we need clean them up
1128 later. This is done outside of this call. */
1129
1130 return 0;
1131 }
1132
1133 #if HAVE_PAM
1134
1135 static int null_conv(
1136 int num_msg,
1137 const struct pam_message **msg,
1138 struct pam_response **resp,
1139 void *appdata_ptr) {
1140
1141 /* We don't support conversations */
1142
1143 return PAM_CONV_ERR;
1144 }
1145
1146 #endif
1147
1148 static int setup_pam(
1149 const char *name,
1150 const char *user,
1151 uid_t uid,
1152 gid_t gid,
1153 const char *tty,
1154 char ***env,
1155 const int fds[], size_t n_fds) {
1156
1157 #if HAVE_PAM
1158
1159 static const struct pam_conv conv = {
1160 .conv = null_conv,
1161 .appdata_ptr = NULL
1162 };
1163
1164 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1165 pam_handle_t *handle = NULL;
1166 sigset_t old_ss;
1167 int pam_code = PAM_SUCCESS, r;
1168 char **nv, **e = NULL;
1169 bool close_session = false;
1170 pid_t pam_pid = 0, parent_pid;
1171 int flags = 0;
1172
1173 assert(name);
1174 assert(user);
1175 assert(env);
1176
1177 /* We set up PAM in the parent process, then fork. The child
1178 * will then stay around until killed via PR_GET_PDEATHSIG or
1179 * systemd via the cgroup logic. It will then remove the PAM
1180 * session again. The parent process will exec() the actual
1181 * daemon. We do things this way to ensure that the main PID
1182 * of the daemon is the one we initially fork()ed. */
1183
1184 r = barrier_create(&barrier);
1185 if (r < 0)
1186 goto fail;
1187
1188 if (log_get_max_level() < LOG_DEBUG)
1189 flags |= PAM_SILENT;
1190
1191 pam_code = pam_start(name, user, &conv, &handle);
1192 if (pam_code != PAM_SUCCESS) {
1193 handle = NULL;
1194 goto fail;
1195 }
1196
1197 if (!tty) {
1198 _cleanup_free_ char *q = NULL;
1199
1200 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1201 * out if that's the case, and read the TTY off it. */
1202
1203 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1204 tty = strjoina("/dev/", q);
1205 }
1206
1207 if (tty) {
1208 pam_code = pam_set_item(handle, PAM_TTY, tty);
1209 if (pam_code != PAM_SUCCESS)
1210 goto fail;
1211 }
1212
1213 STRV_FOREACH(nv, *env) {
1214 pam_code = pam_putenv(handle, *nv);
1215 if (pam_code != PAM_SUCCESS)
1216 goto fail;
1217 }
1218
1219 pam_code = pam_acct_mgmt(handle, flags);
1220 if (pam_code != PAM_SUCCESS)
1221 goto fail;
1222
1223 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1224 if (pam_code != PAM_SUCCESS)
1225 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1226
1227 pam_code = pam_open_session(handle, flags);
1228 if (pam_code != PAM_SUCCESS)
1229 goto fail;
1230
1231 close_session = true;
1232
1233 e = pam_getenvlist(handle);
1234 if (!e) {
1235 pam_code = PAM_BUF_ERR;
1236 goto fail;
1237 }
1238
1239 /* Block SIGTERM, so that we know that it won't get lost in
1240 * the child */
1241
1242 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1243
1244 parent_pid = getpid_cached();
1245
1246 r = safe_fork("(sd-pam)", 0, &pam_pid);
1247 if (r < 0)
1248 goto fail;
1249 if (r == 0) {
1250 int sig, ret = EXIT_PAM;
1251
1252 /* The child's job is to reset the PAM session on
1253 * termination */
1254 barrier_set_role(&barrier, BARRIER_CHILD);
1255
1256 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1257 * those fds are open here that have been opened by PAM. */
1258 (void) close_many(fds, n_fds);
1259
1260 /* Drop privileges - we don't need any to pam_close_session
1261 * and this will make PR_SET_PDEATHSIG work in most cases.
1262 * If this fails, ignore the error - but expect sd-pam threads
1263 * to fail to exit normally */
1264
1265 r = maybe_setgroups(0, NULL);
1266 if (r < 0)
1267 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1268 if (setresgid(gid, gid, gid) < 0)
1269 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1270 if (setresuid(uid, uid, uid) < 0)
1271 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1272
1273 (void) ignore_signals(SIGPIPE);
1274
1275 /* Wait until our parent died. This will only work if
1276 * the above setresuid() succeeds, otherwise the kernel
1277 * will not allow unprivileged parents kill their privileged
1278 * children this way. We rely on the control groups kill logic
1279 * to do the rest for us. */
1280 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1281 goto child_finish;
1282
1283 /* Tell the parent that our setup is done. This is especially
1284 * important regarding dropping privileges. Otherwise, unit
1285 * setup might race against our setresuid(2) call.
1286 *
1287 * If the parent aborted, we'll detect this below, hence ignore
1288 * return failure here. */
1289 (void) barrier_place(&barrier);
1290
1291 /* Check if our parent process might already have died? */
1292 if (getppid() == parent_pid) {
1293 sigset_t ss;
1294
1295 assert_se(sigemptyset(&ss) >= 0);
1296 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1297
1298 for (;;) {
1299 if (sigwait(&ss, &sig) < 0) {
1300 if (errno == EINTR)
1301 continue;
1302
1303 goto child_finish;
1304 }
1305
1306 assert(sig == SIGTERM);
1307 break;
1308 }
1309 }
1310
1311 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1312 if (pam_code != PAM_SUCCESS)
1313 goto child_finish;
1314
1315 /* If our parent died we'll end the session */
1316 if (getppid() != parent_pid) {
1317 pam_code = pam_close_session(handle, flags);
1318 if (pam_code != PAM_SUCCESS)
1319 goto child_finish;
1320 }
1321
1322 ret = 0;
1323
1324 child_finish:
1325 pam_end(handle, pam_code | flags);
1326 _exit(ret);
1327 }
1328
1329 barrier_set_role(&barrier, BARRIER_PARENT);
1330
1331 /* If the child was forked off successfully it will do all the
1332 * cleanups, so forget about the handle here. */
1333 handle = NULL;
1334
1335 /* Unblock SIGTERM again in the parent */
1336 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1337
1338 /* We close the log explicitly here, since the PAM modules
1339 * might have opened it, but we don't want this fd around. */
1340 closelog();
1341
1342 /* Synchronously wait for the child to initialize. We don't care for
1343 * errors as we cannot recover. However, warn loudly if it happens. */
1344 if (!barrier_place_and_sync(&barrier))
1345 log_error("PAM initialization failed");
1346
1347 return strv_free_and_replace(*env, e);
1348
1349 fail:
1350 if (pam_code != PAM_SUCCESS) {
1351 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1352 r = -EPERM; /* PAM errors do not map to errno */
1353 } else
1354 log_error_errno(r, "PAM failed: %m");
1355
1356 if (handle) {
1357 if (close_session)
1358 pam_code = pam_close_session(handle, flags);
1359
1360 pam_end(handle, pam_code | flags);
1361 }
1362
1363 strv_free(e);
1364 closelog();
1365
1366 return r;
1367 #else
1368 return 0;
1369 #endif
1370 }
1371
/* Renames the current process after the basename of 'path', wrapped in parentheses. If the basename is
 * empty the placeholder "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        /* "(" + at most 8 characters + ")" + NUL. The visible part hence never exceeds 10 characters
         * (i.e. the length of "/sbin/init"), so that it looks pretty in /bin/ps */
        char title[11];
        const char *name, *tail;
        size_t len, n;

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);

        /* When the name is too long keep its end: that is usually the more interesting part, since
         * the beginning might just be "systemd-" */
        tail = len > 8 ? name + len - 8 : name;
        n = len > 8 ? 8 : len;

        title[0] = '(';
        memcpy(title + 1, tail, n);
        title[n + 1] = ')';
        title[n + 2] = 0;

        rename_process(title);
}
1402
1403 static bool context_has_address_families(const ExecContext *c) {
1404 assert(c);
1405
1406 return c->address_families_allow_list ||
1407 !set_isempty(c->address_families);
1408 }
1409
1410 static bool context_has_syscall_filters(const ExecContext *c) {
1411 assert(c);
1412
1413 return c->syscall_allow_list ||
1414 !hashmap_isempty(c->syscall_filter);
1415 }
1416
1417 static bool context_has_syscall_logs(const ExecContext *c) {
1418 assert(c);
1419
1420 return c->syscall_log_allow_list ||
1421 !hashmap_isempty(c->syscall_log);
1422 }
1423
1424 static bool context_has_no_new_privileges(const ExecContext *c) {
1425 assert(c);
1426
1427 if (c->no_new_privileges)
1428 return true;
1429
1430 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1431 return false;
1432
1433 /* We need NNP if we have any form of seccomp and are unprivileged */
1434 return c->lock_personality ||
1435 c->memory_deny_write_execute ||
1436 c->private_devices ||
1437 c->protect_clock ||
1438 c->protect_hostname ||
1439 c->protect_kernel_tunables ||
1440 c->protect_kernel_modules ||
1441 c->protect_kernel_logs ||
1442 context_has_address_families(c) ||
1443 exec_context_restrict_namespaces_set(c) ||
1444 c->restrict_realtime ||
1445 c->restrict_suid_sgid ||
1446 !set_isempty(c->syscall_archs) ||
1447 context_has_syscall_filters(c) ||
1448 context_has_syscall_logs(c);
1449 }
1450
1451 static bool exec_context_has_credentials(const ExecContext *context) {
1452
1453 assert(context);
1454
1455 return !hashmap_isempty(context->set_credentials) ||
1456 context->load_credentials;
1457 }
1458
1459 #if HAVE_SECCOMP
1460
1461 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1462
1463 if (is_seccomp_available())
1464 return false;
1465
1466 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1467 return true;
1468 }
1469
1470 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1471 uint32_t negative_action, default_action, action;
1472 int r;
1473
1474 assert(u);
1475 assert(c);
1476
1477 if (!context_has_syscall_filters(c))
1478 return 0;
1479
1480 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1481 return 0;
1482
1483 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1484
1485 if (c->syscall_allow_list) {
1486 default_action = negative_action;
1487 action = SCMP_ACT_ALLOW;
1488 } else {
1489 default_action = SCMP_ACT_ALLOW;
1490 action = negative_action;
1491 }
1492
1493 if (needs_ambient_hack) {
1494 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1495 if (r < 0)
1496 return r;
1497 }
1498
1499 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1500 }
1501
1502 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1503 #ifdef SCMP_ACT_LOG
1504 uint32_t default_action, action;
1505 #endif
1506
1507 assert(u);
1508 assert(c);
1509
1510 if (!context_has_syscall_logs(c))
1511 return 0;
1512
1513 #ifdef SCMP_ACT_LOG
1514 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1515 return 0;
1516
1517 if (c->syscall_log_allow_list) {
1518 /* Log nothing but the ones listed */
1519 default_action = SCMP_ACT_ALLOW;
1520 action = SCMP_ACT_LOG;
1521 } else {
1522 /* Log everything but the ones listed */
1523 default_action = SCMP_ACT_LOG;
1524 action = SCMP_ACT_ALLOW;
1525 }
1526
1527 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1528 #else
1529 /* old libseccomp */
1530 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1531 return 0;
1532 #endif
1533 }
1534
1535 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1536 assert(u);
1537 assert(c);
1538
1539 if (set_isempty(c->syscall_archs))
1540 return 0;
1541
1542 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1543 return 0;
1544
1545 return seccomp_restrict_archs(c->syscall_archs);
1546 }
1547
1548 static int apply_address_families(const Unit* u, const ExecContext *c) {
1549 assert(u);
1550 assert(c);
1551
1552 if (!context_has_address_families(c))
1553 return 0;
1554
1555 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1556 return 0;
1557
1558 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1559 }
1560
1561 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1562 assert(u);
1563 assert(c);
1564
1565 if (!c->memory_deny_write_execute)
1566 return 0;
1567
1568 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1569 return 0;
1570
1571 return seccomp_memory_deny_write_execute();
1572 }
1573
1574 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1575 assert(u);
1576 assert(c);
1577
1578 if (!c->restrict_realtime)
1579 return 0;
1580
1581 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1582 return 0;
1583
1584 return seccomp_restrict_realtime();
1585 }
1586
1587 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1588 assert(u);
1589 assert(c);
1590
1591 if (!c->restrict_suid_sgid)
1592 return 0;
1593
1594 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1595 return 0;
1596
1597 return seccomp_restrict_suid_sgid();
1598 }
1599
1600 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1601 assert(u);
1602 assert(c);
1603
1604 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1605 * let's protect even those systems where this is left on in the kernel. */
1606
1607 if (!c->protect_kernel_tunables)
1608 return 0;
1609
1610 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1611 return 0;
1612
1613 return seccomp_protect_sysctl();
1614 }
1615
1616 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1617 assert(u);
1618 assert(c);
1619
1620 /* Turn off module syscalls on ProtectKernelModules=yes */
1621
1622 if (!c->protect_kernel_modules)
1623 return 0;
1624
1625 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1626 return 0;
1627
1628 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1629 }
1630
1631 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1632 assert(u);
1633 assert(c);
1634
1635 if (!c->protect_kernel_logs)
1636 return 0;
1637
1638 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1639 return 0;
1640
1641 return seccomp_protect_syslog();
1642 }
1643
1644 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1645 assert(u);
1646 assert(c);
1647
1648 if (!c->protect_clock)
1649 return 0;
1650
1651 if (skip_seccomp_unavailable(u, "ProtectClock="))
1652 return 0;
1653
1654 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1655 }
1656
1657 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1658 assert(u);
1659 assert(c);
1660
1661 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1662
1663 if (!c->private_devices)
1664 return 0;
1665
1666 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1667 return 0;
1668
1669 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1670 }
1671
1672 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1673 assert(u);
1674 assert(c);
1675
1676 if (!exec_context_restrict_namespaces_set(c))
1677 return 0;
1678
1679 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1680 return 0;
1681
1682 return seccomp_restrict_namespaces(c->restrict_namespaces);
1683 }
1684
1685 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1686 unsigned long personality;
1687 int r;
1688
1689 assert(u);
1690 assert(c);
1691
1692 if (!c->lock_personality)
1693 return 0;
1694
1695 if (skip_seccomp_unavailable(u, "LockPersonality="))
1696 return 0;
1697
1698 personality = c->personality;
1699
1700 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1701 if (personality == PERSONALITY_INVALID) {
1702
1703 r = opinionated_personality(&personality);
1704 if (r < 0)
1705 return r;
1706 }
1707
1708 return seccomp_lock_personality(personality);
1709 }
1710
1711 #endif
1712
1713 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1714 assert(u);
1715 assert(c);
1716
1717 if (!c->protect_hostname)
1718 return 0;
1719
1720 if (ns_type_supported(NAMESPACE_UTS)) {
1721 if (unshare(CLONE_NEWUTS) < 0) {
1722 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1723 *ret_exit_status = EXIT_NAMESPACE;
1724 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1725 }
1726
1727 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1728 }
1729 } else
1730 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1731
1732 #if HAVE_SECCOMP
1733 int r;
1734
1735 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1736 return 0;
1737
1738 r = seccomp_protect_hostname();
1739 if (r < 0) {
1740 *ret_exit_status = EXIT_SECCOMP;
1741 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1742 }
1743 #endif
1744
1745 return 0;
1746 }
1747
1748 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1749 assert(idle_pipe);
1750
1751 idle_pipe[1] = safe_close(idle_pipe[1]);
1752 idle_pipe[2] = safe_close(idle_pipe[2]);
1753
1754 if (idle_pipe[0] >= 0) {
1755 int r;
1756
1757 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1758
1759 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1760 ssize_t n;
1761
1762 /* Signal systemd that we are bored and want to continue. */
1763 n = write(idle_pipe[3], "x", 1);
1764 if (n > 0)
1765 /* Wait for systemd to react to the signal above. */
1766 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1767 }
1768
1769 idle_pipe[0] = safe_close(idle_pipe[0]);
1770
1771 }
1772
1773 idle_pipe[3] = safe_close(idle_pipe[3]);
1774 }
1775
1776 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1777
1778 static int build_environment(
1779 const Unit *u,
1780 const ExecContext *c,
1781 const ExecParameters *p,
1782 size_t n_fds,
1783 const char *home,
1784 const char *username,
1785 const char *shell,
1786 dev_t journal_stream_dev,
1787 ino_t journal_stream_ino,
1788 char ***ret) {
1789
1790 _cleanup_strv_free_ char **our_env = NULL;
1791 size_t n_env = 0;
1792 char *x;
1793
1794 assert(u);
1795 assert(c);
1796 assert(p);
1797 assert(ret);
1798
1799 #define N_ENV_VARS 17
1800 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1801 if (!our_env)
1802 return -ENOMEM;
1803
1804 if (n_fds > 0) {
1805 _cleanup_free_ char *joined = NULL;
1806
1807 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1808 return -ENOMEM;
1809 our_env[n_env++] = x;
1810
1811 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1812 return -ENOMEM;
1813 our_env[n_env++] = x;
1814
1815 joined = strv_join(p->fd_names, ":");
1816 if (!joined)
1817 return -ENOMEM;
1818
1819 x = strjoin("LISTEN_FDNAMES=", joined);
1820 if (!x)
1821 return -ENOMEM;
1822 our_env[n_env++] = x;
1823 }
1824
1825 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1826 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1827 return -ENOMEM;
1828 our_env[n_env++] = x;
1829
1830 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1831 return -ENOMEM;
1832 our_env[n_env++] = x;
1833 }
1834
1835 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1836 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1837 * check the database directly. */
1838 if (p->flags & EXEC_NSS_BYPASS_BUS) {
1839 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1840 if (!x)
1841 return -ENOMEM;
1842 our_env[n_env++] = x;
1843 }
1844
1845 if (home) {
1846 x = strjoin("HOME=", home);
1847 if (!x)
1848 return -ENOMEM;
1849
1850 path_simplify(x + 5);
1851 our_env[n_env++] = x;
1852 }
1853
1854 if (username) {
1855 x = strjoin("LOGNAME=", username);
1856 if (!x)
1857 return -ENOMEM;
1858 our_env[n_env++] = x;
1859
1860 x = strjoin("USER=", username);
1861 if (!x)
1862 return -ENOMEM;
1863 our_env[n_env++] = x;
1864 }
1865
1866 if (shell) {
1867 x = strjoin("SHELL=", shell);
1868 if (!x)
1869 return -ENOMEM;
1870
1871 path_simplify(x + 6);
1872 our_env[n_env++] = x;
1873 }
1874
1875 if (!sd_id128_is_null(u->invocation_id)) {
1876 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1877 return -ENOMEM;
1878
1879 our_env[n_env++] = x;
1880 }
1881
1882 if (exec_context_needs_term(c)) {
1883 const char *tty_path, *term = NULL;
1884
1885 tty_path = exec_context_tty_path(c);
1886
1887 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1888 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1889 * container manager passes to PID 1 ends up all the way in the console login shown. */
1890
1891 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1892 term = getenv("TERM");
1893
1894 if (!term)
1895 term = default_term_for_tty(tty_path);
1896
1897 x = strjoin("TERM=", term);
1898 if (!x)
1899 return -ENOMEM;
1900 our_env[n_env++] = x;
1901 }
1902
1903 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1904 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1905 return -ENOMEM;
1906
1907 our_env[n_env++] = x;
1908 }
1909
1910 if (c->log_namespace) {
1911 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1912 if (!x)
1913 return -ENOMEM;
1914
1915 our_env[n_env++] = x;
1916 }
1917
1918 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1919 _cleanup_free_ char *pre = NULL, *joined = NULL;
1920 const char *n;
1921
1922 if (!p->prefix[t])
1923 continue;
1924
1925 if (strv_isempty(c->directories[t].paths))
1926 continue;
1927
1928 n = exec_directory_env_name_to_string(t);
1929 if (!n)
1930 continue;
1931
1932 pre = strjoin(p->prefix[t], "/");
1933 if (!pre)
1934 return -ENOMEM;
1935
1936 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
1937 if (!joined)
1938 return -ENOMEM;
1939
1940 x = strjoin(n, "=", joined);
1941 if (!x)
1942 return -ENOMEM;
1943
1944 our_env[n_env++] = x;
1945 }
1946
1947 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1948 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1949 if (!x)
1950 return -ENOMEM;
1951
1952 our_env[n_env++] = x;
1953 }
1954
1955 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1956 return -ENOMEM;
1957
1958 our_env[n_env++] = x;
1959
1960 our_env[n_env++] = NULL;
1961 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1962 #undef N_ENV_VARS
1963
1964 *ret = TAKE_PTR(our_env);
1965
1966 return 0;
1967 }
1968
1969 static int build_pass_environment(const ExecContext *c, char ***ret) {
1970 _cleanup_strv_free_ char **pass_env = NULL;
1971 size_t n_env = 0;
1972 char **i;
1973
1974 STRV_FOREACH(i, c->pass_environment) {
1975 _cleanup_free_ char *x = NULL;
1976 char *v;
1977
1978 v = getenv(*i);
1979 if (!v)
1980 continue;
1981 x = strjoin(*i, "=", v);
1982 if (!x)
1983 return -ENOMEM;
1984
1985 if (!GREEDY_REALLOC(pass_env, n_env + 2))
1986 return -ENOMEM;
1987
1988 pass_env[n_env++] = TAKE_PTR(x);
1989 pass_env[n_env] = NULL;
1990 }
1991
1992 *ret = TAKE_PTR(pass_env);
1993
1994 return 0;
1995 }
1996
1997 bool exec_needs_mount_namespace(
1998 const ExecContext *context,
1999 const ExecParameters *params,
2000 const ExecRuntime *runtime) {
2001
2002 assert(context);
2003
2004 if (context->root_image)
2005 return true;
2006
2007 if (!strv_isempty(context->read_write_paths) ||
2008 !strv_isempty(context->read_only_paths) ||
2009 !strv_isempty(context->inaccessible_paths) ||
2010 !strv_isempty(context->exec_paths) ||
2011 !strv_isempty(context->no_exec_paths))
2012 return true;
2013
2014 if (context->n_bind_mounts > 0)
2015 return true;
2016
2017 if (context->n_temporary_filesystems > 0)
2018 return true;
2019
2020 if (context->n_mount_images > 0)
2021 return true;
2022
2023 if (context->n_extension_images > 0)
2024 return true;
2025
2026 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2027 return true;
2028
2029 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2030 return true;
2031
2032 if (context->private_devices ||
2033 context->private_mounts ||
2034 context->protect_system != PROTECT_SYSTEM_NO ||
2035 context->protect_home != PROTECT_HOME_NO ||
2036 context->protect_kernel_tunables ||
2037 context->protect_kernel_modules ||
2038 context->protect_kernel_logs ||
2039 context->protect_control_groups ||
2040 context->protect_proc != PROTECT_PROC_DEFAULT ||
2041 context->proc_subset != PROC_SUBSET_ALL ||
2042 context->private_ipc ||
2043 context->ipc_namespace_path)
2044 return true;
2045
2046 if (context->root_directory) {
2047 if (exec_context_get_effective_mount_apivfs(context))
2048 return true;
2049
2050 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2051 if (params && !params->prefix[t])
2052 continue;
2053
2054 if (!strv_isempty(context->directories[t].paths))
2055 return true;
2056 }
2057 }
2058
2059 if (context->dynamic_user &&
2060 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
2061 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2062 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2063 return true;
2064
2065 if (context->log_namespace)
2066 return true;
2067
2068 return false;
2069 }
2070
2071 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2072 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2073 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2074 _cleanup_close_ int unshare_ready_fd = -1;
2075 _cleanup_(sigkill_waitp) pid_t pid = 0;
2076 uint64_t c = 1;
2077 ssize_t n;
2078 int r;
2079
2080 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2081 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2082 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2083 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2084 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2085 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2086 * continues execution normally.
2087 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2088 * does not need CAP_SETUID to write the single line mapping to itself. */
2089
2090 /* Can only set up multiple mappings with CAP_SETUID. */
2091 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2092 r = asprintf(&uid_map,
2093 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2094 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2095 ouid, ouid, uid, uid);
2096 else
2097 r = asprintf(&uid_map,
2098 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2099 ouid, ouid);
2100
2101 if (r < 0)
2102 return -ENOMEM;
2103
2104 /* Can only set up multiple mappings with CAP_SETGID. */
2105 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2106 r = asprintf(&gid_map,
2107 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2108 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2109 ogid, ogid, gid, gid);
2110 else
2111 r = asprintf(&gid_map,
2112 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2113 ogid, ogid);
2114
2115 if (r < 0)
2116 return -ENOMEM;
2117
2118 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2119 * namespace. */
2120 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2121 if (unshare_ready_fd < 0)
2122 return -errno;
2123
2124 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2125 * failed. */
2126 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2127 return -errno;
2128
2129 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2130 if (r < 0)
2131 return r;
2132 if (r == 0) {
2133 _cleanup_close_ int fd = -1;
2134 const char *a;
2135 pid_t ppid;
2136
2137 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2138 * here, after the parent opened its own user namespace. */
2139
2140 ppid = getppid();
2141 errno_pipe[0] = safe_close(errno_pipe[0]);
2142
2143 /* Wait until the parent unshared the user namespace */
2144 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2145 r = -errno;
2146 goto child_fail;
2147 }
2148
2149 /* Disable the setgroups() system call in the child user namespace, for good. */
2150 a = procfs_file_alloca(ppid, "setgroups");
2151 fd = open(a, O_WRONLY|O_CLOEXEC);
2152 if (fd < 0) {
2153 if (errno != ENOENT) {
2154 r = -errno;
2155 goto child_fail;
2156 }
2157
2158 /* If the file is missing the kernel is too old, let's continue anyway. */
2159 } else {
2160 if (write(fd, "deny\n", 5) < 0) {
2161 r = -errno;
2162 goto child_fail;
2163 }
2164
2165 fd = safe_close(fd);
2166 }
2167
2168 /* First write the GID map */
2169 a = procfs_file_alloca(ppid, "gid_map");
2170 fd = open(a, O_WRONLY|O_CLOEXEC);
2171 if (fd < 0) {
2172 r = -errno;
2173 goto child_fail;
2174 }
2175 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2176 r = -errno;
2177 goto child_fail;
2178 }
2179 fd = safe_close(fd);
2180
2181 /* The write the UID map */
2182 a = procfs_file_alloca(ppid, "uid_map");
2183 fd = open(a, O_WRONLY|O_CLOEXEC);
2184 if (fd < 0) {
2185 r = -errno;
2186 goto child_fail;
2187 }
2188 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2189 r = -errno;
2190 goto child_fail;
2191 }
2192
2193 _exit(EXIT_SUCCESS);
2194
2195 child_fail:
2196 (void) write(errno_pipe[1], &r, sizeof(r));
2197 _exit(EXIT_FAILURE);
2198 }
2199
2200 errno_pipe[1] = safe_close(errno_pipe[1]);
2201
2202 if (unshare(CLONE_NEWUSER) < 0)
2203 return -errno;
2204
2205 /* Let the child know that the namespace is ready now */
2206 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2207 return -errno;
2208
2209 /* Try to read an error code from the child */
2210 n = read(errno_pipe[0], &r, sizeof(r));
2211 if (n < 0)
2212 return -errno;
2213 if (n == sizeof(r)) { /* an error code was sent to us */
2214 if (r < 0)
2215 return r;
2216 return -EIO;
2217 }
2218 if (n != 0) /* on success we should have read 0 bytes */
2219 return -EIO;
2220
2221 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2222 pid = 0;
2223 if (r < 0)
2224 return r;
2225 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2226 return -EIO;
2227
2228 return 0;
2229 }
2230
2231 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2232 if (!context->dynamic_user)
2233 return false;
2234
2235 if (type == EXEC_DIRECTORY_CONFIGURATION)
2236 return false;
2237
2238 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2239 return false;
2240
2241 return true;
2242 }
2243
2244 static int setup_exec_directory(
2245 const ExecContext *context,
2246 const ExecParameters *params,
2247 uid_t uid,
2248 gid_t gid,
2249 ExecDirectoryType type,
2250 int *exit_status) {
2251
2252 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2253 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2254 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2255 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2256 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2257 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2258 };
2259 char **rt;
2260 int r;
2261
2262 assert(context);
2263 assert(params);
2264 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2265 assert(exit_status);
2266
2267 if (!params->prefix[type])
2268 return 0;
2269
2270 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2271 if (!uid_is_valid(uid))
2272 uid = 0;
2273 if (!gid_is_valid(gid))
2274 gid = 0;
2275 }
2276
2277 STRV_FOREACH(rt, context->directories[type].paths) {
2278 _cleanup_free_ char *p = NULL, *pp = NULL;
2279
2280 p = path_join(params->prefix[type], *rt);
2281 if (!p) {
2282 r = -ENOMEM;
2283 goto fail;
2284 }
2285
2286 r = mkdir_parents_label(p, 0755);
2287 if (r < 0)
2288 goto fail;
2289
2290 if (exec_directory_is_private(context, type)) {
2291 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2292 * case we want to avoid leaving a directory around fully accessible that is owned by
2293 * a dynamic user whose UID is later on reused. To lock this down we use the same
2294 * trick used by container managers to prohibit host users to get access to files of
2295 * the same UID in containers: we place everything inside a directory that has an
2296 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2297 * for unprivileged host code. We then use fs namespacing to make this directory
2298 * permeable for the service itself.
2299 *
2300 * Specifically: for a service which wants a special directory "foo/" we first create
2301 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2302 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2303 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2304 * unprivileged host users can't look into it. Inside of the namespace of the unit
2305 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2306 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2307 * for the service and making sure it only gets access to the dirs it needs but no
2308 * others. Tricky? Yes, absolutely, but it works!
2309 *
2310 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2311 * to be owned by the service itself.
2312 *
2313 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2314 * for sharing files or sockets with other services. */
2315
2316 pp = path_join(params->prefix[type], "private");
2317 if (!pp) {
2318 r = -ENOMEM;
2319 goto fail;
2320 }
2321
2322 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2323 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2324 if (r < 0)
2325 goto fail;
2326
2327 if (!path_extend(&pp, *rt)) {
2328 r = -ENOMEM;
2329 goto fail;
2330 }
2331
2332 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2333 r = mkdir_parents_label(pp, 0755);
2334 if (r < 0)
2335 goto fail;
2336
2337 if (is_dir(p, false) > 0 &&
2338 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2339
2340 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2341 * it over. Most likely the service has been upgraded from one that didn't use
2342 * DynamicUser=1, to one that does. */
2343
2344 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2345 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2346 exec_directory_type_to_string(type), p, pp);
2347
2348 if (rename(p, pp) < 0) {
2349 r = -errno;
2350 goto fail;
2351 }
2352 } else {
2353 /* Otherwise, create the actual directory for the service */
2354
2355 r = mkdir_label(pp, context->directories[type].mode);
2356 if (r < 0 && r != -EEXIST)
2357 goto fail;
2358 }
2359
2360 /* And link it up from the original place */
2361 r = symlink_idempotent(pp, p, true);
2362 if (r < 0)
2363 goto fail;
2364
2365 } else {
2366 _cleanup_free_ char *target = NULL;
2367
2368 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2369 readlink_and_make_absolute(p, &target) >= 0) {
2370 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2371
2372 /* This already exists and is a symlink? Interesting. Maybe it's one created
2373 * by DynamicUser=1 (see above)?
2374 *
2375 * We do this for all directory types except for ConfigurationDirectory=,
2376 * since they all support the private/ symlink logic at least in some
2377 * configurations, see above. */
2378
2379 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2380 if (r < 0)
2381 goto fail;
2382
2383 q = path_join(params->prefix[type], "private", *rt);
2384 if (!q) {
2385 r = -ENOMEM;
2386 goto fail;
2387 }
2388
2389 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2390 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2391 if (r < 0)
2392 goto fail;
2393
2394 if (path_equal(q_resolved, target_resolved)) {
2395
2396 /* Hmm, apparently DynamicUser= was once turned on for this service,
2397 * but is no longer. Let's move the directory back up. */
2398
2399 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2400 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2401 exec_directory_type_to_string(type), q, p);
2402
2403 if (unlink(p) < 0) {
2404 r = -errno;
2405 goto fail;
2406 }
2407
2408 if (rename(q, p) < 0) {
2409 r = -errno;
2410 goto fail;
2411 }
2412 }
2413 }
2414
2415 r = mkdir_label(p, context->directories[type].mode);
2416 if (r < 0) {
2417 if (r != -EEXIST)
2418 goto fail;
2419
2420 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2421 struct stat st;
2422
2423 /* Don't change the owner/access mode of the configuration directory,
2424 * as in the common case it is not written to by a service, and shall
2425 * not be writable. */
2426
2427 if (stat(p, &st) < 0) {
2428 r = -errno;
2429 goto fail;
2430 }
2431
2432 /* Still complain if the access mode doesn't match */
2433 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2434 log_warning("%s \'%s\' already exists but the mode is different. "
2435 "(File system: %o %sMode: %o)",
2436 exec_directory_type_to_string(type), *rt,
2437 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2438
2439 continue;
2440 }
2441 }
2442 }
2443
2444 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2445 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2446 * current UID/GID ownership.) */
2447 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2448 if (r < 0)
2449 goto fail;
2450
2451 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2452 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2453 * assignments to exist. */
2454 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2455 if (r < 0)
2456 goto fail;
2457 }
2458
2459 return 0;
2460
2461 fail:
2462 *exit_status = exit_status_table[type];
2463 return r;
2464 }
2465
2466 static int write_credential(
2467 int dfd,
2468 const char *id,
2469 const void *data,
2470 size_t size,
2471 uid_t uid,
2472 bool ownership_ok) {
2473
2474 _cleanup_(unlink_and_freep) char *tmp = NULL;
2475 _cleanup_close_ int fd = -1;
2476 int r;
2477
2478 r = tempfn_random_child("", "cred", &tmp);
2479 if (r < 0)
2480 return r;
2481
2482 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2483 if (fd < 0) {
2484 tmp = mfree(tmp);
2485 return -errno;
2486 }
2487
2488 r = loop_write(fd, data, size, /* do_pool = */ false);
2489 if (r < 0)
2490 return r;
2491
2492 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2493 return -errno;
2494
2495 if (uid_is_valid(uid) && uid != getuid()) {
2496 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2497 if (r < 0) {
2498 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2499 return r;
2500
2501 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2502 * to express: that the user gets read access and nothing
2503 * else. But if the backing fs can't support that (e.g. ramfs)
2504 * then we can use file ownership instead. But that's only safe if
2505 * we can then re-mount the whole thing read-only, so that the
2506 * user can no longer chmod() the file to gain write access. */
2507 return r;
2508
2509 if (fchown(fd, uid, GID_INVALID) < 0)
2510 return -errno;
2511 }
2512 }
2513
2514 if (renameat(dfd, tmp, dfd, id) < 0)
2515 return -errno;
2516
2517 tmp = mfree(tmp);
2518 return 0;
2519 }
2520
/* Upper limit on the amount of credential data we are willing to pass to a unit: 1M. Refuse anything
 * beyond that, after all this is unswappable memory. */
#define CREDENTIALS_BYTES_MAX (1024LU * 1024LU)
2522
2523 static int acquire_credentials(
2524 const ExecContext *context,
2525 const ExecParameters *params,
2526 const char *unit,
2527 const char *p,
2528 uid_t uid,
2529 bool ownership_ok) {
2530
2531 uint64_t left = CREDENTIALS_BYTES_MAX;
2532 _cleanup_close_ int dfd = -1;
2533 ExecSetCredential *sc;
2534 char **id, **fn;
2535 int r;
2536
2537 assert(context);
2538 assert(p);
2539
2540 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2541 if (dfd < 0)
2542 return -errno;
2543
2544 /* First we use the literally specified credentials. Note that they might be overridden again below,
2545 * and thus act as a "default" if the same credential is specified multiple times */
2546 HASHMAP_FOREACH(sc, context->set_credentials) {
2547 size_t add;
2548
2549 add = strlen(sc->id) + sc->size;
2550 if (add > left)
2551 return -E2BIG;
2552
2553 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2554 if (r < 0)
2555 return r;
2556
2557 left -= add;
2558 }
2559
2560 /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2561 STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2562 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2563 _cleanup_(erase_and_freep) char *data = NULL;
2564 _cleanup_free_ char *j = NULL, *bindname = NULL;
2565 bool missing_ok = true;
2566 const char *source;
2567 size_t size, add;
2568
2569 if (path_is_absolute(*fn)) {
2570 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2571 source = *fn;
2572 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2573
2574 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2575 * via the source socket address in case we read off an AF_UNIX socket. */
2576 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2577 return -ENOMEM;
2578
2579 missing_ok = false;
2580
2581 } else if (params->received_credentials) {
2582 /* If this is a relative path, take it relative to the credentials we received
2583 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2584 * on a credential store, i.e. this is guaranteed to be regular files. */
2585 j = path_join(params->received_credentials, *fn);
2586 if (!j)
2587 return -ENOMEM;
2588
2589 source = j;
2590 } else
2591 source = NULL;
2592
2593 if (source)
2594 r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
2595 else
2596 r = -ENOENT;
2597 if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) {
2598 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2599 * will get clear errors if we don't pass such a missing credential on as they
2600 * themselves will get ENOENT when trying to read them, which should not be much
2601 * worse than when we handle the error here and make it fatal.
2602 *
2603 * Also, if the source file doesn't exist, but we already acquired the key otherwise,
2604 * then don't fail either. */
2605 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn);
2606 continue;
2607 }
2608 if (r < 0)
2609 return log_debug_errno(r, "Failed to read credential '%s': %m", *fn);
2610
2611 add = strlen(*id) + size;
2612 if (add > left)
2613 return -E2BIG;
2614
2615 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2616 if (r < 0)
2617 return r;
2618
2619 left -= add;
2620 }
2621
2622 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2623 return -errno;
2624
2625 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2626 * accessible */
2627
2628 if (uid_is_valid(uid) && uid != getuid()) {
2629 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2630 if (r < 0) {
2631 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2632 return r;
2633
2634 if (!ownership_ok)
2635 return r;
2636
2637 if (fchown(dfd, uid, GID_INVALID) < 0)
2638 return -errno;
2639 }
2640 }
2641
2642 return 0;
2643 }
2644
2645 static int setup_credentials_internal(
2646 const ExecContext *context,
2647 const ExecParameters *params,
2648 const char *unit,
2649 const char *final, /* This is where the credential store shall eventually end up at */
2650 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2651 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2652 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2653 uid_t uid) {
2654
2655 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2656 * if we mounted something; false if we definitely can't mount anything */
2657 bool final_mounted;
2658 const char *where;
2659
2660 assert(context);
2661 assert(final);
2662 assert(workspace);
2663
2664 if (reuse_workspace) {
2665 r = path_is_mount_point(workspace, NULL, 0);
2666 if (r < 0)
2667 return r;
2668 if (r > 0)
2669 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2670 else
2671 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2672 } else
2673 workspace_mounted = -1; /* ditto */
2674
2675 r = path_is_mount_point(final, NULL, 0);
2676 if (r < 0)
2677 return r;
2678 if (r > 0) {
2679 /* If the final place already has something mounted, we use that. If the workspace also has
2680 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2681 * different). */
2682 final_mounted = true;
2683
2684 if (workspace_mounted < 0) {
2685 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2686 * the final version to the workspace, and make it writable, so that we can make
2687 * changes */
2688
2689 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2690 if (r < 0)
2691 return r;
2692
2693 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2694 if (r < 0)
2695 return r;
2696
2697 workspace_mounted = true;
2698 }
2699 } else
2700 final_mounted = false;
2701
2702 if (workspace_mounted < 0) {
2703 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2704 for (int try = 0;; try++) {
2705
2706 if (try == 0) {
2707 /* Try "ramfs" first, since it's not swap backed */
2708 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2709 if (r >= 0) {
2710 workspace_mounted = true;
2711 break;
2712 }
2713
2714 } else if (try == 1) {
2715 _cleanup_free_ char *opts = NULL;
2716
2717 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2718 return -ENOMEM;
2719
2720 /* Fall back to "tmpfs" otherwise */
2721 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2722 if (r >= 0) {
2723 workspace_mounted = true;
2724 break;
2725 }
2726
2727 } else {
2728 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
2729 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2730 if (r < 0) {
2731 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2732 return r;
2733
2734 if (must_mount) /* If we it's not OK to use the plain directory
2735 * fallback, propagate all errors too */
2736 return r;
2737
2738 /* If we lack privileges to bind mount stuff, then let's gracefully
2739 * proceed for compat with container envs, and just use the final dir
2740 * as is. */
2741
2742 workspace_mounted = false;
2743 break;
2744 }
2745
2746 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
2747 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2748 if (r < 0)
2749 return r;
2750
2751 workspace_mounted = true;
2752 break;
2753 }
2754 }
2755 }
2756
2757 assert(!must_mount || workspace_mounted > 0);
2758 where = workspace_mounted ? workspace : final;
2759
2760 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
2761 if (r < 0)
2762 return r;
2763
2764 if (workspace_mounted) {
2765 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
2766 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2767 if (r < 0)
2768 return r;
2769
2770 /* And mount it to the final place, read-only */
2771 if (final_mounted)
2772 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2773 else
2774 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2775 if (r < 0)
2776 return r;
2777 } else {
2778 _cleanup_free_ char *parent = NULL;
2779
2780 /* If we do not have our own mount put used the plain directory fallback, then we need to
2781 * open access to the top-level credential directory and the per-service directory now */
2782
2783 parent = dirname_malloc(final);
2784 if (!parent)
2785 return -ENOMEM;
2786 if (chmod(parent, 0755) < 0)
2787 return -errno;
2788 }
2789
2790 return 0;
2791 }
2792
2793 static int setup_credentials(
2794 const ExecContext *context,
2795 const ExecParameters *params,
2796 const char *unit,
2797 uid_t uid) {
2798
2799 _cleanup_free_ char *p = NULL, *q = NULL;
2800 const char *i;
2801 int r;
2802
2803 assert(context);
2804 assert(params);
2805
2806 if (!exec_context_has_credentials(context))
2807 return 0;
2808
2809 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2810 return -EINVAL;
2811
2812 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2813 * and the subdir we mount over with a read-only file system readable by the service's user */
2814 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2815 if (!q)
2816 return -ENOMEM;
2817
2818 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2819 if (r < 0 && r != -EEXIST)
2820 return r;
2821
2822 p = path_join(q, unit);
2823 if (!p)
2824 return -ENOMEM;
2825
2826 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2827 if (r < 0 && r != -EEXIST)
2828 return r;
2829
2830 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2831 if (r < 0) {
2832 _cleanup_free_ char *t = NULL, *u = NULL;
2833
2834 /* If this is not a privilege or support issue then propagate the error */
2835 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2836 return r;
2837
2838 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2839 * it into place, so that users can't access half-initialized credential stores. */
2840 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2841 if (!t)
2842 return -ENOMEM;
2843
2844 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2845 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2846 * after it is fully set up */
2847 u = path_join(t, unit);
2848 if (!u)
2849 return -ENOMEM;
2850
2851 FOREACH_STRING(i, t, u) {
2852 r = mkdir_label(i, 0700);
2853 if (r < 0 && r != -EEXIST)
2854 return r;
2855 }
2856
2857 r = setup_credentials_internal(
2858 context,
2859 params,
2860 unit,
2861 p, /* final mount point */
2862 u, /* temporary workspace to overmount */
2863 true, /* reuse the workspace if it is already a mount */
2864 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2865 uid);
2866
2867 (void) rmdir(u); /* remove the workspace again if we can. */
2868
2869 if (r < 0)
2870 return r;
2871
2872 } else if (r == 0) {
2873
2874 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2875 * we can use the same directory for all cases, after turning off propagation. Question
2876 * though is: where do we turn off propagation exactly, and where do we place the workspace
2877 * directory? We need some place that is guaranteed to be a mount point in the host, and
2878 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2879 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2880 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2881 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2882 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2883 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2884 * propagation on the former, and then overmount the latter.
2885 *
2886 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2887 * for this purpose, but there are few other candidates that work equally well for us, and
2888 * given that the we do this in a privately namespaced short-lived single-threaded process
2889 * that no one else sees this should be OK to do. */
2890
2891 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2892 if (r < 0)
2893 goto child_fail;
2894
2895 r = setup_credentials_internal(
2896 context,
2897 params,
2898 unit,
2899 p, /* final mount point */
2900 "/dev/shm", /* temporary workspace to overmount */
2901 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2902 true, /* insist that something is mounted, do not allow fallback to plain directory */
2903 uid);
2904 if (r < 0)
2905 goto child_fail;
2906
2907 _exit(EXIT_SUCCESS);
2908
2909 child_fail:
2910 _exit(EXIT_FAILURE);
2911 }
2912
2913 return 0;
2914 }
2915
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process (PID 0).
 *
 * If the unit configures an explicit label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the exec label read from the executable is
 * applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if the executable carries none.
 *
 * Returns 0 on success, a negative errno-style error code on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        }
#ifdef SMACK_DEFAULT_PROCESS_LABEL
        else {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing label, or a file system without support for it, is not an error: in
                 * that case exec_label is left unset and the default applies. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
2947
2948 static int compile_bind_mounts(
2949 const ExecContext *context,
2950 const ExecParameters *params,
2951 BindMount **ret_bind_mounts,
2952 size_t *ret_n_bind_mounts,
2953 char ***ret_empty_directories) {
2954
2955 _cleanup_strv_free_ char **empty_directories = NULL;
2956 BindMount *bind_mounts;
2957 size_t n, h = 0;
2958 int r;
2959
2960 assert(context);
2961 assert(params);
2962 assert(ret_bind_mounts);
2963 assert(ret_n_bind_mounts);
2964 assert(ret_empty_directories);
2965
2966 n = context->n_bind_mounts;
2967 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2968 if (!params->prefix[t])
2969 continue;
2970
2971 n += strv_length(context->directories[t].paths);
2972 }
2973
2974 if (n <= 0) {
2975 *ret_bind_mounts = NULL;
2976 *ret_n_bind_mounts = 0;
2977 *ret_empty_directories = NULL;
2978 return 0;
2979 }
2980
2981 bind_mounts = new(BindMount, n);
2982 if (!bind_mounts)
2983 return -ENOMEM;
2984
2985 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2986 BindMount *item = context->bind_mounts + i;
2987 char *s, *d;
2988
2989 s = strdup(item->source);
2990 if (!s) {
2991 r = -ENOMEM;
2992 goto finish;
2993 }
2994
2995 d = strdup(item->destination);
2996 if (!d) {
2997 free(s);
2998 r = -ENOMEM;
2999 goto finish;
3000 }
3001
3002 bind_mounts[h++] = (BindMount) {
3003 .source = s,
3004 .destination = d,
3005 .read_only = item->read_only,
3006 .recursive = item->recursive,
3007 .ignore_enoent = item->ignore_enoent,
3008 };
3009 }
3010
3011 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3012 char **suffix;
3013
3014 if (!params->prefix[t])
3015 continue;
3016
3017 if (strv_isempty(context->directories[t].paths))
3018 continue;
3019
3020 if (exec_directory_is_private(context, t) &&
3021 !exec_context_with_rootfs(context)) {
3022 char *private_root;
3023
3024 /* So this is for a dynamic user, and we need to make sure the process can access its own
3025 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3026 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3027
3028 private_root = path_join(params->prefix[t], "private");
3029 if (!private_root) {
3030 r = -ENOMEM;
3031 goto finish;
3032 }
3033
3034 r = strv_consume(&empty_directories, private_root);
3035 if (r < 0)
3036 goto finish;
3037 }
3038
3039 STRV_FOREACH(suffix, context->directories[t].paths) {
3040 char *s, *d;
3041
3042 if (exec_directory_is_private(context, t))
3043 s = path_join(params->prefix[t], "private", *suffix);
3044 else
3045 s = path_join(params->prefix[t], *suffix);
3046 if (!s) {
3047 r = -ENOMEM;
3048 goto finish;
3049 }
3050
3051 if (exec_directory_is_private(context, t) &&
3052 exec_context_with_rootfs(context))
3053 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3054 * directory is not created on the root directory. So, let's bind-mount the directory
3055 * on the 'non-private' place. */
3056 d = path_join(params->prefix[t], *suffix);
3057 else
3058 d = strdup(s);
3059 if (!d) {
3060 free(s);
3061 r = -ENOMEM;
3062 goto finish;
3063 }
3064
3065 bind_mounts[h++] = (BindMount) {
3066 .source = s,
3067 .destination = d,
3068 .read_only = false,
3069 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3070 .recursive = true,
3071 .ignore_enoent = false,
3072 };
3073 }
3074 }
3075
3076 assert(h == n);
3077
3078 *ret_bind_mounts = bind_mounts;
3079 *ret_n_bind_mounts = n;
3080 *ret_empty_directories = TAKE_PTR(empty_directories);
3081
3082 return (int) n;
3083
3084 finish:
3085 bind_mount_free_many(bind_mounts, h);
3086 return r;
3087 }
3088
3089 static bool insist_on_sandboxing(
3090 const ExecContext *context,
3091 const char *root_dir,
3092 const char *root_image,
3093 const BindMount *bind_mounts,
3094 size_t n_bind_mounts) {
3095
3096 assert(context);
3097 assert(n_bind_mounts == 0 || bind_mounts);
3098
3099 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3100 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3101 * rearrange stuff in a way we cannot ignore gracefully. */
3102
3103 if (context->n_temporary_filesystems > 0)
3104 return true;
3105
3106 if (root_dir || root_image)
3107 return true;
3108
3109 if (context->n_mount_images > 0)
3110 return true;
3111
3112 if (context->dynamic_user)
3113 return true;
3114
3115 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3116 * essential. */
3117 for (size_t i = 0; i < n_bind_mounts; i++)
3118 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3119 return true;
3120
3121 if (context->log_namespace)
3122 return true;
3123
3124 return false;
3125 }
3126
3127 static int apply_mount_namespace(
3128 const Unit *u,
3129 ExecCommandFlags command_flags,
3130 const ExecContext *context,
3131 const ExecParameters *params,
3132 const ExecRuntime *runtime,
3133 char **error_path) {
3134
3135 _cleanup_strv_free_ char **empty_directories = NULL;
3136 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3137 const char *root_dir = NULL, *root_image = NULL;
3138 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
3139 NamespaceInfo ns_info;
3140 bool needs_sandboxing;
3141 BindMount *bind_mounts = NULL;
3142 size_t n_bind_mounts = 0;
3143 int r;
3144
3145 assert(context);
3146
3147 if (params->flags & EXEC_APPLY_CHROOT) {
3148 root_image = context->root_image;
3149
3150 if (!root_image)
3151 root_dir = context->root_directory;
3152 }
3153
3154 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3155 if (r < 0)
3156 return r;
3157
3158 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3159 if (needs_sandboxing) {
3160 /* The runtime struct only contains the parent of the private /tmp,
3161 * which is non-accessible to world users. Inside of it there's a /tmp
3162 * that is sticky, and that's the one we want to use here.
3163 * This does not apply when we are using /run/systemd/empty as fallback. */
3164
3165 if (context->private_tmp && runtime) {
3166 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3167 tmp_dir = runtime->tmp_dir;
3168 else if (runtime->tmp_dir)
3169 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3170
3171 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3172 var_tmp_dir = runtime->var_tmp_dir;
3173 else if (runtime->var_tmp_dir)
3174 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3175 }
3176
3177 ns_info = (NamespaceInfo) {
3178 .ignore_protect_paths = false,
3179 .private_dev = context->private_devices,
3180 .protect_control_groups = context->protect_control_groups,
3181 .protect_kernel_tunables = context->protect_kernel_tunables,
3182 .protect_kernel_modules = context->protect_kernel_modules,
3183 .protect_kernel_logs = context->protect_kernel_logs,
3184 .protect_hostname = context->protect_hostname,
3185 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3186 .private_mounts = context->private_mounts,
3187 .protect_home = context->protect_home,
3188 .protect_system = context->protect_system,
3189 .protect_proc = context->protect_proc,
3190 .proc_subset = context->proc_subset,
3191 .private_ipc = context->private_ipc || context->ipc_namespace_path,
3192 };
3193 } else if (!context->dynamic_user && root_dir)
3194 /*
3195 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3196 * sandbox info, otherwise enforce it, don't ignore protected paths and
3197 * fail if we are enable to apply the sandbox inside the mount namespace.
3198 */
3199 ns_info = (NamespaceInfo) {
3200 .ignore_protect_paths = true,
3201 };
3202 else
3203 ns_info = (NamespaceInfo) {};
3204
3205 if (context->mount_flags == MS_SHARED)
3206 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
3208 if (exec_context_has_credentials(context) &&
3209 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3210 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3211 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3212 if (!creds_path) {
3213 r = -ENOMEM;
3214 goto finalize;
3215 }
3216 }
3217
3218 if (MANAGER_IS_SYSTEM(u->manager)) {
3219 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3220 if (!propagate_dir) {
3221 r = -ENOMEM;
3222 goto finalize;
3223 }
3224
3225 incoming_dir = strdup("/run/systemd/incoming");
3226 if (!incoming_dir) {
3227 r = -ENOMEM;
3228 goto finalize;
3229 }
3230 }
3231
3232 r = setup_namespace(root_dir, root_image, context->root_image_options,
3233 &ns_info, context->read_write_paths,
3234 needs_sandboxing ? context->read_only_paths : NULL,
3235 needs_sandboxing ? context->inaccessible_paths : NULL,
3236 needs_sandboxing ? context->exec_paths : NULL,
3237 needs_sandboxing ? context->no_exec_paths : NULL,
3238 empty_directories,
3239 bind_mounts,
3240 n_bind_mounts,
3241 context->temporary_filesystems,
3242 context->n_temporary_filesystems,
3243 context->mount_images,
3244 context->n_mount_images,
3245 tmp_dir,
3246 var_tmp_dir,
3247 creds_path,
3248 context->log_namespace,
3249 context->mount_flags,
3250 context->root_hash, context->root_hash_size, context->root_hash_path,
3251 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3252 context->root_verity,
3253 context->extension_images,
3254 context->n_extension_images,
3255 propagate_dir,
3256 incoming_dir,
3257 root_dir || root_image ? params->notify_socket : NULL,
3258 error_path);
3259
3260 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3261 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3262 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3263 * completely different execution environment. */
3264 if (r == -ENOANO) {
3265 if (insist_on_sandboxing(
3266 context,
3267 root_dir, root_image,
3268 bind_mounts,
3269 n_bind_mounts)) {
3270 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3271 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3272 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3273
3274 r = -EOPNOTSUPP;
3275 } else {
3276 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3277 r = 0;
3278 }
3279 }
3280
3281 finalize:
3282 bind_mount_free_many(bind_mounts, n_bind_mounts);
3283 return r;
3284 }
3285
3286 static int apply_working_directory(
3287 const ExecContext *context,
3288 const ExecParameters *params,
3289 const char *home,
3290 int *exit_status) {
3291
3292 const char *d, *wd;
3293
3294 assert(context);
3295 assert(exit_status);
3296
3297 if (context->working_directory_home) {
3298
3299 if (!home) {
3300 *exit_status = EXIT_CHDIR;
3301 return -ENXIO;
3302 }
3303
3304 wd = home;
3305
3306 } else
3307 wd = empty_to_root(context->working_directory);
3308
3309 if (params->flags & EXEC_APPLY_CHROOT)
3310 d = wd;
3311 else
3312 d = prefix_roota(context->root_directory, wd);
3313
3314 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3315 *exit_status = EXIT_CHDIR;
3316 return -errno;
3317 }
3318
3319 return 0;
3320 }
3321
3322 static int apply_root_directory(
3323 const ExecContext *context,
3324 const ExecParameters *params,
3325 const bool needs_mount_ns,
3326 int *exit_status) {
3327
3328 assert(context);
3329 assert(exit_status);
3330
3331 if (params->flags & EXEC_APPLY_CHROOT)
3332 if (!needs_mount_ns && context->root_directory)
3333 if (chroot(context->root_directory) < 0) {
3334 *exit_status = EXIT_CHROOT;
3335 return -errno;
3336 }
3337
3338 return 0;
3339 }
3340
3341 static int setup_keyring(
3342 const Unit *u,
3343 const ExecContext *context,
3344 const ExecParameters *p,
3345 uid_t uid, gid_t gid) {
3346
3347 key_serial_t keyring;
3348 int r = 0;
3349 uid_t saved_uid;
3350 gid_t saved_gid;
3351
3352 assert(u);
3353 assert(context);
3354 assert(p);
3355
3356 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3357 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3358 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3359 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3360 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3361 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3362
3363 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3364 return 0;
3365
3366 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3367 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3368 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3369 * & group is just as nasty as acquiring a reference to the user keyring. */
3370
3371 saved_uid = getuid();
3372 saved_gid = getgid();
3373
3374 if (gid_is_valid(gid) && gid != saved_gid) {
3375 if (setregid(gid, -1) < 0)
3376 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3377 }
3378
3379 if (uid_is_valid(uid) && uid != saved_uid) {
3380 if (setreuid(uid, -1) < 0) {
3381 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3382 goto out;
3383 }
3384 }
3385
3386 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3387 if (keyring == -1) {
3388 if (errno == ENOSYS)
3389 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3390 else if (ERRNO_IS_PRIVILEGE(errno))
3391 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3392 else if (errno == EDQUOT)
3393 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3394 else
3395 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3396
3397 goto out;
3398 }
3399
3400 /* When requested link the user keyring into the session keyring. */
3401 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3402
3403 if (keyctl(KEYCTL_LINK,
3404 KEY_SPEC_USER_KEYRING,
3405 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3406 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3407 goto out;
3408 }
3409 }
3410
3411 /* Restore uid/gid back */
3412 if (uid_is_valid(uid) && uid != saved_uid) {
3413 if (setreuid(saved_uid, -1) < 0) {
3414 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3415 goto out;
3416 }
3417 }
3418
3419 if (gid_is_valid(gid) && gid != saved_gid) {
3420 if (setregid(saved_gid, -1) < 0)
3421 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3422 }
3423
3424 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3425 if (!sd_id128_is_null(u->invocation_id)) {
3426 key_serial_t key;
3427
3428 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3429 if (key == -1)
3430 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3431 else {
3432 if (keyctl(KEYCTL_SETPERM, key,
3433 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3434 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3435 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3436 }
3437 }
3438
3439 out:
3440 /* Revert back uid & gid for the last time, and exit */
3441 /* no extra logging, as only the first already reported error matters */
3442 if (getuid() != saved_uid)
3443 (void) setreuid(saved_uid, -1);
3444
3445 if (getgid() != saved_gid)
3446 (void) setregid(saved_gid, -1);
3447
3448 return r;
3449 }
3450
/* Appends those of the two descriptors in 'pair' that are non-negative to 'array', in order, advancing the
 * element counter *n for each one added. Negative entries are skipped. The caller must make sure 'array'
 * has room for up to two more entries, no bounds are checked here. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++)
                if (pair[k] >= 0)
                        array[(*n)++] = pair[k];
}
3461
3462 static int close_remaining_fds(
3463 const ExecParameters *params,
3464 const ExecRuntime *runtime,
3465 const DynamicCreds *dcreds,
3466 int user_lookup_fd,
3467 int socket_fd,
3468 const int *fds, size_t n_fds) {
3469
3470 size_t n_dont_close = 0;
3471 int dont_close[n_fds + 12];
3472
3473 assert(params);
3474
3475 if (params->stdin_fd >= 0)
3476 dont_close[n_dont_close++] = params->stdin_fd;
3477 if (params->stdout_fd >= 0)
3478 dont_close[n_dont_close++] = params->stdout_fd;
3479 if (params->stderr_fd >= 0)
3480 dont_close[n_dont_close++] = params->stderr_fd;
3481
3482 if (socket_fd >= 0)
3483 dont_close[n_dont_close++] = socket_fd;
3484 if (n_fds > 0) {
3485 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3486 n_dont_close += n_fds;
3487 }
3488
3489 if (runtime) {
3490 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3491 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3492 }
3493
3494 if (dcreds) {
3495 if (dcreds->user)
3496 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3497 if (dcreds->group)
3498 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3499 }
3500
3501 if (user_lookup_fd >= 0)
3502 dont_close[n_dont_close++] = user_lookup_fd;
3503
3504 return close_all_fds(dont_close, n_dont_close);
3505 }
3506
3507 static int send_user_lookup(
3508 Unit *unit,
3509 int user_lookup_fd,
3510 uid_t uid,
3511 gid_t gid) {
3512
3513 assert(unit);
3514
3515 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3516 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3517 * specified. */
3518
3519 if (user_lookup_fd < 0)
3520 return 0;
3521
3522 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3523 return 0;
3524
3525 if (writev(user_lookup_fd,
3526 (struct iovec[]) {
3527 IOVEC_INIT(&uid, sizeof(uid)),
3528 IOVEC_INIT(&gid, sizeof(gid)),
3529 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3530 return -errno;
3531
3532 return 0;
3533 }
3534
3535 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3536 int r;
3537
3538 assert(c);
3539 assert(home);
3540 assert(buf);
3541
3542 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3543
3544 if (*home)
3545 return 0;
3546
3547 if (!c->working_directory_home)
3548 return 0;
3549
3550 r = get_home_dir(buf);
3551 if (r < 0)
3552 return r;
3553
3554 *home = *buf;
3555 return 1;
3556 }
3557
3558 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3559 _cleanup_strv_free_ char ** list = NULL;
3560 int r;
3561
3562 assert(c);
3563 assert(p);
3564 assert(ret);
3565
3566 assert(c->dynamic_user);
3567
3568 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3569 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3570 * directories. */
3571
3572 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3573 char **i;
3574
3575 if (t == EXEC_DIRECTORY_CONFIGURATION)
3576 continue;
3577
3578 if (!p->prefix[t])
3579 continue;
3580
3581 STRV_FOREACH(i, c->directories[t].paths) {
3582 char *e;
3583
3584 if (exec_directory_is_private(c, t))
3585 e = path_join(p->prefix[t], "private", *i);
3586 else
3587 e = path_join(p->prefix[t], *i);
3588 if (!e)
3589 return -ENOMEM;
3590
3591 r = strv_consume(&list, e);
3592 if (r < 0)
3593 return r;
3594 }
3595 }
3596
3597 *ret = TAKE_PTR(list);
3598
3599 return 0;
3600 }
3601
3602 static char *exec_command_line(char **argv);
3603
3604 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3605 bool using_subcgroup;
3606 char *p;
3607
3608 assert(params);
3609 assert(ret);
3610
3611 if (!params->cgroup_path)
3612 return -EINVAL;
3613
3614 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3615 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3616 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3617 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3618 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3619 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3620 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3621 * flag, which is only passed for the former statements, not for the latter. */
3622
3623 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3624 if (using_subcgroup)
3625 p = path_join(params->cgroup_path, ".control");
3626 else
3627 p = strdup(params->cgroup_path);
3628 if (!p)
3629 return -ENOMEM;
3630
3631 *ret = p;
3632 return using_subcgroup;
3633 }
3634
3635 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3636 _cleanup_(cpu_set_reset) CPUSet s = {};
3637 int r;
3638
3639 assert(c);
3640 assert(ret);
3641
3642 if (!c->numa_policy.nodes.set) {
3643 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3644 return 0;
3645 }
3646
3647 r = numa_to_cpu_set(&c->numa_policy, &s);
3648 if (r < 0)
3649 return r;
3650
3651 cpu_set_reset(ret);
3652
3653 return cpu_set_add_all(ret, &s);
3654 }
3655
3656 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3657 assert(c);
3658
3659 return c->cpu_affinity_from_numa;
3660 }
3661
3662 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3663 int r;
3664
3665 assert(fds);
3666 assert(n_fds);
3667 assert(*n_fds < fds_size);
3668 assert(ret_fd);
3669
3670 if (fd < 0) {
3671 *ret_fd = -1;
3672 return 0;
3673 }
3674
3675 if (fd < 3 + (int) *n_fds) {
3676 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3677 * the fds we pass to the process (or which are closed only during execve). */
3678
3679 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3680 if (r < 0)
3681 return -errno;
3682
3683 CLOSE_AND_REPLACE(fd, r);
3684 }
3685
3686 *ret_fd = fds[*n_fds] = fd;
3687 (*n_fds) ++;
3688 return 1;
3689 }
3690
3691 static int exec_child(
3692 Unit *unit,
3693 const ExecCommand *command,
3694 const ExecContext *context,
3695 const ExecParameters *params,
3696 ExecRuntime *runtime,
3697 DynamicCreds *dcreds,
3698 int socket_fd,
3699 const int named_iofds[static 3],
3700 int *fds,
3701 size_t n_socket_fds,
3702 size_t n_storage_fds,
3703 char **files_env,
3704 int user_lookup_fd,
3705 int *exit_status) {
3706
3707 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
3708 int r, ngids = 0, exec_fd;
3709 _cleanup_free_ gid_t *supplementary_gids = NULL;
3710 const char *username = NULL, *groupname = NULL;
3711 _cleanup_free_ char *home_buffer = NULL;
3712 const char *home = NULL, *shell = NULL;
3713 char **final_argv = NULL;
3714 dev_t journal_stream_dev = 0;
3715 ino_t journal_stream_ino = 0;
3716 bool userns_set_up = false;
3717 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3718 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3719 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3720 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3721 #if HAVE_SELINUX
3722 _cleanup_free_ char *mac_selinux_context_net = NULL;
3723 bool use_selinux = false;
3724 #endif
3725 #if ENABLE_SMACK
3726 bool use_smack = false;
3727 #endif
3728 #if HAVE_APPARMOR
3729 bool use_apparmor = false;
3730 #endif
3731 uid_t saved_uid = getuid();
3732 gid_t saved_gid = getgid();
3733 uid_t uid = UID_INVALID;
3734 gid_t gid = GID_INVALID;
3735 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3736 n_keep_fds; /* total number of fds not to close */
3737 int secure_bits;
3738 _cleanup_free_ gid_t *gids_after_pam = NULL;
3739 int ngids_after_pam = 0;
3740
3741 assert(unit);
3742 assert(command);
3743 assert(context);
3744 assert(params);
3745 assert(exit_status);
3746
3747 rename_process_from_path(command->path);
3748
3749 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3750 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3751 * both of which will be demoted to SIG_DFL. */
3752 (void) default_signals(SIGNALS_CRASH_HANDLER,
3753 SIGNALS_IGNORE);
3754
3755 if (context->ignore_sigpipe)
3756 (void) ignore_signals(SIGPIPE);
3757
3758 r = reset_signal_mask();
3759 if (r < 0) {
3760 *exit_status = EXIT_SIGNAL_MASK;
3761 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
3762 }
3763
3764 if (params->idle_pipe)
3765 do_idle_pipe_dance(params->idle_pipe);
3766
3767 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3768 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3769 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3770 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
3771
3772 log_forget_fds();
3773 log_set_open_when_needed(true);
3774
3775 /* In case anything used libc syslog(), close this here, too */
3776 closelog();
3777
3778 int keep_fds[n_fds + 2];
3779 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3780 n_keep_fds = n_fds;
3781
3782 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3783 if (r < 0) {
3784 *exit_status = EXIT_FDS;
3785 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3786 }
3787
3788 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
3789 if (r < 0) {
3790 *exit_status = EXIT_FDS;
3791 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
3792 }
3793
3794 if (!context->same_pgrp &&
3795 setsid() < 0) {
3796 *exit_status = EXIT_SETSID;
3797 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3798 }
3799
3800 exec_context_tty_reset(context, params);
3801
3802 if (unit_shall_confirm_spawn(unit)) {
3803 const char *vc = params->confirm_spawn;
3804 _cleanup_free_ char *cmdline = NULL;
3805
3806 cmdline = exec_command_line(command->argv);
3807 if (!cmdline) {
3808 *exit_status = EXIT_MEMORY;
3809 return log_oom();
3810 }
3811
3812 r = ask_for_confirmation(vc, unit, cmdline);
3813 if (r != CONFIRM_EXECUTE) {
3814 if (r == CONFIRM_PRETEND_SUCCESS) {
3815 *exit_status = EXIT_SUCCESS;
3816 return 0;
3817 }
3818 *exit_status = EXIT_CONFIRM;
3819 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3820 "Execution cancelled by the user");
3821 }
3822 }
3823
3824 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3825 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3826 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3827 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3828 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3829 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3830 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3831 *exit_status = EXIT_MEMORY;
3832 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3833 }
3834
3835 if (context->dynamic_user && dcreds) {
3836 _cleanup_strv_free_ char **suggested_paths = NULL;
3837
3838 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3839 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
3840 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3841 *exit_status = EXIT_USER;
3842 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3843 }
3844
3845 r = compile_suggested_paths(context, params, &suggested_paths);
3846 if (r < 0) {
3847 *exit_status = EXIT_MEMORY;
3848 return log_oom();
3849 }
3850
3851 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
3852 if (r < 0) {
3853 *exit_status = EXIT_USER;
3854 if (r == -EILSEQ)
3855 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3856 "Failed to update dynamic user credentials: User or group with specified name already exists.");
3857 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
3858 }
3859
3860 if (!uid_is_valid(uid)) {
3861 *exit_status = EXIT_USER;
3862 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
3863 }
3864
3865 if (!gid_is_valid(gid)) {
3866 *exit_status = EXIT_USER;
3867 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
3868 }
3869
3870 if (dcreds->user)
3871 username = dcreds->user->name;
3872
3873 } else {
3874 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3875 if (r < 0) {
3876 *exit_status = EXIT_USER;
3877 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
3878 }
3879
3880 r = get_fixed_group(context, &groupname, &gid);
3881 if (r < 0) {
3882 *exit_status = EXIT_GROUP;
3883 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
3884 }
3885 }
3886
3887 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3888 r = get_supplementary_groups(context, username, groupname, gid,
3889 &supplementary_gids, &ngids);
3890 if (r < 0) {
3891 *exit_status = EXIT_GROUP;
3892 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
3893 }
3894
3895 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3896 if (r < 0) {
3897 *exit_status = EXIT_USER;
3898 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
3899 }
3900
3901 user_lookup_fd = safe_close(user_lookup_fd);
3902
3903 r = acquire_home(context, uid, &home, &home_buffer);
3904 if (r < 0) {
3905 *exit_status = EXIT_CHDIR;
3906 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
3907 }
3908
3909 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3910 * must be sure to drop O_NONBLOCK */
3911 if (socket_fd >= 0)
3912 (void) fd_nonblock(socket_fd, false);
3913
3914 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3915 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3916 if (params->cgroup_path) {
3917 _cleanup_free_ char *p = NULL;
3918
3919 r = exec_parameters_get_cgroup_path(params, &p);
3920 if (r < 0) {
3921 *exit_status = EXIT_CGROUP;
3922 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3923 }
3924
3925 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3926 if (r < 0) {
3927 *exit_status = EXIT_CGROUP;
3928 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3929 }
3930 }
3931
3932 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3933 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
3934 if (r < 0) {
3935 *exit_status = EXIT_NETWORK;
3936 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3937 }
3938 }
3939
3940 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3941 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3942 if (r < 0) {
3943 *exit_status = EXIT_NAMESPACE;
3944 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3945 }
3946 }
3947
3948 r = setup_input(context, params, socket_fd, named_iofds);
3949 if (r < 0) {
3950 *exit_status = EXIT_STDIN;
3951 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
3952 }
3953
3954 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3955 if (r < 0) {
3956 *exit_status = EXIT_STDOUT;
3957 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
3958 }
3959
3960 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
3961 if (r < 0) {
3962 *exit_status = EXIT_STDERR;
3963 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
3964 }
3965
3966 if (context->oom_score_adjust_set) {
3967 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3968 * prohibit write access to this file, and we shouldn't trip up over that. */
3969 r = set_oom_score_adjust(context->oom_score_adjust);
3970 if (ERRNO_IS_PRIVILEGE(r))
3971 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
3972 else if (r < 0) {
3973 *exit_status = EXIT_OOM_ADJUST;
3974 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
3975 }
3976 }
3977
3978 if (context->coredump_filter_set) {
3979 r = set_coredump_filter(context->coredump_filter);
3980 if (ERRNO_IS_PRIVILEGE(r))
3981 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3982 else if (r < 0)
3983 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3984 }
3985
3986 if (context->nice_set) {
3987 r = setpriority_closest(context->nice);
3988 if (r < 0)
3989 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3990 }
3991
3992 if (context->cpu_sched_set) {
3993 struct sched_param param = {
3994 .sched_priority = context->cpu_sched_priority,
3995 };
3996
3997 r = sched_setscheduler(0,
3998 context->cpu_sched_policy |
3999 (context->cpu_sched_reset_on_fork ?
4000 SCHED_RESET_ON_FORK : 0),
4001 &param);
4002 if (r < 0) {
4003 *exit_status = EXIT_SETSCHEDULER;
4004 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4005 }
4006 }
4007
4008 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4009 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4010 const CPUSet *cpu_set;
4011
4012 if (context->cpu_affinity_from_numa) {
4013 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4014 if (r < 0) {
4015 *exit_status = EXIT_CPUAFFINITY;
4016 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4017 }
4018
4019 cpu_set = &converted_cpu_set;
4020 } else
4021 cpu_set = &context->cpu_set;
4022
4023 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4024 *exit_status = EXIT_CPUAFFINITY;
4025 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4026 }
4027 }
4028
4029 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4030 r = apply_numa_policy(&context->numa_policy);
4031 if (r == -EOPNOTSUPP)
4032 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4033 else if (r < 0) {
4034 *exit_status = EXIT_NUMA_POLICY;
4035 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4036 }
4037 }
4038
4039 if (context->ioprio_set)
4040 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4041 *exit_status = EXIT_IOPRIO;
4042 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4043 }
4044
4045 if (context->timer_slack_nsec != NSEC_INFINITY)
4046 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4047 *exit_status = EXIT_TIMERSLACK;
4048 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4049 }
4050
4051 if (context->personality != PERSONALITY_INVALID) {
4052 r = safe_personality(context->personality);
4053 if (r < 0) {
4054 *exit_status = EXIT_PERSONALITY;
4055 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4056 }
4057 }
4058
4059 if (context->utmp_id)
4060 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4061 context->tty_path,
4062 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4063 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4064 USER_PROCESS,
4065 username);
4066
4067 if (uid_is_valid(uid)) {
4068 r = chown_terminal(STDIN_FILENO, uid);
4069 if (r < 0) {
4070 *exit_status = EXIT_STDIN;
4071 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4072 }
4073 }
4074
4075 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4076 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4077 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4078 * touch a single hierarchy too. */
4079 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4080 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4081 if (r < 0) {
4082 *exit_status = EXIT_CGROUP;
4083 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4084 }
4085 }
4086
4087 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4088 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
4089 if (r < 0)
4090 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4091 }
4092
4093 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4094 r = setup_credentials(context, params, unit->id, uid);
4095 if (r < 0) {
4096 *exit_status = EXIT_CREDENTIALS;
4097 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4098 }
4099 }
4100
4101 r = build_environment(
4102 unit,
4103 context,
4104 params,
4105 n_fds,
4106 home,
4107 username,
4108 shell,
4109 journal_stream_dev,
4110 journal_stream_ino,
4111 &our_env);
4112 if (r < 0) {
4113 *exit_status = EXIT_MEMORY;
4114 return log_oom();
4115 }
4116
4117 r = build_pass_environment(context, &pass_env);
4118 if (r < 0) {
4119 *exit_status = EXIT_MEMORY;
4120 return log_oom();
4121 }
4122
4123 accum_env = strv_env_merge(5,
4124 params->environment,
4125 our_env,
4126 pass_env,
4127 context->environment,
4128 files_env);
4129 if (!accum_env) {
4130 *exit_status = EXIT_MEMORY;
4131 return log_oom();
4132 }
4133 accum_env = strv_env_clean(accum_env);
4134
4135 (void) umask(context->umask);
4136
4137 r = setup_keyring(unit, context, params, uid, gid);
4138 if (r < 0) {
4139 *exit_status = EXIT_KEYRING;
4140 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4141 }
4142
4143 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
4144 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4145
4146 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4147 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4148
4149 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4150 if (needs_ambient_hack)
4151 needs_setuid = false;
4152 else
4153 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4154
4155 if (needs_sandboxing) {
4156 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4157 * present. The actual MAC context application will happen later, as late as possible, to avoid
4158 * impacting our own code paths. */
4159
4160 #if HAVE_SELINUX
4161 use_selinux = mac_selinux_use();
4162 #endif
4163 #if ENABLE_SMACK
4164 use_smack = mac_smack_use();
4165 #endif
4166 #if HAVE_APPARMOR
4167 use_apparmor = mac_apparmor_use();
4168 #endif
4169 }
4170
4171 if (needs_sandboxing) {
4172 int which_failed;
4173
4174 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4175 * is set here. (See below.) */
4176
4177 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4178 if (r < 0) {
4179 *exit_status = EXIT_LIMITS;
4180 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4181 }
4182 }
4183
4184 if (needs_setuid && context->pam_name && username) {
4185 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4186 * wins here. (See above.) */
4187
4188 /* All fds passed in the fds array will be closed in the pam child process. */
4189 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4190 if (r < 0) {
4191 *exit_status = EXIT_PAM;
4192 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4193 }
4194
4195 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4196 if (ngids_after_pam < 0) {
4197 *exit_status = EXIT_MEMORY;
4198 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4199 }
4200 }
4201
4202 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4203 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4204 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4205 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4206
4207 userns_set_up = true;
4208 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4209 if (r < 0) {
4210 *exit_status = EXIT_USER;
4211 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4212 }
4213 }
4214
4215 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4216
4217 if (ns_type_supported(NAMESPACE_NET)) {
4218 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4219 if (r == -EPERM)
4220 log_unit_warning_errno(unit, r,
4221 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4222 else if (r < 0) {
4223 *exit_status = EXIT_NETWORK;
4224 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4225 }
4226 } else if (context->network_namespace_path) {
4227 *exit_status = EXIT_NETWORK;
4228 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4229 "NetworkNamespacePath= is not supported, refusing.");
4230 } else
4231 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4232 }
4233
4234 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4235
4236 if (ns_type_supported(NAMESPACE_IPC)) {
4237 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4238 if (r == -EPERM)
4239 log_unit_warning_errno(unit, r,
4240 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4241 else if (r < 0) {
4242 *exit_status = EXIT_NAMESPACE;
4243 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4244 }
4245 } else if (context->ipc_namespace_path) {
4246 *exit_status = EXIT_NAMESPACE;
4247 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4248 "IPCNamespacePath= is not supported, refusing.");
4249 } else
4250 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4251 }
4252
4253 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4254 if (needs_mount_namespace) {
4255 _cleanup_free_ char *error_path = NULL;
4256
4257 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4258 if (r < 0) {
4259 *exit_status = EXIT_NAMESPACE;
4260 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4261 error_path ? ": " : "", strempty(error_path));
4262 }
4263 }
4264
4265 if (needs_sandboxing) {
4266 r = apply_protect_hostname(unit, context, exit_status);
4267 if (r < 0)
4268 return r;
4269 }
4270
4271 /* Drop groups as early as possible.
4272 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4273 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4274 if (needs_setuid) {
4275 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4276 int ngids_to_enforce = 0;
4277
4278 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4279 ngids,
4280 gids_after_pam,
4281 ngids_after_pam,
4282 &gids_to_enforce);
4283 if (ngids_to_enforce < 0) {
4284 *exit_status = EXIT_MEMORY;
4285 return log_unit_error_errno(unit,
4286 ngids_to_enforce,
4287 "Failed to merge group lists. Group membership might be incorrect: %m");
4288 }
4289
4290 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4291 if (r < 0) {
4292 *exit_status = EXIT_GROUP;
4293 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4294 }
4295 }
4296
4297 /* If the user namespace was not set up above, try to do it now.
4298 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4299 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4300 * case of mount namespaces being less privileged when the mount point list is copied from a
4301 * different user namespace). */
4302
4303 if (needs_sandboxing && context->private_users && !userns_set_up) {
4304 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4305 if (r < 0) {
4306 *exit_status = EXIT_USER;
4307 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4308 }
4309 }
4310
4311 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4312 * shall execute. */
4313
4314 _cleanup_free_ char *executable = NULL;
4315 _cleanup_close_ int executable_fd = -1;
4316 r = find_executable_full(command->path, false, &executable, &executable_fd);
4317 if (r < 0) {
4318 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4319 log_unit_struct_errno(unit, LOG_INFO, r,
4320 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4321 LOG_UNIT_INVOCATION_ID(unit),
4322 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4323 command->path),
4324 "EXECUTABLE=%s", command->path);
4325 return 0;
4326 }
4327
4328 *exit_status = EXIT_EXEC;
4329
4330 return log_unit_struct_errno(unit, LOG_INFO, r,
4331 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4332 LOG_UNIT_INVOCATION_ID(unit),
4333 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4334 command->path),
4335 "EXECUTABLE=%s", command->path);
4336 }
4337
4338 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4339 if (r < 0) {
4340 *exit_status = EXIT_FDS;
4341 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4342 }
4343
4344 #if HAVE_SELINUX
4345 if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4346 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4347 if (r < 0) {
4348 *exit_status = EXIT_SELINUX_CONTEXT;
4349 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4350 }
4351 }
4352 #endif
4353
4354 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4355 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4356 * however if we have it as we want to keep it open until the final execve(). */
4357
4358 r = close_all_fds(keep_fds, n_keep_fds);
4359 if (r >= 0)
4360 r = shift_fds(fds, n_fds);
4361 if (r >= 0)
4362 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4363 if (r < 0) {
4364 *exit_status = EXIT_FDS;
4365 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4366 }
4367
4368 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4369 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4370 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4371 * came this far. */
4372
4373 secure_bits = context->secure_bits;
4374
4375 if (needs_sandboxing) {
4376 uint64_t bset;
4377
4378 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4379 * requested. (Note this is placed after the general resource limit initialization, see
4380 * above, in order to take precedence.) */
4381 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4382 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4383 *exit_status = EXIT_LIMITS;
4384 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4385 }
4386 }
4387
4388 #if ENABLE_SMACK
4389 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4390 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4391 if (use_smack) {
4392 r = setup_smack(context, executable_fd);
4393 if (r < 0) {
4394 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4395 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4396 }
4397 }
4398 #endif
4399
4400 bset = context->capability_bounding_set;
4401 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4402 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4403 * instead of us doing that */
4404 if (needs_ambient_hack)
4405 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4406 (UINT64_C(1) << CAP_SETUID) |
4407 (UINT64_C(1) << CAP_SETGID);
4408
4409 if (!cap_test_all(bset)) {
4410 r = capability_bounding_set_drop(bset, false);
4411 if (r < 0) {
4412 *exit_status = EXIT_CAPABILITIES;
4413 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4414 }
4415 }
4416
4417 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4418 * keep-caps set.
4419 * To be able to raise the ambient capabilities after setresuid() they have to be
4420 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4421 * After setresuid() the ambient capabilities can be raised as they are present in
4422 * the permitted and inheritable set. However it is possible that someone wants to
4423 * set ambient capabilities without changing the user, so we also set the ambient
4424 * capabilities here.
4425 * The requested ambient capabilities are raised in the inheritable set if the
4426 * second argument is true. */
4427 if (!needs_ambient_hack) {
4428 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4429 if (r < 0) {
4430 *exit_status = EXIT_CAPABILITIES;
4431 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4432 }
4433 }
4434 }
4435
4436 /* chroot to root directory first, before we lose the ability to chroot */
4437 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4438 if (r < 0)
4439 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4440
4441 if (needs_setuid) {
4442 if (uid_is_valid(uid)) {
4443 r = enforce_user(context, uid);
4444 if (r < 0) {
4445 *exit_status = EXIT_USER;
4446 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4447 }
4448
4449 if (!needs_ambient_hack &&
4450 context->capability_ambient_set != 0) {
4451
4452 /* Raise the ambient capabilities after user change. */
4453 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4454 if (r < 0) {
4455 *exit_status = EXIT_CAPABILITIES;
4456 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4457 }
4458 }
4459 }
4460 }
4461
4462 /* Apply working directory here, because the working directory might be on NFS and only the user running
4463 * this service might have the correct privilege to change to the working directory */
4464 r = apply_working_directory(context, params, home, exit_status);
4465 if (r < 0)
4466 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4467
4468 if (needs_sandboxing) {
4469 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4470 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4471 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4472 * are restricted. */
4473
4474 #if HAVE_SELINUX
4475 if (use_selinux) {
4476 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4477
4478 if (exec_context) {
4479 r = setexeccon(exec_context);
4480 if (r < 0) {
4481 *exit_status = EXIT_SELINUX_CONTEXT;
4482 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4483 }
4484 }
4485 }
4486 #endif
4487
4488 #if HAVE_APPARMOR
4489 if (use_apparmor && context->apparmor_profile) {
4490 r = aa_change_onexec(context->apparmor_profile);
4491 if (r < 0 && !context->apparmor_profile_ignore) {
4492 *exit_status = EXIT_APPARMOR_PROFILE;
4493 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4494 }
4495 }
4496 #endif
4497
4498 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4499 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4500 * CAP_SETPCAP. */
4501 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4502 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4503 * effective set here.
4504 * The effective set is overwritten during execve with the following values:
4505 * - ambient set (for non-root processes)
4506 * - (inheritable | bounding) set for root processes)
4507 *
4508 * Hence there is no security impact to raise it in the effective set before execve
4509 */
4510 r = capability_gain_cap_setpcap(NULL);
4511 if (r < 0) {
4512 *exit_status = EXIT_CAPABILITIES;
4513 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4514 }
4515 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4516 *exit_status = EXIT_SECUREBITS;
4517 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4518 }
4519 }
4520
4521 if (context_has_no_new_privileges(context))
4522 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4523 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4524 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4525 }
4526
4527 #if HAVE_SECCOMP
4528 r = apply_address_families(unit, context);
4529 if (r < 0) {
4530 *exit_status = EXIT_ADDRESS_FAMILIES;
4531 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4532 }
4533
4534 r = apply_memory_deny_write_execute(unit, context);
4535 if (r < 0) {
4536 *exit_status = EXIT_SECCOMP;
4537 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4538 }
4539
4540 r = apply_restrict_realtime(unit, context);
4541 if (r < 0) {
4542 *exit_status = EXIT_SECCOMP;
4543 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4544 }
4545
4546 r = apply_restrict_suid_sgid(unit, context);
4547 if (r < 0) {
4548 *exit_status = EXIT_SECCOMP;
4549 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4550 }
4551
4552 r = apply_restrict_namespaces(unit, context);
4553 if (r < 0) {
4554 *exit_status = EXIT_SECCOMP;
4555 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4556 }
4557
4558 r = apply_protect_sysctl(unit, context);
4559 if (r < 0) {
4560 *exit_status = EXIT_SECCOMP;
4561 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4562 }
4563
4564 r = apply_protect_kernel_modules(unit, context);
4565 if (r < 0) {
4566 *exit_status = EXIT_SECCOMP;
4567 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4568 }
4569
4570 r = apply_protect_kernel_logs(unit, context);
4571 if (r < 0) {
4572 *exit_status = EXIT_SECCOMP;
4573 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4574 }
4575
4576 r = apply_protect_clock(unit, context);
4577 if (r < 0) {
4578 *exit_status = EXIT_SECCOMP;
4579 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4580 }
4581
4582 r = apply_private_devices(unit, context);
4583 if (r < 0) {
4584 *exit_status = EXIT_SECCOMP;
4585 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
4586 }
4587
4588 r = apply_syscall_archs(unit, context);
4589 if (r < 0) {
4590 *exit_status = EXIT_SECCOMP;
4591 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
4592 }
4593
4594 r = apply_lock_personality(unit, context);
4595 if (r < 0) {
4596 *exit_status = EXIT_SECCOMP;
4597 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
4598 }
4599
4600 r = apply_syscall_log(unit, context);
4601 if (r < 0) {
4602 *exit_status = EXIT_SECCOMP;
4603 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4604 }
4605
4606 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4607 * by the filter as little as possible. */
4608 r = apply_syscall_filter(unit, context, needs_ambient_hack);
4609 if (r < 0) {
4610 *exit_status = EXIT_SECCOMP;
4611 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
4612 }
4613 #endif
4614 }
4615
4616 if (!strv_isempty(context->unset_environment)) {
4617 char **ee = NULL;
4618
4619 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4620 if (!ee) {
4621 *exit_status = EXIT_MEMORY;
4622 return log_oom();
4623 }
4624
4625 strv_free_and_replace(accum_env, ee);
4626 }
4627
4628 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4629 replaced_argv = replace_env_argv(command->argv, accum_env);
4630 if (!replaced_argv) {
4631 *exit_status = EXIT_MEMORY;
4632 return log_oom();
4633 }
4634 final_argv = replaced_argv;
4635 } else
4636 final_argv = command->argv;
4637
4638 if (DEBUG_LOGGING) {
4639 _cleanup_free_ char *line = NULL;
4640
4641 line = exec_command_line(final_argv);
4642 if (line)
4643 log_unit_struct(unit, LOG_DEBUG,
4644 "EXECUTABLE=%s", executable,
4645 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4646 LOG_UNIT_INVOCATION_ID(unit));
4647 }
4648
4649 if (exec_fd >= 0) {
4650 uint8_t hot = 1;
4651
4652 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4653 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4654
4655 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4656 *exit_status = EXIT_EXEC;
4657 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4658 }
4659 }
4660
4661 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
4662
4663 if (exec_fd >= 0) {
4664 uint8_t hot = 0;
4665
4666 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4667 * that POLLHUP on it no longer means execve() succeeded. */
4668
4669 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4670 *exit_status = EXIT_EXEC;
4671 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4672 }
4673 }
4674
4675 *exit_status = EXIT_EXEC;
4676 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
4677 }
4678
4679 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
4680 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
4681
4682 int exec_spawn(Unit *unit,
4683 ExecCommand *command,
4684 const ExecContext *context,
4685 const ExecParameters *params,
4686 ExecRuntime *runtime,
4687 DynamicCreds *dcreds,
4688 pid_t *ret) {
4689
4690 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
4691 _cleanup_free_ char *subcgroup_path = NULL;
4692 _cleanup_strv_free_ char **files_env = NULL;
4693 size_t n_storage_fds = 0, n_socket_fds = 0;
4694 _cleanup_free_ char *line = NULL;
4695 pid_t pid;
4696
4697 assert(unit);
4698 assert(command);
4699 assert(context);
4700 assert(ret);
4701 assert(params);
4702 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4703
4704 if (context->std_input == EXEC_INPUT_SOCKET ||
4705 context->std_output == EXEC_OUTPUT_SOCKET ||
4706 context->std_error == EXEC_OUTPUT_SOCKET) {
4707
4708 if (params->n_socket_fds > 1)
4709 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4710
4711 if (params->n_socket_fds == 0)
4712 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4713
4714 socket_fd = params->fds[0];
4715 } else {
4716 socket_fd = -1;
4717 fds = params->fds;
4718 n_socket_fds = params->n_socket_fds;
4719 n_storage_fds = params->n_storage_fds;
4720 }
4721
4722 r = exec_context_named_iofds(context, params, named_iofds);
4723 if (r < 0)
4724 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4725
4726 r = exec_context_load_environment(unit, context, &files_env);
4727 if (r < 0)
4728 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
4729
4730 line = exec_command_line(command->argv);
4731 if (!line)
4732 return log_oom();
4733
4734 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4735 and, until the next SELinux policy changes, we save further reloads in future children. */
4736 mac_selinux_maybe_reload();
4737
4738 log_unit_struct(unit, LOG_DEBUG,
4739 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4740 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4741 the mount namespace in the child, but we want to log
4742 from the parent, so we need to use the (possibly
4743 inaccurate) path here. */
4744 LOG_UNIT_INVOCATION_ID(unit));
4745
4746 if (params->cgroup_path) {
4747 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4748 if (r < 0)
4749 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4750 if (r > 0) { /* We are using a child cgroup */
4751 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4752 if (r < 0)
4753 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4754
4755 /* Normally we would not propagate the oomd xattrs to children but since we created this
4756 * sub-cgroup internally we should do it. */
4757 cgroup_oomd_xattr_apply(unit, subcgroup_path);
4758 }
4759 }
4760
4761 pid = fork();
4762 if (pid < 0)
4763 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
4764
4765 if (pid == 0) {
4766 int exit_status = EXIT_SUCCESS;
4767
4768 r = exec_child(unit,
4769 command,
4770 context,
4771 params,
4772 runtime,
4773 dcreds,
4774 socket_fd,
4775 named_iofds,
4776 fds,
4777 n_socket_fds,
4778 n_storage_fds,
4779 files_env,
4780 unit->manager->user_lookup_fds[1],
4781 &exit_status);
4782
4783 if (r < 0) {
4784 const char *status =
4785 exit_status_to_string(exit_status,
4786 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
4787
4788 log_unit_struct_errno(unit, LOG_ERR, r,
4789 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4790 LOG_UNIT_INVOCATION_ID(unit),
4791 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4792 status, command->path),
4793 "EXECUTABLE=%s", command->path);
4794 }
4795
4796 _exit(exit_status);
4797 }
4798
4799 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
4800
4801 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4802 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4803 * process will be killed too). */
4804 if (subcgroup_path)
4805 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
4806
4807 exec_status_start(&command->exec_status, pid);
4808
4809 *ret = pid;
4810 return 0;
4811 }
4812
4813 void exec_context_init(ExecContext *c) {
4814 assert(c);
4815
4816 c->umask = 0022;
4817 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
4818 c->cpu_sched_policy = SCHED_OTHER;
4819 c->syslog_priority = LOG_DAEMON|LOG_INFO;
4820 c->syslog_level_prefix = true;
4821 c->ignore_sigpipe = true;
4822 c->timer_slack_nsec = NSEC_INFINITY;
4823 c->personality = PERSONALITY_INVALID;
4824 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4825 c->directories[t].mode = 0755;
4826 c->timeout_clean_usec = USEC_INFINITY;
4827 c->capability_bounding_set = CAP_ALL;
4828 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4829 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
4830 c->log_level_max = -1;
4831 #if HAVE_SECCOMP
4832 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4833 #endif
4834 numa_policy_reset(&c->numa_policy);
4835 }
4836
4837 void exec_context_done(ExecContext *c) {
4838 assert(c);
4839
4840 c->environment = strv_free(c->environment);
4841 c->environment_files = strv_free(c->environment_files);
4842 c->pass_environment = strv_free(c->pass_environment);
4843 c->unset_environment = strv_free(c->unset_environment);
4844
4845 rlimit_free_all(c->rlimit);
4846
4847 for (size_t l = 0; l < 3; l++) {
4848 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
4849 c->stdio_file[l] = mfree(c->stdio_file[l]);
4850 }
4851
4852 c->working_directory = mfree(c->working_directory);
4853 c->root_directory = mfree(c->root_directory);
4854 c->root_image = mfree(c->root_image);
4855 c->root_image_options = mount_options_free_all(c->root_image_options);
4856 c->root_hash = mfree(c->root_hash);
4857 c->root_hash_size = 0;
4858 c->root_hash_path = mfree(c->root_hash_path);
4859 c->root_hash_sig = mfree(c->root_hash_sig);
4860 c->root_hash_sig_size = 0;
4861 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
4862 c->root_verity = mfree(c->root_verity);
4863 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
4864 c->tty_path = mfree(c->tty_path);
4865 c->syslog_identifier = mfree(c->syslog_identifier);
4866 c->user = mfree(c->user);
4867 c->group = mfree(c->group);
4868
4869 c->supplementary_groups = strv_free(c->supplementary_groups);
4870
4871 c->pam_name = mfree(c->pam_name);
4872
4873 c->read_only_paths = strv_free(c->read_only_paths);
4874 c->read_write_paths = strv_free(c->read_write_paths);
4875 c->inaccessible_paths = strv_free(c->inaccessible_paths);
4876 c->exec_paths = strv_free(c->exec_paths);
4877 c->no_exec_paths = strv_free(c->no_exec_paths);
4878
4879 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
4880 c->bind_mounts = NULL;
4881 c->n_bind_mounts = 0;
4882 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4883 c->temporary_filesystems = NULL;
4884 c->n_temporary_filesystems = 0;
4885 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
4886
4887 cpu_set_reset(&c->cpu_set);
4888 numa_policy_reset(&c->numa_policy);
4889
4890 c->utmp_id = mfree(c->utmp_id);
4891 c->selinux_context = mfree(c->selinux_context);
4892 c->apparmor_profile = mfree(c->apparmor_profile);
4893 c->smack_process_label = mfree(c->smack_process_label);
4894
4895 c->syscall_filter = hashmap_free(c->syscall_filter);
4896 c->syscall_archs = set_free(c->syscall_archs);
4897 c->address_families = set_free(c->address_families);
4898
4899 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4900 c->directories[t].paths = strv_free(c->directories[t].paths);
4901
4902 c->log_level_max = -1;
4903
4904 exec_context_free_log_extra_fields(c);
4905
4906 c->log_ratelimit_interval_usec = 0;
4907 c->log_ratelimit_burst = 0;
4908
4909 c->stdin_data = mfree(c->stdin_data);
4910 c->stdin_data_size = 0;
4911
4912 c->network_namespace_path = mfree(c->network_namespace_path);
4913 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
4914
4915 c->log_namespace = mfree(c->log_namespace);
4916
4917 c->load_credentials = strv_free(c->load_credentials);
4918 c->set_credentials = hashmap_free(c->set_credentials);
4919 }
4920
4921 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
4922 char **i;
4923
4924 assert(c);
4925
4926 if (!runtime_prefix)
4927 return 0;
4928
4929 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
4930 _cleanup_free_ char *p = NULL;
4931
4932 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4933 p = path_join(runtime_prefix, "private", *i);
4934 else
4935 p = path_join(runtime_prefix, *i);
4936 if (!p)
4937 return -ENOMEM;
4938
4939 /* We execute this synchronously, since we need to be sure this is gone when we start the
4940 * service next. */
4941 (void) rm_rf(p, REMOVE_ROOT);
4942 }
4943
4944 return 0;
4945 }
4946
4947 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4948 _cleanup_free_ char *p = NULL;
4949
4950 assert(c);
4951
4952 if (!runtime_prefix || !unit)
4953 return 0;
4954
4955 p = path_join(runtime_prefix, "credentials", unit);
4956 if (!p)
4957 return -ENOMEM;
4958
4959 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4960 * unmount it, and afterwards remove the mount point */
4961 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4962 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4963
4964 return 0;
4965 }
4966
4967 static void exec_command_done(ExecCommand *c) {
4968 assert(c);
4969
4970 c->path = mfree(c->path);
4971 c->argv = strv_free(c->argv);
4972 }
4973
4974 void exec_command_done_array(ExecCommand *c, size_t n) {
4975 for (size_t i = 0; i < n; i++)
4976 exec_command_done(c+i);
4977 }
4978
4979 ExecCommand* exec_command_free_list(ExecCommand *c) {
4980 ExecCommand *i;
4981
4982 while ((i = c)) {
4983 LIST_REMOVE(command, c, i);
4984 exec_command_done(i);
4985 free(i);
4986 }
4987
4988 return NULL;
4989 }
4990
4991 void exec_command_free_array(ExecCommand **c, size_t n) {
4992 for (size_t i = 0; i < n; i++)
4993 c[i] = exec_command_free_list(c[i]);
4994 }
4995
4996 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4997 for (size_t i = 0; i < n; i++)
4998 exec_status_reset(&c[i].exec_status);
4999 }
5000
5001 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5002 for (size_t i = 0; i < n; i++) {
5003 ExecCommand *z;
5004
5005 LIST_FOREACH(command, z, c[i])
5006 exec_status_reset(&z->exec_status);
5007 }
5008 }
5009
5010 typedef struct InvalidEnvInfo {
5011 const Unit *unit;
5012 const char *path;
5013 } InvalidEnvInfo;
5014
5015 static void invalid_env(const char *p, void *userdata) {
5016 InvalidEnvInfo *info = userdata;
5017
5018 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5019 }
5020
5021 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5022 assert(c);
5023
5024 switch (fd_index) {
5025
5026 case STDIN_FILENO:
5027 if (c->std_input != EXEC_INPUT_NAMED_FD)
5028 return NULL;
5029
5030 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5031
5032 case STDOUT_FILENO:
5033 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5034 return NULL;
5035
5036 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5037
5038 case STDERR_FILENO:
5039 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5040 return NULL;
5041
5042 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5043
5044 default:
5045 return NULL;
5046 }
5047 }
5048
5049 static int exec_context_named_iofds(
5050 const ExecContext *c,
5051 const ExecParameters *p,
5052 int named_iofds[static 3]) {
5053
5054 size_t targets;
5055 const char* stdio_fdname[3];
5056 size_t n_fds;
5057
5058 assert(c);
5059 assert(p);
5060 assert(named_iofds);
5061
5062 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5063 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5064 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5065
5066 for (size_t i = 0; i < 3; i++)
5067 stdio_fdname[i] = exec_context_fdname(c, i);
5068
5069 n_fds = p->n_storage_fds + p->n_socket_fds;
5070
5071 for (size_t i = 0; i < n_fds && targets > 0; i++)
5072 if (named_iofds[STDIN_FILENO] < 0 &&
5073 c->std_input == EXEC_INPUT_NAMED_FD &&
5074 stdio_fdname[STDIN_FILENO] &&
5075 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5076
5077 named_iofds[STDIN_FILENO] = p->fds[i];
5078 targets--;
5079
5080 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5081 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5082 stdio_fdname[STDOUT_FILENO] &&
5083 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5084
5085 named_iofds[STDOUT_FILENO] = p->fds[i];
5086 targets--;
5087
5088 } else if (named_iofds[STDERR_FILENO] < 0 &&
5089 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5090 stdio_fdname[STDERR_FILENO] &&
5091 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5092
5093 named_iofds[STDERR_FILENO] = p->fds[i];
5094 targets--;
5095 }
5096
5097 return targets == 0 ? 0 : -ENOENT;
5098 }
5099
5100 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
5101 char **i, **r = NULL;
5102
5103 assert(c);
5104 assert(l);
5105
5106 STRV_FOREACH(i, c->environment_files) {
5107 char *fn;
5108 int k;
5109 bool ignore = false;
5110 char **p;
5111 _cleanup_globfree_ glob_t pglob = {};
5112
5113 fn = *i;
5114
5115 if (fn[0] == '-') {
5116 ignore = true;
5117 fn++;
5118 }
5119
5120 if (!path_is_absolute(fn)) {
5121 if (ignore)
5122 continue;
5123
5124 strv_free(r);
5125 return -EINVAL;
5126 }
5127
5128 /* Filename supports globbing, take all matching files */
5129 k = safe_glob(fn, 0, &pglob);
5130 if (k < 0) {
5131 if (ignore)
5132 continue;
5133
5134 strv_free(r);
5135 return k;
5136 }
5137
5138 /* When we don't match anything, -ENOENT should be returned */
5139 assert(pglob.gl_pathc > 0);
5140
5141 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5142 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
5143 if (k < 0) {
5144 if (ignore)
5145 continue;
5146
5147 strv_free(r);
5148 return k;
5149 }
5150 /* Log invalid environment variables with filename */
5151 if (p) {
5152 InvalidEnvInfo info = {
5153 .unit = unit,
5154 .path = pglob.gl_pathv[n]
5155 };
5156
5157 p = strv_env_clean_with_callback(p, invalid_env, &info);
5158 }
5159
5160 if (!r)
5161 r = p;
5162 else {
5163 char **m;
5164
5165 m = strv_env_merge(2, r, p);
5166 strv_free(r);
5167 strv_free(p);
5168 if (!m)
5169 return -ENOMEM;
5170
5171 r = m;
5172 }
5173 }
5174 }
5175
5176 *l = r;
5177
5178 return 0;
5179 }
5180
5181 static bool tty_may_match_dev_console(const char *tty) {
5182 _cleanup_free_ char *resolved = NULL;
5183
5184 if (!tty)
5185 return true;
5186
5187 tty = skip_dev_prefix(tty);
5188
5189 /* trivial identity? */
5190 if (streq(tty, "console"))
5191 return true;
5192
5193 if (resolve_dev_console(&resolved) < 0)
5194 return true; /* if we could not resolve, assume it may */
5195
5196 /* "tty0" means the active VC, so it may be the same sometimes */
5197 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5198 }
5199
5200 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5201 assert(ec);
5202
5203 return ec->tty_reset ||
5204 ec->tty_vhangup ||
5205 ec->tty_vt_disallocate ||
5206 is_terminal_input(ec->std_input) ||
5207 is_terminal_output(ec->std_output) ||
5208 is_terminal_output(ec->std_error);
5209 }
5210
5211 bool exec_context_may_touch_console(const ExecContext *ec) {
5212
5213 return exec_context_may_touch_tty(ec) &&
5214 tty_may_match_dev_console(exec_context_tty_path(ec));
5215 }
5216
5217 static void strv_fprintf(FILE *f, char **l) {
5218 char **g;
5219
5220 assert(f);
5221
5222 STRV_FOREACH(g, l)
5223 fprintf(f, " %s", *g);
5224 }
5225
/* Writes one "<prefix><name>: item item ...\n" line for the given string list. Nothing is written if
 * the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5237
5238 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5239 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
5240 int r;
5241
5242 assert(c);
5243 assert(f);
5244
5245 prefix = strempty(prefix);
5246
5247 fprintf(f,
5248 "%sUMask: %04o\n"
5249 "%sWorkingDirectory: %s\n"
5250 "%sRootDirectory: %s\n"
5251 "%sNonBlocking: %s\n"
5252 "%sPrivateTmp: %s\n"
5253 "%sPrivateDevices: %s\n"
5254 "%sProtectKernelTunables: %s\n"
5255 "%sProtectKernelModules: %s\n"
5256 "%sProtectKernelLogs: %s\n"
5257 "%sProtectClock: %s\n"
5258 "%sProtectControlGroups: %s\n"
5259 "%sPrivateNetwork: %s\n"
5260 "%sPrivateUsers: %s\n"
5261 "%sProtectHome: %s\n"
5262 "%sProtectSystem: %s\n"
5263 "%sMountAPIVFS: %s\n"
5264 "%sIgnoreSIGPIPE: %s\n"
5265 "%sMemoryDenyWriteExecute: %s\n"
5266 "%sRestrictRealtime: %s\n"
5267 "%sRestrictSUIDSGID: %s\n"
5268 "%sKeyringMode: %s\n"
5269 "%sProtectHostname: %s\n"
5270 "%sProtectProc: %s\n"
5271 "%sProcSubset: %s\n",
5272 prefix, c->umask,
5273 prefix, empty_to_root(c->working_directory),
5274 prefix, empty_to_root(c->root_directory),
5275 prefix, yes_no(c->non_blocking),
5276 prefix, yes_no(c->private_tmp),
5277 prefix, yes_no(c->private_devices),
5278 prefix, yes_no(c->protect_kernel_tunables),
5279 prefix, yes_no(c->protect_kernel_modules),
5280 prefix, yes_no(c->protect_kernel_logs),
5281 prefix, yes_no(c->protect_clock),
5282 prefix, yes_no(c->protect_control_groups),
5283 prefix, yes_no(c->private_network),
5284 prefix, yes_no(c->private_users),
5285 prefix, protect_home_to_string(c->protect_home),
5286 prefix, protect_system_to_string(c->protect_system),
5287 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5288 prefix, yes_no(c->ignore_sigpipe),
5289 prefix, yes_no(c->memory_deny_write_execute),
5290 prefix, yes_no(c->restrict_realtime),
5291 prefix, yes_no(c->restrict_suid_sgid),
5292 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5293 prefix, yes_no(c->protect_hostname),
5294 prefix, protect_proc_to_string(c->protect_proc),
5295 prefix, proc_subset_to_string(c->proc_subset));
5296
5297 if (c->root_image)
5298 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5299
5300 if (c->root_image_options) {
5301 MountOptions *o;
5302
5303 fprintf(f, "%sRootImageOptions:", prefix);
5304 LIST_FOREACH(mount_options, o, c->root_image_options)
5305 if (!isempty(o->options))
5306 fprintf(f, " %s:%s",
5307 partition_designator_to_string(o->partition_designator),
5308 o->options);
5309 fprintf(f, "\n");
5310 }
5311
5312 if (c->root_hash) {
5313 _cleanup_free_ char *encoded = NULL;
5314 encoded = hexmem(c->root_hash, c->root_hash_size);
5315 if (encoded)
5316 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5317 }
5318
5319 if (c->root_hash_path)
5320 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5321
5322 if (c->root_hash_sig) {
5323 _cleanup_free_ char *encoded = NULL;
5324 ssize_t len;
5325 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5326 if (len)
5327 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5328 }
5329
5330 if (c->root_hash_sig_path)
5331 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5332
5333 if (c->root_verity)
5334 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5335
5336 STRV_FOREACH(e, c->environment)
5337 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5338
5339 STRV_FOREACH(e, c->environment_files)
5340 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5341
5342 STRV_FOREACH(e, c->pass_environment)
5343 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5344
5345 STRV_FOREACH(e, c->unset_environment)
5346 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5347
5348 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5349
5350 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5351 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5352
5353 STRV_FOREACH(d, c->directories[dt].paths)
5354 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5355 }
5356
5357 fprintf(f,
5358 "%sTimeoutCleanSec: %s\n",
5359 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5360
5361 if (c->nice_set)
5362 fprintf(f,
5363 "%sNice: %i\n",
5364 prefix, c->nice);
5365
5366 if (c->oom_score_adjust_set)
5367 fprintf(f,
5368 "%sOOMScoreAdjust: %i\n",
5369 prefix, c->oom_score_adjust);
5370
5371 if (c->coredump_filter_set)
5372 fprintf(f,
5373 "%sCoredumpFilter: 0x%"PRIx64"\n",
5374 prefix, c->coredump_filter);
5375
5376 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5377 if (c->rlimit[i]) {
5378 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5379 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5380 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5381 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5382 }
5383
5384 if (c->ioprio_set) {
5385 _cleanup_free_ char *class_str = NULL;
5386
5387 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5388 if (r >= 0)
5389 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5390
5391 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
5392 }
5393
5394 if (c->cpu_sched_set) {
5395 _cleanup_free_ char *policy_str = NULL;
5396
5397 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5398 if (r >= 0)
5399 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5400
5401 fprintf(f,
5402 "%sCPUSchedulingPriority: %i\n"
5403 "%sCPUSchedulingResetOnFork: %s\n",
5404 prefix, c->cpu_sched_priority,
5405 prefix, yes_no(c->cpu_sched_reset_on_fork));
5406 }
5407
5408 if (c->cpu_set.set) {
5409 _cleanup_free_ char *affinity = NULL;
5410
5411 affinity = cpu_set_to_range_string(&c->cpu_set);
5412 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5413 }
5414
5415 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5416 _cleanup_free_ char *nodes = NULL;
5417
5418 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5419 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5420 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5421 }
5422
5423 if (c->timer_slack_nsec != NSEC_INFINITY)
5424 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5425
5426 fprintf(f,
5427 "%sStandardInput: %s\n"
5428 "%sStandardOutput: %s\n"
5429 "%sStandardError: %s\n",
5430 prefix, exec_input_to_string(c->std_input),
5431 prefix, exec_output_to_string(c->std_output),
5432 prefix, exec_output_to_string(c->std_error));
5433
5434 if (c->std_input == EXEC_INPUT_NAMED_FD)
5435 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5436 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5437 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5438 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5439 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5440
5441 if (c->std_input == EXEC_INPUT_FILE)
5442 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5443 if (c->std_output == EXEC_OUTPUT_FILE)
5444 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5445 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5446 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5447 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5448 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5449 if (c->std_error == EXEC_OUTPUT_FILE)
5450 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5451 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5452 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5453 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5454 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5455
5456 if (c->tty_path)
5457 fprintf(f,
5458 "%sTTYPath: %s\n"
5459 "%sTTYReset: %s\n"
5460 "%sTTYVHangup: %s\n"
5461 "%sTTYVTDisallocate: %s\n",
5462 prefix, c->tty_path,
5463 prefix, yes_no(c->tty_reset),
5464 prefix, yes_no(c->tty_vhangup),
5465 prefix, yes_no(c->tty_vt_disallocate));
5466
5467 if (IN_SET(c->std_output,
5468 EXEC_OUTPUT_KMSG,
5469 EXEC_OUTPUT_JOURNAL,
5470 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5471 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5472 IN_SET(c->std_error,
5473 EXEC_OUTPUT_KMSG,
5474 EXEC_OUTPUT_JOURNAL,
5475 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5476 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5477
5478 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5479
5480 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5481 if (r >= 0)
5482 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5483
5484 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5485 if (r >= 0)
5486 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5487 }
5488
5489 if (c->log_level_max >= 0) {
5490 _cleanup_free_ char *t = NULL;
5491
5492 (void) log_level_to_string_alloc(c->log_level_max, &t);
5493
5494 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5495 }
5496
5497 if (c->log_ratelimit_interval_usec > 0) {
5498 char buf_timespan[FORMAT_TIMESPAN_MAX];
5499
5500 fprintf(f,
5501 "%sLogRateLimitIntervalSec: %s\n",
5502 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
5503 }
5504
5505 if (c->log_ratelimit_burst > 0)
5506 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5507
5508 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5509 fprintf(f, "%sLogExtraFields: ", prefix);
5510 fwrite(c->log_extra_fields[j].iov_base,
5511 1, c->log_extra_fields[j].iov_len,
5512 f);
5513 fputc('\n', f);
5514 }
5515
5516 if (c->log_namespace)
5517 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5518
5519 if (c->secure_bits) {
5520 _cleanup_free_ char *str = NULL;
5521
5522 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5523 if (r >= 0)
5524 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5525 }
5526
5527 if (c->capability_bounding_set != CAP_ALL) {
5528 _cleanup_free_ char *str = NULL;
5529
5530 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5531 if (r >= 0)
5532 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5533 }
5534
5535 if (c->capability_ambient_set != 0) {
5536 _cleanup_free_ char *str = NULL;
5537
5538 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5539 if (r >= 0)
5540 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5541 }
5542
5543 if (c->user)
5544 fprintf(f, "%sUser: %s\n", prefix, c->user);
5545 if (c->group)
5546 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5547
5548 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5549
5550 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5551
5552 if (c->pam_name)
5553 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5554
5555 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5556 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5557 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5558 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5559 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5560
5561 for (size_t i = 0; i < c->n_bind_mounts; i++)
5562 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5563 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5564 c->bind_mounts[i].ignore_enoent ? "-": "",
5565 c->bind_mounts[i].source,
5566 c->bind_mounts[i].destination,
5567 c->bind_mounts[i].recursive ? "rbind" : "norbind");
5568
5569 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5570 const TemporaryFileSystem *t = c->temporary_filesystems + i;
5571
5572 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5573 t->path,
5574 isempty(t->options) ? "" : ":",
5575 strempty(t->options));
5576 }
5577
5578 if (c->utmp_id)
5579 fprintf(f,
5580 "%sUtmpIdentifier: %s\n",
5581 prefix, c->utmp_id);
5582
5583 if (c->selinux_context)
5584 fprintf(f,
5585 "%sSELinuxContext: %s%s\n",
5586 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
5587
5588 if (c->apparmor_profile)
5589 fprintf(f,
5590 "%sAppArmorProfile: %s%s\n",
5591 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5592
5593 if (c->smack_process_label)
5594 fprintf(f,
5595 "%sSmackProcessLabel: %s%s\n",
5596 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5597
5598 if (c->personality != PERSONALITY_INVALID)
5599 fprintf(f,
5600 "%sPersonality: %s\n",
5601 prefix, strna(personality_to_string(c->personality)));
5602
5603 fprintf(f,
5604 "%sLockPersonality: %s\n",
5605 prefix, yes_no(c->lock_personality));
5606
5607 if (c->syscall_filter) {
5608 #if HAVE_SECCOMP
5609 void *id, *val;
5610 bool first = true;
5611 #endif
5612
5613 fprintf(f,
5614 "%sSystemCallFilter: ",
5615 prefix);
5616
5617 if (!c->syscall_allow_list)
5618 fputc('~', f);
5619
5620 #if HAVE_SECCOMP
5621 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
5622 _cleanup_free_ char *name = NULL;
5623 const char *errno_name = NULL;
5624 int num = PTR_TO_INT(val);
5625
5626 if (first)
5627 first = false;
5628 else
5629 fputc(' ', f);
5630
5631 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
5632 fputs(strna(name), f);
5633
5634 if (num >= 0) {
5635 errno_name = seccomp_errno_or_action_to_string(num);
5636 if (errno_name)
5637 fprintf(f, ":%s", errno_name);
5638 else
5639 fprintf(f, ":%d", num);
5640 }
5641 }
5642 #endif
5643
5644 fputc('\n', f);
5645 }
5646
5647 if (c->syscall_archs) {
5648 #if HAVE_SECCOMP
5649 void *id;
5650 #endif
5651
5652 fprintf(f,
5653 "%sSystemCallArchitectures:",
5654 prefix);
5655
5656 #if HAVE_SECCOMP
5657 SET_FOREACH(id, c->syscall_archs)
5658 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5659 #endif
5660 fputc('\n', f);
5661 }
5662
5663 if (exec_context_restrict_namespaces_set(c)) {
5664 _cleanup_free_ char *s = NULL;
5665
5666 r = namespace_flags_to_string(c->restrict_namespaces, &s);
5667 if (r >= 0)
5668 fprintf(f, "%sRestrictNamespaces: %s\n",
5669 prefix, strna(s));
5670 }
5671
5672 if (c->network_namespace_path)
5673 fprintf(f,
5674 "%sNetworkNamespacePath: %s\n",
5675 prefix, c->network_namespace_path);
5676
5677 if (c->syscall_errno > 0) {
5678 #if HAVE_SECCOMP
5679 const char *errno_name;
5680 #endif
5681
5682 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5683
5684 #if HAVE_SECCOMP
5685 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
5686 if (errno_name)
5687 fputs(errno_name, f);
5688 else
5689 fprintf(f, "%d", c->syscall_errno);
5690 #endif
5691 fputc('\n', f);
5692 }
5693
5694 for (size_t i = 0; i < c->n_mount_images; i++) {
5695 MountOptions *o;
5696
5697 fprintf(f, "%sMountImages: %s%s:%s", prefix,
5698 c->mount_images[i].ignore_enoent ? "-": "",
5699 c->mount_images[i].source,
5700 c->mount_images[i].destination);
5701 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
5702 fprintf(f, ":%s:%s",
5703 partition_designator_to_string(o->partition_designator),
5704 strempty(o->options));
5705 fprintf(f, "\n");
5706 }
5707
5708 for (size_t i = 0; i < c->n_extension_images; i++) {
5709 MountOptions *o;
5710
5711 fprintf(f, "%sExtensionImages: %s%s", prefix,
5712 c->extension_images[i].ignore_enoent ? "-": "",
5713 c->extension_images[i].source);
5714 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5715 fprintf(f, ":%s:%s",
5716 partition_designator_to_string(o->partition_designator),
5717 strempty(o->options));
5718 fprintf(f, "\n");
5719 }
5720 }
5721
5722 bool exec_context_maintains_privileges(const ExecContext *c) {
5723 assert(c);
5724
5725 /* Returns true if the process forked off would run under
5726 * an unchanged UID or as root. */
5727
5728 if (!c->user)
5729 return true;
5730
5731 if (streq(c->user, "root") || streq(c->user, "0"))
5732 return true;
5733
5734 return false;
5735 }
5736
5737 int exec_context_get_effective_ioprio(const ExecContext *c) {
5738 int p;
5739
5740 assert(c);
5741
5742 if (c->ioprio_set)
5743 return c->ioprio;
5744
5745 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5746 if (p < 0)
5747 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5748
5749 return p;
5750 }
5751
5752 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5753 assert(c);
5754
5755 /* Explicit setting wins */
5756 if (c->mount_apivfs_set)
5757 return c->mount_apivfs;
5758
5759 /* Default to "yes" if root directory or image are specified */
5760 if (exec_context_with_rootfs(c))
5761 return true;
5762
5763 return false;
5764 }
5765
5766 void exec_context_free_log_extra_fields(ExecContext *c) {
5767 assert(c);
5768
5769 for (size_t l = 0; l < c->n_log_extra_fields; l++)
5770 free(c->log_extra_fields[l].iov_base);
5771 c->log_extra_fields = mfree(c->log_extra_fields);
5772 c->n_log_extra_fields = 0;
5773 }
5774
5775 void exec_context_revert_tty(ExecContext *c) {
5776 _cleanup_close_ int fd = -1;
5777 const char *path;
5778 struct stat st;
5779 int r;
5780
5781 assert(c);
5782
5783 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5784 exec_context_tty_reset(c, NULL);
5785
5786 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5787 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5788 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
5789 if (!exec_context_may_touch_tty(c))
5790 return;
5791
5792 path = exec_context_tty_path(c);
5793 if (!path)
5794 return;
5795
5796 fd = open(path, O_PATH|O_CLOEXEC);
5797 if (fd < 0)
5798 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5799 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5800 path);
5801
5802 if (fstat(fd, &st) < 0)
5803 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5804
5805 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5806 * if things are a character device, since a proper check either means we'd have to open the TTY and
5807 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5808 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5809 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5810 if (!S_ISCHR(st.st_mode))
5811 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5812
5813 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5814 if (r < 0)
5815 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
5816 }
5817
5818 int exec_context_get_clean_directories(
5819 ExecContext *c,
5820 char **prefix,
5821 ExecCleanMask mask,
5822 char ***ret) {
5823
5824 _cleanup_strv_free_ char **l = NULL;
5825 int r;
5826
5827 assert(c);
5828 assert(prefix);
5829 assert(ret);
5830
5831 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5832 char **i;
5833
5834 if (!FLAGS_SET(mask, 1U << t))
5835 continue;
5836
5837 if (!prefix[t])
5838 continue;
5839
5840 STRV_FOREACH(i, c->directories[t].paths) {
5841 char *j;
5842
5843 j = path_join(prefix[t], *i);
5844 if (!j)
5845 return -ENOMEM;
5846
5847 r = strv_consume(&l, j);
5848 if (r < 0)
5849 return r;
5850
5851 /* Also remove private directories unconditionally. */
5852 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5853 j = path_join(prefix[t], "private", *i);
5854 if (!j)
5855 return -ENOMEM;
5856
5857 r = strv_consume(&l, j);
5858 if (r < 0)
5859 return r;
5860 }
5861 }
5862 }
5863
5864 *ret = TAKE_PTR(l);
5865 return 0;
5866 }
5867
5868 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5869 ExecCleanMask mask = 0;
5870
5871 assert(c);
5872 assert(ret);
5873
5874 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5875 if (!strv_isempty(c->directories[t].paths))
5876 mask |= 1U << t;
5877
5878 *ret = mask;
5879 return 0;
5880 }
5881
5882 void exec_status_start(ExecStatus *s, pid_t pid) {
5883 assert(s);
5884
5885 *s = (ExecStatus) {
5886 .pid = pid,
5887 };
5888
5889 dual_timestamp_get(&s->start_timestamp);
5890 }
5891
5892 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
5893 assert(s);
5894
5895 if (s->pid != pid)
5896 *s = (ExecStatus) {
5897 .pid = pid,
5898 };
5899
5900 dual_timestamp_get(&s->exit_timestamp);
5901
5902 s->code = code;
5903 s->status = status;
5904
5905 if (context && context->utmp_id)
5906 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
5907 }
5908
5909 void exec_status_reset(ExecStatus *s) {
5910 assert(s);
5911
5912 *s = (ExecStatus) {};
5913 }
5914
5915 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
5916 char buf[FORMAT_TIMESTAMP_MAX];
5917
5918 assert(s);
5919 assert(f);
5920
5921 if (s->pid <= 0)
5922 return;
5923
5924 prefix = strempty(prefix);
5925
5926 fprintf(f,
5927 "%sPID: "PID_FMT"\n",
5928 prefix, s->pid);
5929
5930 if (dual_timestamp_is_set(&s->start_timestamp))
5931 fprintf(f,
5932 "%sStart Timestamp: %s\n",
5933 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
5934
5935 if (dual_timestamp_is_set(&s->exit_timestamp))
5936 fprintf(f,
5937 "%sExit Timestamp: %s\n"
5938 "%sExit Code: %s\n"
5939 "%sExit Status: %i\n",
5940 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
5941 prefix, sigchld_code_to_string(s->code),
5942 prefix, s->status);
5943 }
5944
5945 static char *exec_command_line(char **argv) {
5946 size_t k;
5947 char *n, *p, **a;
5948 bool first = true;
5949
5950 assert(argv);
5951
5952 k = 1;
5953 STRV_FOREACH(a, argv)
5954 k += strlen(*a)+3;
5955
5956 n = new(char, k);
5957 if (!n)
5958 return NULL;
5959
5960 p = n;
5961 STRV_FOREACH(a, argv) {
5962
5963 if (!first)
5964 *(p++) = ' ';
5965 else
5966 first = false;
5967
5968 if (strpbrk(*a, WHITESPACE)) {
5969 *(p++) = '\'';
5970 p = stpcpy(p, *a);
5971 *(p++) = '\'';
5972 } else
5973 p = stpcpy(p, *a);
5974
5975 }
5976
5977 *p = 0;
5978
5979 /* FIXME: this doesn't really handle arguments that have
5980 * spaces and ticks in them */
5981
5982 return n;
5983 }
5984
5985 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
5986 _cleanup_free_ char *cmd = NULL;
5987 const char *prefix2;
5988
5989 assert(c);
5990 assert(f);
5991
5992 prefix = strempty(prefix);
5993 prefix2 = strjoina(prefix, "\t");
5994
5995 cmd = exec_command_line(c->argv);
5996 fprintf(f,
5997 "%sCommand Line: %s\n",
5998 prefix, cmd ? cmd : strerror_safe(ENOMEM));
5999
6000 exec_status_dump(&c->exec_status, f, prefix2);
6001 }
6002
6003 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6004 assert(f);
6005
6006 prefix = strempty(prefix);
6007
6008 LIST_FOREACH(command, c, c)
6009 exec_command_dump(c, f, prefix);
6010 }
6011
6012 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6013 ExecCommand *end;
6014
6015 assert(l);
6016 assert(e);
6017
6018 if (*l) {
6019 /* It's kind of important, that we keep the order here */
6020 LIST_FIND_TAIL(command, *l, end);
6021 LIST_INSERT_AFTER(command, *l, end, e);
6022 } else
6023 *l = e;
6024 }
6025
6026 int exec_command_set(ExecCommand *c, const char *path, ...) {
6027 va_list ap;
6028 char **l, *p;
6029
6030 assert(c);
6031 assert(path);
6032
6033 va_start(ap, path);
6034 l = strv_new_ap(path, ap);
6035 va_end(ap);
6036
6037 if (!l)
6038 return -ENOMEM;
6039
6040 p = strdup(path);
6041 if (!p) {
6042 strv_free(l);
6043 return -ENOMEM;
6044 }
6045
6046 free_and_replace(c->path, p);
6047
6048 return strv_free_and_replace(c->argv, l);
6049 }
6050
6051 int exec_command_append(ExecCommand *c, const char *path, ...) {
6052 _cleanup_strv_free_ char **l = NULL;
6053 va_list ap;
6054 int r;
6055
6056 assert(c);
6057 assert(path);
6058
6059 va_start(ap, path);
6060 l = strv_new_ap(path, ap);
6061 va_end(ap);
6062
6063 if (!l)
6064 return -ENOMEM;
6065
6066 r = strv_extend_strv(&c->argv, l, false);
6067 if (r < 0)
6068 return r;
6069
6070 return 0;
6071 }
6072
6073 static void *remove_tmpdir_thread(void *p) {
6074 _cleanup_free_ char *path = p;
6075
6076 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6077 return NULL;
6078 }
6079
6080 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6081 int r;
6082
6083 if (!rt)
6084 return NULL;
6085
6086 if (rt->manager)
6087 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6088
6089 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6090
6091 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6092 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6093
6094 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6095 if (r < 0)
6096 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6097 else
6098 rt->tmp_dir = NULL;
6099 }
6100
6101 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6102 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6103
6104 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6105 if (r < 0)
6106 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6107 else
6108 rt->var_tmp_dir = NULL;
6109 }
6110
6111 rt->id = mfree(rt->id);
6112 rt->tmp_dir = mfree(rt->tmp_dir);
6113 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6114 safe_close_pair(rt->netns_storage_socket);
6115 safe_close_pair(rt->ipcns_storage_socket);
6116 return mfree(rt);
6117 }
6118
6119 static void exec_runtime_freep(ExecRuntime **rt) {
6120 (void) exec_runtime_free(*rt, false);
6121 }
6122
6123 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6124 _cleanup_free_ char *id_copy = NULL;
6125 ExecRuntime *n;
6126
6127 assert(ret);
6128
6129 id_copy = strdup(id);
6130 if (!id_copy)
6131 return -ENOMEM;
6132
6133 n = new(ExecRuntime, 1);
6134 if (!n)
6135 return -ENOMEM;
6136
6137 *n = (ExecRuntime) {
6138 .id = TAKE_PTR(id_copy),
6139 .netns_storage_socket = { -1, -1 },
6140 .ipcns_storage_socket = { -1, -1 },
6141 };
6142
6143 *ret = n;
6144 return 0;
6145 }
6146
6147 static int exec_runtime_add(
6148 Manager *m,
6149 const char *id,
6150 char **tmp_dir,
6151 char **var_tmp_dir,
6152 int netns_storage_socket[2],
6153 int ipcns_storage_socket[2],
6154 ExecRuntime **ret) {
6155
6156 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6157 int r;
6158
6159 assert(m);
6160 assert(id);
6161
6162 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6163
6164 r = exec_runtime_allocate(&rt, id);
6165 if (r < 0)
6166 return r;
6167
6168 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6169 if (r < 0)
6170 return r;
6171
6172 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6173 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6174 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6175
6176 if (netns_storage_socket) {
6177 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6178 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6179 }
6180
6181 if (ipcns_storage_socket) {
6182 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6183 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6184 }
6185
6186 rt->manager = m;
6187
6188 if (ret)
6189 *ret = rt;
6190 /* do not remove created ExecRuntime object when the operation succeeds. */
6191 TAKE_PTR(rt);
6192 return 0;
6193 }
6194
6195 static int exec_runtime_make(
6196 Manager *m,
6197 const ExecContext *c,
6198 const char *id,
6199 ExecRuntime **ret) {
6200
6201 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6202 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6203 int r;
6204
6205 assert(m);
6206 assert(c);
6207 assert(id);
6208
6209 /* It is not necessary to create ExecRuntime object. */
6210 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6211 *ret = NULL;
6212 return 0;
6213 }
6214
6215 if (c->private_tmp &&
6216 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6217 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6218 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6219 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6220 if (r < 0)
6221 return r;
6222 }
6223
6224 if (c->private_network || c->network_namespace_path) {
6225 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6226 return -errno;
6227 }
6228
6229 if (c->private_ipc || c->ipc_namespace_path) {
6230 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6231 return -errno;
6232 }
6233
6234 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6235 if (r < 0)
6236 return r;
6237
6238 return 1;
6239 }
6240
6241 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6242 ExecRuntime *rt;
6243 int r;
6244
6245 assert(m);
6246 assert(id);
6247 assert(ret);
6248
6249 rt = hashmap_get(m->exec_runtime_by_id, id);
6250 if (rt)
6251 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6252 goto ref;
6253
6254 if (!create) {
6255 *ret = NULL;
6256 return 0;
6257 }
6258
6259 /* If not found, then create a new object. */
6260 r = exec_runtime_make(m, c, id, &rt);
6261 if (r < 0)
6262 return r;
6263 if (r == 0) {
6264 /* When r == 0, it is not necessary to create ExecRuntime object. */
6265 *ret = NULL;
6266 return 0;
6267 }
6268
6269 ref:
6270 /* increment reference counter. */
6271 rt->n_ref++;
6272 *ret = rt;
6273 return 1;
6274 }
6275
6276 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6277 if (!rt)
6278 return NULL;
6279
6280 assert(rt->n_ref > 0);
6281
6282 rt->n_ref--;
6283 if (rt->n_ref > 0)
6284 return NULL;
6285
6286 return exec_runtime_free(rt, destroy);
6287 }
6288
6289 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6290 ExecRuntime *rt;
6291
6292 assert(m);
6293 assert(f);
6294 assert(fds);
6295
6296 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6297 fprintf(f, "exec-runtime=%s", rt->id);
6298
6299 if (rt->tmp_dir)
6300 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6301
6302 if (rt->var_tmp_dir)
6303 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6304
6305 if (rt->netns_storage_socket[0] >= 0) {
6306 int copy;
6307
6308 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6309 if (copy < 0)
6310 return copy;
6311
6312 fprintf(f, " netns-socket-0=%i", copy);
6313 }
6314
6315 if (rt->netns_storage_socket[1] >= 0) {
6316 int copy;
6317
6318 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6319 if (copy < 0)
6320 return copy;
6321
6322 fprintf(f, " netns-socket-1=%i", copy);
6323 }
6324
6325 if (rt->ipcns_storage_socket[0] >= 0) {
6326 int copy;
6327
6328 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6329 if (copy < 0)
6330 return copy;
6331
6332 fprintf(f, " ipcns-socket-0=%i", copy);
6333 }
6334
6335 if (rt->ipcns_storage_socket[1] >= 0) {
6336 int copy;
6337
6338 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6339 if (copy < 0)
6340 return copy;
6341
6342 fprintf(f, " ipcns-socket-1=%i", copy);
6343 }
6344
6345 fputc('\n', f);
6346 }
6347
6348 return 0;
6349 }
6350
6351 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6352 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6353 ExecRuntime *rt;
6354 int r;
6355
6356 /* This is for the migration from old (v237 or earlier) deserialization text.
6357 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6358 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6359 * so or not from the serialized text, then we always creates a new object owned by this. */
6360
6361 assert(u);
6362 assert(key);
6363 assert(value);
6364
6365 /* Manager manages ExecRuntime objects by the unit id.
6366 * So, we omit the serialized text when the unit does not have id (yet?)... */
6367 if (isempty(u->id)) {
6368 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6369 return 0;
6370 }
6371
6372 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6373 return log_oom();
6374
6375 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6376 if (!rt) {
6377 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6378 return log_oom();
6379
6380 rt = rt_create;
6381 }
6382
6383 if (streq(key, "tmp-dir")) {
6384 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6385 return -ENOMEM;
6386
6387 } else if (streq(key, "var-tmp-dir")) {
6388 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6389 return -ENOMEM;
6390
6391 } else if (streq(key, "netns-socket-0")) {
6392 int fd;
6393
6394 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6395 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6396 return 0;
6397 }
6398
6399 safe_close(rt->netns_storage_socket[0]);
6400 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6401
6402 } else if (streq(key, "netns-socket-1")) {
6403 int fd;
6404
6405 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6406 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6407 return 0;
6408 }
6409
6410 safe_close(rt->netns_storage_socket[1]);
6411 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6412
6413 } else
6414 return 0;
6415
6416 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6417 if (rt_create) {
6418 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6419 if (r < 0) {
6420 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6421 return 0;
6422 }
6423
6424 rt_create->manager = u->manager;
6425
6426 /* Avoid cleanup */
6427 TAKE_PTR(rt_create);
6428 }
6429
6430 return 1;
6431 }
6432
6433 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6434 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6435 char *id = NULL;
6436 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6437 const char *p, *v = value;
6438 size_t n;
6439
6440 assert(m);
6441 assert(value);
6442 assert(fds);
6443
6444 n = strcspn(v, " ");
6445 id = strndupa(v, n);
6446 if (v[n] != ' ')
6447 goto finalize;
6448 p = v + n + 1;
6449
6450 v = startswith(p, "tmp-dir=");
6451 if (v) {
6452 n = strcspn(v, " ");
6453 tmp_dir = strndup(v, n);
6454 if (!tmp_dir)
6455 return log_oom();
6456 if (v[n] != ' ')
6457 goto finalize;
6458 p = v + n + 1;
6459 }
6460
6461 v = startswith(p, "var-tmp-dir=");
6462 if (v) {
6463 n = strcspn(v, " ");
6464 var_tmp_dir = strndup(v, n);
6465 if (!var_tmp_dir)
6466 return log_oom();
6467 if (v[n] != ' ')
6468 goto finalize;
6469 p = v + n + 1;
6470 }
6471
6472 v = startswith(p, "netns-socket-0=");
6473 if (v) {
6474 char *buf;
6475
6476 n = strcspn(v, " ");
6477 buf = strndupa(v, n);
6478
6479 r = safe_atoi(buf, &netns_fdpair[0]);
6480 if (r < 0)
6481 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6482 if (!fdset_contains(fds, netns_fdpair[0]))
6483 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6484 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6485 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6486 if (v[n] != ' ')
6487 goto finalize;
6488 p = v + n + 1;
6489 }
6490
6491 v = startswith(p, "netns-socket-1=");
6492 if (v) {
6493 char *buf;
6494
6495 n = strcspn(v, " ");
6496 buf = strndupa(v, n);
6497
6498 r = safe_atoi(buf, &netns_fdpair[1]);
6499 if (r < 0)
6500 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6501 if (!fdset_contains(fds, netns_fdpair[1]))
6502 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6503 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6504 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6505 if (v[n] != ' ')
6506 goto finalize;
6507 p = v + n + 1;
6508 }
6509
6510 v = startswith(p, "ipcns-socket-0=");
6511 if (v) {
6512 char *buf;
6513
6514 n = strcspn(v, " ");
6515 buf = strndupa(v, n);
6516
6517 r = safe_atoi(buf, &ipcns_fdpair[0]);
6518 if (r < 0)
6519 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6520 if (!fdset_contains(fds, ipcns_fdpair[0]))
6521 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6522 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6523 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6524 if (v[n] != ' ')
6525 goto finalize;
6526 p = v + n + 1;
6527 }
6528
6529 v = startswith(p, "ipcns-socket-1=");
6530 if (v) {
6531 char *buf;
6532
6533 n = strcspn(v, " ");
6534 buf = strndupa(v, n);
6535
6536 r = safe_atoi(buf, &ipcns_fdpair[1]);
6537 if (r < 0)
6538 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6539 if (!fdset_contains(fds, ipcns_fdpair[1]))
6540 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6541 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6542 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6543 }
6544
6545 finalize:
6546 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6547 if (r < 0)
6548 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6549 return 0;
6550 }
6551
6552 void exec_runtime_vacuum(Manager *m) {
6553 ExecRuntime *rt;
6554
6555 assert(m);
6556
6557 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6558
6559 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6560 if (rt->n_ref > 0)
6561 continue;
6562
6563 (void) exec_runtime_free(rt, false);
6564 }
6565 }
6566
6567 void exec_params_clear(ExecParameters *p) {
6568 if (!p)
6569 return;
6570
6571 p->environment = strv_free(p->environment);
6572 p->fd_names = strv_free(p->fd_names);
6573 p->fds = mfree(p->fds);
6574 p->exec_fd = safe_close(p->exec_fd);
6575 }
6576
6577 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6578 if (!sc)
6579 return NULL;
6580
6581 free(sc->id);
6582 free(sc->data);
6583 return mfree(sc);
6584 }
6585
6586 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6587
6588 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6589 [EXEC_INPUT_NULL] = "null",
6590 [EXEC_INPUT_TTY] = "tty",
6591 [EXEC_INPUT_TTY_FORCE] = "tty-force",
6592 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
6593 [EXEC_INPUT_SOCKET] = "socket",
6594 [EXEC_INPUT_NAMED_FD] = "fd",
6595 [EXEC_INPUT_DATA] = "data",
6596 [EXEC_INPUT_FILE] = "file",
6597 };
6598
6599 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6600
6601 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
6602 [EXEC_OUTPUT_INHERIT] = "inherit",
6603 [EXEC_OUTPUT_NULL] = "null",
6604 [EXEC_OUTPUT_TTY] = "tty",
6605 [EXEC_OUTPUT_KMSG] = "kmsg",
6606 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
6607 [EXEC_OUTPUT_JOURNAL] = "journal",
6608 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
6609 [EXEC_OUTPUT_SOCKET] = "socket",
6610 [EXEC_OUTPUT_NAMED_FD] = "fd",
6611 [EXEC_OUTPUT_FILE] = "file",
6612 [EXEC_OUTPUT_FILE_APPEND] = "append",
6613 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
6614 };
6615
6616 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
6617
6618 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6619 [EXEC_UTMP_INIT] = "init",
6620 [EXEC_UTMP_LOGIN] = "login",
6621 [EXEC_UTMP_USER] = "user",
6622 };
6623
6624 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
6625
6626 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6627 [EXEC_PRESERVE_NO] = "no",
6628 [EXEC_PRESERVE_YES] = "yes",
6629 [EXEC_PRESERVE_RESTART] = "restart",
6630 };
6631
6632 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
6633
6634 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
6635 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6636 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6637 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6638 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6639 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6640 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6641 };
6642
6643 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
6644
6645 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6646 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6647 * directories, specifically .timer units with their timestamp touch file. */
6648 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6649 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6650 [EXEC_DIRECTORY_STATE] = "state",
6651 [EXEC_DIRECTORY_CACHE] = "cache",
6652 [EXEC_DIRECTORY_LOGS] = "logs",
6653 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6654 };
6655
6656 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6657
6658 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6659 * the service payload in. */
6660 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6661 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6662 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6663 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6664 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6665 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6666 };
6667
6668 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6669
6670 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6671 [EXEC_KEYRING_INHERIT] = "inherit",
6672 [EXEC_KEYRING_PRIVATE] = "private",
6673 [EXEC_KEYRING_SHARED] = "shared",
6674 };
6675
6676 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);