]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core,logind,systemctl,journald: replace calls to strerror() with setting errno + %m
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #if HAVE_PAM
19 #include <security/pam_appl.h>
20 #endif
21
22 #if HAVE_SELINUX
23 #include <selinux/selinux.h>
24 #endif
25
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29
30 #if HAVE_APPARMOR
31 #include <sys/apparmor.h>
32 #endif
33
34 #include "sd-messages.h"
35
36 #include "acl-util.h"
37 #include "af-list.h"
38 #include "alloc-util.h"
39 #if HAVE_APPARMOR
40 #include "apparmor-util.h"
41 #endif
42 #include "async.h"
43 #include "barrier.h"
44 #include "bpf-lsm.h"
45 #include "cap-list.h"
46 #include "capability-util.h"
47 #include "cgroup-setup.h"
48 #include "chase-symlinks.h"
49 #include "chown-recursive.h"
50 #include "cpu-set-util.h"
51 #include "creds-util.h"
52 #include "data-fd-util.h"
53 #include "def.h"
54 #include "env-file.h"
55 #include "env-util.h"
56 #include "errno-list.h"
57 #include "escape.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "fileio.h"
62 #include "format-util.h"
63 #include "glob-util.h"
64 #include "hexdecoct.h"
65 #include "io-util.h"
66 #include "ioprio-util.h"
67 #include "label.h"
68 #include "log.h"
69 #include "macro.h"
70 #include "manager.h"
71 #include "manager-dump.h"
72 #include "memory-util.h"
73 #include "missing_fs.h"
74 #include "missing_ioprio.h"
75 #include "mkdir-label.h"
76 #include "mount-util.h"
77 #include "mountpoint-util.h"
78 #include "namespace.h"
79 #include "parse-util.h"
80 #include "path-util.h"
81 #include "process-util.h"
82 #include "random-util.h"
83 #include "recurse-dir.h"
84 #include "rlimit-util.h"
85 #include "rm-rf.h"
86 #if HAVE_SECCOMP
87 #include "seccomp-util.h"
88 #endif
89 #include "securebits-util.h"
90 #include "selinux-util.h"
91 #include "signal-util.h"
92 #include "smack-util.h"
93 #include "socket-util.h"
94 #include "special.h"
95 #include "stat-util.h"
96 #include "string-table.h"
97 #include "string-util.h"
98 #include "strv.h"
99 #include "syslog-util.h"
100 #include "terminal-util.h"
101 #include "tmpfile-util.h"
102 #include "umask-util.h"
103 #include "unit-serialize.h"
104 #include "user-util.h"
105 #include "utmp-wtmp.h"
106
107 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
108 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
109
110 #define SNDBUF_SIZE (8*1024*1024)
111
/* Moves the passed file descriptors so that fds[i] ends up at fd number i+3. The array is modified in
 * place: every entry that had to be moved is replaced by its duplicate and the old fd is closed.
 *
 * Returns 0 on success, or a negative errno value if fcntl(F_DUPFD) fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= wanted. If the slot was still taken
                         * we got something else; remember the first such index and do another pass
                         * starting there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
151
/* Adjusts the file descriptor flags of all passed fds: O_NONBLOCK is set to 'nonblock' on the first
 * n_socket_fds entries only, FD_CLOEXEC is dropped on all n_socket_fds + n_storage_fds entries.
 *
 * Returns 0 on success, or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                /* O_NONBLOCK is only relevant for the socket activation fds */
                if (is_socket) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be passed to our children, hence FD_CLOEXEC always goes away */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
185 static const char *exec_context_tty_path(const ExecContext *context) {
186 assert(context);
187
188 if (context->stdio_as_fds)
189 return NULL;
190
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195 }
196
197 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
198 const char *path;
199
200 assert(context);
201
202 path = exec_context_tty_path(context);
203
204 if (context->tty_vhangup) {
205 if (p && p->stdin_fd >= 0)
206 (void) terminal_vhangup_fd(p->stdin_fd);
207 else if (path)
208 (void) terminal_vhangup(path);
209 }
210
211 if (context->tty_reset) {
212 if (p && p->stdin_fd >= 0)
213 (void) reset_terminal_fd(p->stdin_fd, true);
214 else if (path)
215 (void) reset_terminal(path);
216 }
217
218 if (p && p->stdin_fd >= 0)
219 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
220
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
223 }
224
225 static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230 }
231
232 static bool is_terminal_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
235 EXEC_OUTPUT_KMSG_AND_CONSOLE,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
237 }
238
239 static bool is_kmsg_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_KMSG,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE);
243 }
244
245 static bool exec_context_needs_term(const ExecContext *c) {
246 assert(c);
247
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
249
250 if (is_terminal_input(c->std_input))
251 return true;
252
253 if (is_terminal_output(c->std_output))
254 return true;
255
256 if (is_terminal_output(c->std_error))
257 return true;
258
259 return !!c->tty_path;
260 }
261
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting fd to 'nfd'.
 * Returns -errno if the open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
273
274 static int connect_journal_socket(
275 int fd,
276 const char *log_namespace,
277 uid_t uid,
278 gid_t gid) {
279
280 uid_t olduid = UID_INVALID;
281 gid_t oldgid = GID_INVALID;
282 const char *j;
283 int r;
284
285 j = log_namespace ?
286 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
287 "/run/systemd/journal/stdout";
288
289 if (gid_is_valid(gid)) {
290 oldgid = getgid();
291
292 if (setegid(gid) < 0)
293 return -errno;
294 }
295
296 if (uid_is_valid(uid)) {
297 olduid = getuid();
298
299 if (seteuid(uid) < 0) {
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
305 r = connect_unix_path(fd, AT_FDCWD, j);
306
307 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
308 an LSM interferes. */
309
310 if (uid_is_valid(uid))
311 (void) seteuid(olduid);
312
313 restore_gid:
314 if (gid_is_valid(gid))
315 (void) setegid(oldgid);
316
317 return r;
318 }
319
320 static int connect_logger_as(
321 const Unit *unit,
322 const ExecContext *context,
323 const ExecParameters *params,
324 ExecOutput output,
325 const char *ident,
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
330 _cleanup_close_ int fd = -1;
331 int r;
332
333 assert(context);
334 assert(params);
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
338
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
341 return -errno;
342
343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
344 if (r < 0)
345 return r;
346
347 if (shutdown(fd, SHUT_RD) < 0)
348 return -errno;
349
350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
351
352 if (dprintf(fd,
353 "%s\n"
354 "%s\n"
355 "%i\n"
356 "%i\n"
357 "%i\n"
358 "%i\n"
359 "%i\n",
360 context->syslog_identifier ?: ident,
361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
364 false,
365 is_kmsg_output(output),
366 is_terminal_output(output)) < 0)
367 return -errno;
368
369 return move_fd(TAKE_FD(fd), nfd, false);
370 }
371
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to 'nfd'.
 * Returns the negative error of open_terminal() on failure, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
384
385 static int acquire_path(const char *path, int flags, mode_t mode) {
386 _cleanup_close_ int fd = -1;
387 int r;
388
389 assert(path);
390
391 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
392 flags |= O_CREAT;
393
394 fd = open(path, flags|O_NOCTTY, mode);
395 if (fd >= 0)
396 return TAKE_FD(fd);
397
398 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
399 return -errno;
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
407 r = connect_unix_path(fd, AT_FDCWD, path);
408 if (IN_SET(r, -ENOTSOCK, -EINVAL))
409 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
410 * wasn't an AF_UNIX socket after all */
411 return -ENXIO;
412 if (r < 0)
413 return r;
414
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
420 r = 0;
421 if (r < 0)
422 return -errno;
423
424 return TAKE_FD(fd);
425 }
426
427 static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
440
441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
442 return EXEC_INPUT_NULL;
443
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
447 return std_input;
448 }
449
450 static int fixup_output(ExecOutput output, int socket_fd) {
451
452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
453 return EXEC_OUTPUT_INHERIT;
454
455 return output;
456 }
457
458 static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
461 int socket_fd,
462 const int named_iofds[static 3]) {
463
464 ExecInput i;
465 int r;
466
467 assert(context);
468 assert(params);
469 assert(named_iofds);
470
471 if (params->stdin_fd >= 0) {
472 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
473 return -errno;
474
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
476 if (isatty(STDIN_FILENO)) {
477 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
478 (void) reset_terminal_fd(STDIN_FILENO, true);
479 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
480 }
481
482 return STDIN_FILENO;
483 }
484
485 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
486
487 switch (i) {
488
489 case EXEC_INPUT_NULL:
490 return open_null_as(O_RDONLY, STDIN_FILENO);
491
492 case EXEC_INPUT_TTY:
493 case EXEC_INPUT_TTY_FORCE:
494 case EXEC_INPUT_TTY_FAIL: {
495 int fd;
496
497 fd = acquire_terminal(exec_context_tty_path(context),
498 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
499 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
500 ACQUIRE_TERMINAL_WAIT,
501 USEC_INFINITY);
502 if (fd < 0)
503 return fd;
504
505 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
506 if (r < 0)
507 return r;
508
509 return move_fd(fd, STDIN_FILENO, false);
510 }
511
512 case EXEC_INPUT_SOCKET:
513 assert(socket_fd >= 0);
514
515 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
516
517 case EXEC_INPUT_NAMED_FD:
518 assert(named_iofds[STDIN_FILENO] >= 0);
519
520 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
521 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
522
523 case EXEC_INPUT_DATA: {
524 int fd;
525
526 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
527 if (fd < 0)
528 return fd;
529
530 return move_fd(fd, STDIN_FILENO, false);
531 }
532
533 case EXEC_INPUT_FILE: {
534 bool rw;
535 int fd;
536
537 assert(context->stdio_file[STDIN_FILENO]);
538
539 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
540 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
541
542 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
543 if (fd < 0)
544 return fd;
545
546 return move_fd(fd, STDIN_FILENO, false);
547 }
548
549 default:
550 assert_not_reached();
551 }
552 }
553
554 static bool can_inherit_stderr_from_stdout(
555 const ExecContext *context,
556 ExecOutput o,
557 ExecOutput e) {
558
559 assert(context);
560
561 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
562 * stderr fd */
563
564 if (e == EXEC_OUTPUT_INHERIT)
565 return true;
566 if (e != o)
567 return false;
568
569 if (e == EXEC_OUTPUT_NAMED_FD)
570 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
571
572 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
573 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
574
575 return true;
576 }
577
578 static int setup_output(
579 const Unit *unit,
580 const ExecContext *context,
581 const ExecParameters *params,
582 int fileno,
583 int socket_fd,
584 const int named_iofds[static 3],
585 const char *ident,
586 uid_t uid,
587 gid_t gid,
588 dev_t *journal_stream_dev,
589 ino_t *journal_stream_ino) {
590
591 ExecOutput o;
592 ExecInput i;
593 int r;
594
595 assert(unit);
596 assert(context);
597 assert(params);
598 assert(ident);
599 assert(journal_stream_dev);
600 assert(journal_stream_ino);
601
602 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
603
604 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
605 return -errno;
606
607 return STDOUT_FILENO;
608 }
609
610 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
611 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
612 return -errno;
613
614 return STDERR_FILENO;
615 }
616
617 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
618 o = fixup_output(context->std_output, socket_fd);
619
620 if (fileno == STDERR_FILENO) {
621 ExecOutput e;
622 e = fixup_output(context->std_error, socket_fd);
623
624 /* This expects the input and output are already set up */
625
626 /* Don't change the stderr file descriptor if we inherit all
627 * the way and are not on a tty */
628 if (e == EXEC_OUTPUT_INHERIT &&
629 o == EXEC_OUTPUT_INHERIT &&
630 i == EXEC_INPUT_NULL &&
631 !is_terminal_input(context->std_input) &&
632 getppid() != 1)
633 return fileno;
634
635 /* Duplicate from stdout if possible */
636 if (can_inherit_stderr_from_stdout(context, o, e))
637 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
638
639 o = e;
640
641 } else if (o == EXEC_OUTPUT_INHERIT) {
642 /* If input got downgraded, inherit the original value */
643 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
644 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
645
646 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
647 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
648 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
649
650 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
651 if (getppid() != 1)
652 return fileno;
653
654 /* We need to open /dev/null here anew, to get the right access mode. */
655 return open_null_as(O_WRONLY, fileno);
656 }
657
658 switch (o) {
659
660 case EXEC_OUTPUT_NULL:
661 return open_null_as(O_WRONLY, fileno);
662
663 case EXEC_OUTPUT_TTY:
664 if (is_terminal_input(i))
665 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
666
667 /* We don't reset the terminal if this is just about output */
668 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
669
670 case EXEC_OUTPUT_KMSG:
671 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
672 case EXEC_OUTPUT_JOURNAL:
673 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
674 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
675 if (r < 0) {
676 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
677 fileno == STDOUT_FILENO ? "stdout" : "stderr");
678 r = open_null_as(O_WRONLY, fileno);
679 } else {
680 struct stat st;
681
682 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
683 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
684 * services to detect whether they are connected to the journal or not.
685 *
686 * If both stdout and stderr are connected to a stream then let's make sure to store the data
687 * about STDERR as that's usually the best way to do logging. */
688
689 if (fstat(fileno, &st) >= 0 &&
690 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
691 *journal_stream_dev = st.st_dev;
692 *journal_stream_ino = st.st_ino;
693 }
694 }
695 return r;
696
697 case EXEC_OUTPUT_SOCKET:
698 assert(socket_fd >= 0);
699
700 return RET_NERRNO(dup2(socket_fd, fileno));
701
702 case EXEC_OUTPUT_NAMED_FD:
703 assert(named_iofds[fileno] >= 0);
704
705 (void) fd_nonblock(named_iofds[fileno], false);
706 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
707
708 case EXEC_OUTPUT_FILE:
709 case EXEC_OUTPUT_FILE_APPEND:
710 case EXEC_OUTPUT_FILE_TRUNCATE: {
711 bool rw;
712 int fd, flags;
713
714 assert(context->stdio_file[fileno]);
715
716 rw = context->std_input == EXEC_INPUT_FILE &&
717 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
718
719 if (rw)
720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
721
722 flags = O_WRONLY;
723 if (o == EXEC_OUTPUT_FILE_APPEND)
724 flags |= O_APPEND;
725 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
726 flags |= O_TRUNC;
727
728 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
729 if (fd < 0)
730 return fd;
731
732 return move_fd(fd, fileno, 0);
733 }
734
735 default:
736 assert_not_reached();
737 }
738 }
739
740 static int chown_terminal(int fd, uid_t uid) {
741 int r;
742
743 assert(fd >= 0);
744
745 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
746 if (isatty(fd) < 1) {
747 if (IN_SET(errno, EINVAL, ENOTTY))
748 return 0; /* not a tty */
749
750 return -errno;
751 }
752
753 /* This might fail. What matters are the results. */
754 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
755 if (r < 0)
756 return r;
757
758 return 1;
759 }
760
761 static int setup_confirm_stdio(
762 const ExecContext *context,
763 const char *vc,
764 int *ret_saved_stdin,
765 int *ret_saved_stdout) {
766
767 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
768 int r;
769
770 assert(ret_saved_stdin);
771 assert(ret_saved_stdout);
772
773 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
774 if (saved_stdin < 0)
775 return -errno;
776
777 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
778 if (saved_stdout < 0)
779 return -errno;
780
781 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
782 if (fd < 0)
783 return fd;
784
785 r = chown_terminal(fd, getuid());
786 if (r < 0)
787 return r;
788
789 r = reset_terminal_fd(fd, true);
790 if (r < 0)
791 return r;
792
793 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
794 if (r < 0)
795 return r;
796
797 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
798 TAKE_FD(fd);
799 if (r < 0)
800 return r;
801
802 *ret_saved_stdin = TAKE_FD(saved_stdin);
803 *ret_saved_stdout = TAKE_FD(saved_stdout);
804 return 0;
805 }
806
807 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
808 assert(err < 0);
809
810 if (err == -ETIMEDOUT)
811 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
812 else {
813 errno = -err;
814 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
815 }
816 }
817
818 static void write_confirm_error(int err, const char *vc, const Unit *u) {
819 _cleanup_close_ int fd = -1;
820
821 assert(vc);
822
823 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
824 if (fd < 0)
825 return;
826
827 write_confirm_error_fd(err, fd, u);
828 }
829
/* Releases the terminal and puts the saved stdin/stdout fds back in place (where set). The saved
 * fds are closed and the variables invalidated in any case.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
851
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
857
858 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
859 int saved_stdout = -1, saved_stdin = -1, r;
860 _cleanup_free_ char *e = NULL;
861 char c;
862
863 /* For any internal errors, assume a positive response. */
864 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
865 if (r < 0) {
866 write_confirm_error(r, vc, u);
867 return CONFIRM_EXECUTE;
868 }
869
870 /* confirm_spawn might have been disabled while we were sleeping. */
871 if (manager_is_confirm_spawn_disabled(u->manager)) {
872 r = 1;
873 goto restore_stdio;
874 }
875
876 e = ellipsize(cmdline, 60, 100);
877 if (!e) {
878 log_oom();
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
882
883 for (;;) {
884 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
885 if (r < 0) {
886 write_confirm_error_fd(r, STDOUT_FILENO, u);
887 r = CONFIRM_EXECUTE;
888 goto restore_stdio;
889 }
890
891 switch (c) {
892 case 'c':
893 printf("Resuming normal execution.\n");
894 manager_disable_confirm_spawn();
895 r = 1;
896 break;
897 case 'D':
898 unit_dump(u, stdout, " ");
899 continue; /* ask again */
900 case 'f':
901 printf("Failing execution.\n");
902 r = CONFIRM_PRETEND_FAILURE;
903 break;
904 case 'h':
905 printf(" c - continue, proceed without asking anymore\n"
906 " D - dump, show the state of the unit\n"
907 " f - fail, don't execute the command and pretend it failed\n"
908 " h - help\n"
909 " i - info, show a short summary of the unit\n"
910 " j - jobs, show jobs that are in progress\n"
911 " s - skip, don't execute the command and pretend it succeeded\n"
912 " y - yes, execute the command\n");
913 continue; /* ask again */
914 case 'i':
915 printf(" Description: %s\n"
916 " Unit: %s\n"
917 " Command: %s\n",
918 u->id, u->description, cmdline);
919 continue; /* ask again */
920 case 'j':
921 manager_dump_jobs(u->manager, stdout, " ");
922 continue; /* ask again */
923 case 'n':
924 /* 'n' was removed in favor of 'f'. */
925 printf("Didn't understand 'n', did you mean 'f'?\n");
926 continue; /* ask again */
927 case 's':
928 printf("Skipping execution.\n");
929 r = CONFIRM_PRETEND_SUCCESS;
930 break;
931 case 'y':
932 r = CONFIRM_EXECUTE;
933 break;
934 default:
935 assert_not_reached();
936 }
937 break;
938 }
939
940 restore_stdio:
941 restore_confirm_stdio(&saved_stdin, &saved_stdout);
942 return r;
943 }
944
945 static int get_fixed_user(const ExecContext *c, const char **user,
946 uid_t *uid, gid_t *gid,
947 const char **home, const char **shell) {
948 int r;
949 const char *name;
950
951 assert(c);
952
953 if (!c->user)
954 return 0;
955
956 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
957 * (i.e. are "/" or "/bin/nologin"). */
958
959 name = c->user;
960 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
961 if (r < 0)
962 return r;
963
964 *user = name;
965 return 0;
966 }
967
968 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
969 int r;
970 const char *name;
971
972 assert(c);
973
974 if (!c->group)
975 return 0;
976
977 name = c->group;
978 r = get_group_creds(&name, gid, 0);
979 if (r < 0)
980 return r;
981
982 *group = name;
983 return 0;
984 }
985
986 static int get_supplementary_groups(const ExecContext *c, const char *user,
987 const char *group, gid_t gid,
988 gid_t **supplementary_gids, int *ngids) {
989 int r, k = 0;
990 int ngroups_max;
991 bool keep_groups = false;
992 gid_t *groups = NULL;
993 _cleanup_free_ gid_t *l_gids = NULL;
994
995 assert(c);
996
997 /*
998 * If user is given, then lookup GID and supplementary groups list.
999 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1000 * here and as early as possible so we keep the list of supplementary
1001 * groups of the caller.
1002 */
1003 if (user && gid_is_valid(gid) && gid != 0) {
1004 /* First step, initialize groups from /etc/groups */
1005 if (initgroups(user, gid) < 0)
1006 return -errno;
1007
1008 keep_groups = true;
1009 }
1010
1011 if (strv_isempty(c->supplementary_groups))
1012 return 0;
1013
1014 /*
1015 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016 * be positive, otherwise fail.
1017 */
1018 errno = 0;
1019 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1020 if (ngroups_max <= 0)
1021 return errno_or_else(EOPNOTSUPP);
1022
1023 l_gids = new(gid_t, ngroups_max);
1024 if (!l_gids)
1025 return -ENOMEM;
1026
1027 if (keep_groups) {
1028 /*
1029 * Lookup the list of groups that the user belongs to, we
1030 * avoid NSS lookups here too for gid=0.
1031 */
1032 k = ngroups_max;
1033 if (getgrouplist(user, gid, l_gids, &k) < 0)
1034 return -EINVAL;
1035 } else
1036 k = 0;
1037
1038 STRV_FOREACH(i, c->supplementary_groups) {
1039 const char *g;
1040
1041 if (k >= ngroups_max)
1042 return -E2BIG;
1043
1044 g = *i;
1045 r = get_group_creds(&g, l_gids+k, 0);
1046 if (r < 0)
1047 return r;
1048
1049 k++;
1050 }
1051
1052 /*
1053 * Sets ngids to zero to drop all supplementary groups, happens
1054 * when we are under root and SupplementaryGroups= is empty.
1055 */
1056 if (k == 0) {
1057 *ngids = 0;
1058 return 0;
1059 }
1060
1061 /* Otherwise get the final list of supplementary groups */
1062 groups = memdup(l_gids, sizeof(gid_t) * k);
1063 if (!groups)
1064 return -ENOMEM;
1065
1066 *supplementary_gids = groups;
1067 *ngids = k;
1068
1069 groups = NULL;
1070
1071 return 0;
1072 }
1073
/* Applies the supplementary groups (if there are any) and then, if 'gid' is valid, sets the real,
 * effective and saved GID to it.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1092
/* Clears the securebits listed in 'mask' and sets those listed in 'bits' for the calling thread.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and a
 * negative errno value if prctl() failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Drop everything covered by the mask, then add the requested bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1106
1107 static int enforce_user(const ExecContext *context, uid_t uid) {
1108 assert(context);
1109 int r;
1110
1111 if (!uid_is_valid(uid))
1112 return 0;
1113
1114 /* Sets (but doesn't look up) the uid and make sure we keep the
1115 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116 * required, so we also need keep-caps in this case.
1117 */
1118
1119 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1120
1121 /* First step: If we need to keep capabilities but
1122 * drop privileges we need to make sure we keep our
1123 * caps, while we drop privileges. */
1124 if (uid != 0) {
1125 /* Add KEEP_CAPS to the securebits */
1126 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1127 if (r < 0)
1128 return r;
1129 }
1130 }
1131
1132 /* Second step: actually set the uids */
1133 if (setresuid(uid, uid, uid) < 0)
1134 return -errno;
1135
1136 /* At this point we should have all necessary capabilities but
1137 are otherwise a normal user. However, the caps might got
1138 corrupted due to the setresuid() so we need clean them up
1139 later. This is done outside of this call. */
1140
1141 return 0;
1142 }
1143
1144 #if HAVE_PAM
1145
1146 static int null_conv(
1147 int num_msg,
1148 const struct pam_message **msg,
1149 struct pam_response **resp,
1150 void *appdata_ptr) {
1151
1152 /* We don't support conversations */
1153
1154 return PAM_CONV_ERR;
1155 }
1156
1157 #endif
1158
1159 static int setup_pam(
1160 const char *name,
1161 const char *user,
1162 uid_t uid,
1163 gid_t gid,
1164 const char *tty,
1165 char ***env, /* updated on success */
1166 const int fds[], size_t n_fds) {
1167
1168 #if HAVE_PAM
1169
1170 static const struct pam_conv conv = {
1171 .conv = null_conv,
1172 .appdata_ptr = NULL
1173 };
1174
1175 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1176 _cleanup_strv_free_ char **e = NULL;
1177 pam_handle_t *handle = NULL;
1178 sigset_t old_ss;
1179 int pam_code = PAM_SUCCESS, r;
1180 bool close_session = false;
1181 pid_t pam_pid = 0, parent_pid;
1182 int flags = 0;
1183
1184 assert(name);
1185 assert(user);
1186 assert(env);
1187
1188 /* We set up PAM in the parent process, then fork. The child
1189 * will then stay around until killed via PR_GET_PDEATHSIG or
1190 * systemd via the cgroup logic. It will then remove the PAM
1191 * session again. The parent process will exec() the actual
1192 * daemon. We do things this way to ensure that the main PID
1193 * of the daemon is the one we initially fork()ed. */
1194
1195 r = barrier_create(&barrier);
1196 if (r < 0)
1197 goto fail;
1198
1199 if (log_get_max_level() < LOG_DEBUG)
1200 flags |= PAM_SILENT;
1201
1202 pam_code = pam_start(name, user, &conv, &handle);
1203 if (pam_code != PAM_SUCCESS) {
1204 handle = NULL;
1205 goto fail;
1206 }
1207
1208 if (!tty) {
1209 _cleanup_free_ char *q = NULL;
1210
1211 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1212 * out if that's the case, and read the TTY off it. */
1213
1214 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1215 tty = strjoina("/dev/", q);
1216 }
1217
1218 if (tty) {
1219 pam_code = pam_set_item(handle, PAM_TTY, tty);
1220 if (pam_code != PAM_SUCCESS)
1221 goto fail;
1222 }
1223
1224 STRV_FOREACH(nv, *env) {
1225 pam_code = pam_putenv(handle, *nv);
1226 if (pam_code != PAM_SUCCESS)
1227 goto fail;
1228 }
1229
1230 pam_code = pam_acct_mgmt(handle, flags);
1231 if (pam_code != PAM_SUCCESS)
1232 goto fail;
1233
1234 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1235 if (pam_code != PAM_SUCCESS)
1236 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1237
1238 pam_code = pam_open_session(handle, flags);
1239 if (pam_code != PAM_SUCCESS)
1240 goto fail;
1241
1242 close_session = true;
1243
1244 e = pam_getenvlist(handle);
1245 if (!e) {
1246 pam_code = PAM_BUF_ERR;
1247 goto fail;
1248 }
1249
1250 /* Block SIGTERM, so that we know that it won't get lost in the child */
1251
1252 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1253
1254 parent_pid = getpid_cached();
1255
1256 r = safe_fork("(sd-pam)", 0, &pam_pid);
1257 if (r < 0)
1258 goto fail;
1259 if (r == 0) {
1260 int sig, ret = EXIT_PAM;
1261
1262 /* The child's job is to reset the PAM session on termination */
1263 barrier_set_role(&barrier, BARRIER_CHILD);
1264
1265 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1266 * those fds are open here that have been opened by PAM. */
1267 (void) close_many(fds, n_fds);
1268
1269 /* Drop privileges - we don't need any to pam_close_session and this will make
1270 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1271 * threads to fail to exit normally */
1272
1273 r = maybe_setgroups(0, NULL);
1274 if (r < 0)
1275 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1276 if (setresgid(gid, gid, gid) < 0)
1277 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1278 if (setresuid(uid, uid, uid) < 0)
1279 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1280
1281 (void) ignore_signals(SIGPIPE);
1282
1283 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1284 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1285 * this way. We rely on the control groups kill logic to do the rest for us. */
1286 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1287 goto child_finish;
1288
1289 /* Tell the parent that our setup is done. This is especially important regarding dropping
1290 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1291 *
1292 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1293 (void) barrier_place(&barrier);
1294
1295 /* Check if our parent process might already have died? */
1296 if (getppid() == parent_pid) {
1297 sigset_t ss;
1298
1299 assert_se(sigemptyset(&ss) >= 0);
1300 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1301
1302 for (;;) {
1303 if (sigwait(&ss, &sig) < 0) {
1304 if (errno == EINTR)
1305 continue;
1306
1307 goto child_finish;
1308 }
1309
1310 assert(sig == SIGTERM);
1311 break;
1312 }
1313 }
1314
1315 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1316 if (pam_code != PAM_SUCCESS)
1317 goto child_finish;
1318
1319 /* If our parent died we'll end the session */
1320 if (getppid() != parent_pid) {
1321 pam_code = pam_close_session(handle, flags);
1322 if (pam_code != PAM_SUCCESS)
1323 goto child_finish;
1324 }
1325
1326 ret = 0;
1327
1328 child_finish:
1329 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1330 * know about this. See pam_end(3) */
1331 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1332 _exit(ret);
1333 }
1334
1335 barrier_set_role(&barrier, BARRIER_PARENT);
1336
1337 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1338 * here. */
1339 handle = NULL;
1340
1341 /* Unblock SIGTERM again in the parent */
1342 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1343
1344 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1345 * this fd around. */
1346 closelog();
1347
1348 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1349 * recover. However, warn loudly if it happens. */
1350 if (!barrier_place_and_sync(&barrier))
1351 log_error("PAM initialization failed");
1352
1353 return strv_free_and_replace(*env, e);
1354
1355 fail:
1356 if (pam_code != PAM_SUCCESS) {
1357 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1358 r = -EPERM; /* PAM errors do not map to errno */
1359 } else
1360 log_error_errno(r, "PAM failed: %m");
1361
1362 if (handle) {
1363 if (close_session)
1364 pam_code = pam_close_session(handle, flags);
1365
1366 (void) pam_end(handle, pam_code | flags);
1367 }
1368
1369 closelog();
1370 return r;
1371 #else
1372 return 0;
1373 #endif
1374 }
1375
/* Renames the current process to "(<name>)", where <name> is the last component of 'path', reduced to
 * its final 8 characters if it is longer than that. If the path has no usable last component the
 * process is renamed to "(...)" instead. */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + at most 8 characters + ')' + NUL */
        const char *base;
        size_t n;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* The resulting string has to fit in 10 characters (i.e. the length of "/sbin/init") to look
         * pretty in /bin/ps. When we have to cut, keep the tail: the end of the name is usually the
         * more interesting part, since the beginning might just be "systemd-". */
        n = strlen(base);
        if (n > 8) {
                base += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1406
1407 static bool context_has_address_families(const ExecContext *c) {
1408 assert(c);
1409
1410 return c->address_families_allow_list ||
1411 !set_isempty(c->address_families);
1412 }
1413
1414 static bool context_has_syscall_filters(const ExecContext *c) {
1415 assert(c);
1416
1417 return c->syscall_allow_list ||
1418 !hashmap_isempty(c->syscall_filter);
1419 }
1420
1421 static bool context_has_syscall_logs(const ExecContext *c) {
1422 assert(c);
1423
1424 return c->syscall_log_allow_list ||
1425 !hashmap_isempty(c->syscall_log);
1426 }
1427
1428 static bool context_has_no_new_privileges(const ExecContext *c) {
1429 assert(c);
1430
1431 if (c->no_new_privileges)
1432 return true;
1433
1434 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435 return false;
1436
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
1438 return c->lock_personality ||
1439 c->memory_deny_write_execute ||
1440 c->private_devices ||
1441 c->protect_clock ||
1442 c->protect_hostname ||
1443 c->protect_kernel_tunables ||
1444 c->protect_kernel_modules ||
1445 c->protect_kernel_logs ||
1446 context_has_address_families(c) ||
1447 exec_context_restrict_namespaces_set(c) ||
1448 c->restrict_realtime ||
1449 c->restrict_suid_sgid ||
1450 !set_isempty(c->syscall_archs) ||
1451 context_has_syscall_filters(c) ||
1452 context_has_syscall_logs(c);
1453 }
1454
1455 static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457 assert(context);
1458
1459 return !hashmap_isempty(context->set_credentials) ||
1460 !hashmap_isempty(context->load_credentials);
1461 }
1462
1463 #if HAVE_SECCOMP
1464
1465 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1466
1467 if (is_seccomp_available())
1468 return false;
1469
1470 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1471 return true;
1472 }
1473
1474 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1475 uint32_t negative_action, default_action, action;
1476 int r;
1477
1478 assert(u);
1479 assert(c);
1480
1481 if (!context_has_syscall_filters(c))
1482 return 0;
1483
1484 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485 return 0;
1486
1487 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1488
1489 if (c->syscall_allow_list) {
1490 default_action = negative_action;
1491 action = SCMP_ACT_ALLOW;
1492 } else {
1493 default_action = SCMP_ACT_ALLOW;
1494 action = negative_action;
1495 }
1496
1497 if (needs_ambient_hack) {
1498 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1499 if (r < 0)
1500 return r;
1501 }
1502
1503 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1504 }
1505
1506 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507 #ifdef SCMP_ACT_LOG
1508 uint32_t default_action, action;
1509 #endif
1510
1511 assert(u);
1512 assert(c);
1513
1514 if (!context_has_syscall_logs(c))
1515 return 0;
1516
1517 #ifdef SCMP_ACT_LOG
1518 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519 return 0;
1520
1521 if (c->syscall_log_allow_list) {
1522 /* Log nothing but the ones listed */
1523 default_action = SCMP_ACT_ALLOW;
1524 action = SCMP_ACT_LOG;
1525 } else {
1526 /* Log everything but the ones listed */
1527 default_action = SCMP_ACT_LOG;
1528 action = SCMP_ACT_ALLOW;
1529 }
1530
1531 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532 #else
1533 /* old libseccomp */
1534 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535 return 0;
1536 #endif
1537 }
1538
1539 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540 assert(u);
1541 assert(c);
1542
1543 if (set_isempty(c->syscall_archs))
1544 return 0;
1545
1546 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547 return 0;
1548
1549 return seccomp_restrict_archs(c->syscall_archs);
1550 }
1551
1552 static int apply_address_families(const Unit* u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
1555
1556 if (!context_has_address_families(c))
1557 return 0;
1558
1559 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560 return 0;
1561
1562 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1563 }
1564
1565 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1566 assert(u);
1567 assert(c);
1568
1569 if (!c->memory_deny_write_execute)
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573 return 0;
1574
1575 return seccomp_memory_deny_write_execute();
1576 }
1577
1578 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1579 assert(u);
1580 assert(c);
1581
1582 if (!c->restrict_realtime)
1583 return 0;
1584
1585 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586 return 0;
1587
1588 return seccomp_restrict_realtime();
1589 }
1590
1591 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592 assert(u);
1593 assert(c);
1594
1595 if (!c->restrict_suid_sgid)
1596 return 0;
1597
1598 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599 return 0;
1600
1601 return seccomp_restrict_suid_sgid();
1602 }
1603
1604 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1605 assert(u);
1606 assert(c);
1607
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1610
1611 if (!c->protect_kernel_tunables)
1612 return 0;
1613
1614 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615 return 0;
1616
1617 return seccomp_protect_sysctl();
1618 }
1619
1620 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1621 assert(u);
1622 assert(c);
1623
1624 /* Turn off module syscalls on ProtectKernelModules=yes */
1625
1626 if (!c->protect_kernel_modules)
1627 return 0;
1628
1629 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630 return 0;
1631
1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1633 }
1634
1635 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636 assert(u);
1637 assert(c);
1638
1639 if (!c->protect_kernel_logs)
1640 return 0;
1641
1642 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643 return 0;
1644
1645 return seccomp_protect_syslog();
1646 }
1647
1648 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1649 assert(u);
1650 assert(c);
1651
1652 if (!c->protect_clock)
1653 return 0;
1654
1655 if (skip_seccomp_unavailable(u, "ProtectClock="))
1656 return 0;
1657
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659 }
1660
1661 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1662 assert(u);
1663 assert(c);
1664
1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1666
1667 if (!c->private_devices)
1668 return 0;
1669
1670 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671 return 0;
1672
1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1674 }
1675
1676 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1677 assert(u);
1678 assert(c);
1679
1680 if (!exec_context_restrict_namespaces_set(c))
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684 return 0;
1685
1686 return seccomp_restrict_namespaces(c->restrict_namespaces);
1687 }
1688
1689 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1690 unsigned long personality;
1691 int r;
1692
1693 assert(u);
1694 assert(c);
1695
1696 if (!c->lock_personality)
1697 return 0;
1698
1699 if (skip_seccomp_unavailable(u, "LockPersonality="))
1700 return 0;
1701
1702 personality = c->personality;
1703
1704 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1705 if (personality == PERSONALITY_INVALID) {
1706
1707 r = opinionated_personality(&personality);
1708 if (r < 0)
1709 return r;
1710 }
1711
1712 return seccomp_lock_personality(personality);
1713 }
1714
1715 #endif
1716
1717 #if HAVE_LIBBPF
1718 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1719 assert(u);
1720 assert(c);
1721
1722 if (!exec_context_restrict_filesystems_set(c))
1723 return 0;
1724
1725 if (!u->manager->restrict_fs) {
1726 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1727 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1728 return 0;
1729 }
1730
1731 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1732 }
1733 #endif
1734
1735 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1736 assert(u);
1737 assert(c);
1738
1739 if (!c->protect_hostname)
1740 return 0;
1741
1742 if (ns_type_supported(NAMESPACE_UTS)) {
1743 if (unshare(CLONE_NEWUTS) < 0) {
1744 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1745 *ret_exit_status = EXIT_NAMESPACE;
1746 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1747 }
1748
1749 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1750 }
1751 } else
1752 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1753
1754 #if HAVE_SECCOMP
1755 int r;
1756
1757 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1758 return 0;
1759
1760 r = seccomp_protect_hostname();
1761 if (r < 0) {
1762 *ret_exit_status = EXIT_SECCOMP;
1763 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1764 }
1765 #endif
1766
1767 return 0;
1768 }
1769
1770 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1771 assert(idle_pipe);
1772
1773 idle_pipe[1] = safe_close(idle_pipe[1]);
1774 idle_pipe[2] = safe_close(idle_pipe[2]);
1775
1776 if (idle_pipe[0] >= 0) {
1777 int r;
1778
1779 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1780
1781 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1782 ssize_t n;
1783
1784 /* Signal systemd that we are bored and want to continue. */
1785 n = write(idle_pipe[3], "x", 1);
1786 if (n > 0)
1787 /* Wait for systemd to react to the signal above. */
1788 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1789 }
1790
1791 idle_pipe[0] = safe_close(idle_pipe[0]);
1792
1793 }
1794
1795 idle_pipe[3] = safe_close(idle_pipe[3]);
1796 }
1797
1798 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1799
1800 static int build_environment(
1801 const Unit *u,
1802 const ExecContext *c,
1803 const ExecParameters *p,
1804 size_t n_fds,
1805 const char *home,
1806 const char *username,
1807 const char *shell,
1808 dev_t journal_stream_dev,
1809 ino_t journal_stream_ino,
1810 char ***ret) {
1811
1812 _cleanup_strv_free_ char **our_env = NULL;
1813 size_t n_env = 0;
1814 char *x;
1815
1816 assert(u);
1817 assert(c);
1818 assert(p);
1819 assert(ret);
1820
1821 #define N_ENV_VARS 17
1822 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1823 if (!our_env)
1824 return -ENOMEM;
1825
1826 if (n_fds > 0) {
1827 _cleanup_free_ char *joined = NULL;
1828
1829 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1830 return -ENOMEM;
1831 our_env[n_env++] = x;
1832
1833 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
1836
1837 joined = strv_join(p->fd_names, ":");
1838 if (!joined)
1839 return -ENOMEM;
1840
1841 x = strjoin("LISTEN_FDNAMES=", joined);
1842 if (!x)
1843 return -ENOMEM;
1844 our_env[n_env++] = x;
1845 }
1846
1847 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1848 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1849 return -ENOMEM;
1850 our_env[n_env++] = x;
1851
1852 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1853 return -ENOMEM;
1854 our_env[n_env++] = x;
1855 }
1856
1857 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1858 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1859 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1860 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1861 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1862 if (!x)
1863 return -ENOMEM;
1864 our_env[n_env++] = x;
1865 }
1866
1867 if (home) {
1868 x = strjoin("HOME=", home);
1869 if (!x)
1870 return -ENOMEM;
1871
1872 path_simplify(x + 5);
1873 our_env[n_env++] = x;
1874 }
1875
1876 if (username) {
1877 x = strjoin("LOGNAME=", username);
1878 if (!x)
1879 return -ENOMEM;
1880 our_env[n_env++] = x;
1881
1882 x = strjoin("USER=", username);
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886 }
1887
1888 if (shell) {
1889 x = strjoin("SHELL=", shell);
1890 if (!x)
1891 return -ENOMEM;
1892
1893 path_simplify(x + 6);
1894 our_env[n_env++] = x;
1895 }
1896
1897 if (!sd_id128_is_null(u->invocation_id)) {
1898 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1899 return -ENOMEM;
1900
1901 our_env[n_env++] = x;
1902 }
1903
1904 if (exec_context_needs_term(c)) {
1905 const char *tty_path, *term = NULL;
1906
1907 tty_path = exec_context_tty_path(c);
1908
1909 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1910 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1911 * container manager passes to PID 1 ends up all the way in the console login shown. */
1912
1913 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1914 term = getenv("TERM");
1915
1916 if (!term)
1917 term = default_term_for_tty(tty_path);
1918
1919 x = strjoin("TERM=", term);
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
1925 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1926 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1927 return -ENOMEM;
1928
1929 our_env[n_env++] = x;
1930 }
1931
1932 if (c->log_namespace) {
1933 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1934 if (!x)
1935 return -ENOMEM;
1936
1937 our_env[n_env++] = x;
1938 }
1939
1940 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1941 _cleanup_free_ char *joined = NULL;
1942 const char *n;
1943
1944 if (!p->prefix[t])
1945 continue;
1946
1947 if (c->directories[t].n_items == 0)
1948 continue;
1949
1950 n = exec_directory_env_name_to_string(t);
1951 if (!n)
1952 continue;
1953
1954 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1955 _cleanup_free_ char *prefixed = NULL;
1956
1957 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1958 if (!prefixed)
1959 return -ENOMEM;
1960
1961 if (!strextend_with_separator(&joined, ":", prefixed))
1962 return -ENOMEM;
1963 }
1964
1965 x = strjoin(n, "=", joined);
1966 if (!x)
1967 return -ENOMEM;
1968
1969 our_env[n_env++] = x;
1970 }
1971
1972 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1973 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1974 if (!x)
1975 return -ENOMEM;
1976
1977 our_env[n_env++] = x;
1978 }
1979
1980 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1981 return -ENOMEM;
1982
1983 our_env[n_env++] = x;
1984
1985 our_env[n_env++] = NULL;
1986 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1987 #undef N_ENV_VARS
1988
1989 *ret = TAKE_PTR(our_env);
1990
1991 return 0;
1992 }
1993
1994 static int build_pass_environment(const ExecContext *c, char ***ret) {
1995 _cleanup_strv_free_ char **pass_env = NULL;
1996 size_t n_env = 0;
1997
1998 STRV_FOREACH(i, c->pass_environment) {
1999 _cleanup_free_ char *x = NULL;
2000 char *v;
2001
2002 v = getenv(*i);
2003 if (!v)
2004 continue;
2005 x = strjoin(*i, "=", v);
2006 if (!x)
2007 return -ENOMEM;
2008
2009 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2010 return -ENOMEM;
2011
2012 pass_env[n_env++] = TAKE_PTR(x);
2013 pass_env[n_env] = NULL;
2014 }
2015
2016 *ret = TAKE_PTR(pass_env);
2017
2018 return 0;
2019 }
2020
2021 bool exec_needs_mount_namespace(
2022 const ExecContext *context,
2023 const ExecParameters *params,
2024 const ExecRuntime *runtime) {
2025
2026 assert(context);
2027
2028 if (context->root_image)
2029 return true;
2030
2031 if (!strv_isempty(context->read_write_paths) ||
2032 !strv_isempty(context->read_only_paths) ||
2033 !strv_isempty(context->inaccessible_paths) ||
2034 !strv_isempty(context->exec_paths) ||
2035 !strv_isempty(context->no_exec_paths))
2036 return true;
2037
2038 if (context->n_bind_mounts > 0)
2039 return true;
2040
2041 if (context->n_temporary_filesystems > 0)
2042 return true;
2043
2044 if (context->n_mount_images > 0)
2045 return true;
2046
2047 if (context->n_extension_images > 0)
2048 return true;
2049
2050 if (!strv_isempty(context->extension_directories))
2051 return true;
2052
2053 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2054 return true;
2055
2056 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2057 return true;
2058
2059 if (context->private_devices ||
2060 context->private_mounts ||
2061 context->protect_system != PROTECT_SYSTEM_NO ||
2062 context->protect_home != PROTECT_HOME_NO ||
2063 context->protect_kernel_tunables ||
2064 context->protect_kernel_modules ||
2065 context->protect_kernel_logs ||
2066 context->protect_control_groups ||
2067 context->protect_proc != PROTECT_PROC_DEFAULT ||
2068 context->proc_subset != PROC_SUBSET_ALL ||
2069 context->private_ipc ||
2070 context->ipc_namespace_path)
2071 return true;
2072
2073 if (context->root_directory) {
2074 if (exec_context_get_effective_mount_apivfs(context))
2075 return true;
2076
2077 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2078 if (params && !params->prefix[t])
2079 continue;
2080
2081 if (context->directories[t].n_items > 0)
2082 return true;
2083 }
2084 }
2085
2086 if (context->dynamic_user &&
2087 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2088 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2089 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2090 return true;
2091
2092 if (context->log_namespace)
2093 return true;
2094
2095 return false;
2096 }
2097
2098 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2099 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2100 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2101 _cleanup_close_ int unshare_ready_fd = -1;
2102 _cleanup_(sigkill_waitp) pid_t pid = 0;
2103 uint64_t c = 1;
2104 ssize_t n;
2105 int r;
2106
2107 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2108 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2109 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2110 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2111 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2112 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2113 * continues execution normally.
2114 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2115 * does not need CAP_SETUID to write the single line mapping to itself. */
2116
2117 /* Can only set up multiple mappings with CAP_SETUID. */
2118 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
2119 r = asprintf(&uid_map,
2120 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2121 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2122 ouid, ouid, uid, uid);
2123 else
2124 r = asprintf(&uid_map,
2125 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2126 ouid, ouid);
2127
2128 if (r < 0)
2129 return -ENOMEM;
2130
2131 /* Can only set up multiple mappings with CAP_SETGID. */
2132 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
2133 r = asprintf(&gid_map,
2134 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2135 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2136 ogid, ogid, gid, gid);
2137 else
2138 r = asprintf(&gid_map,
2139 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2140 ogid, ogid);
2141
2142 if (r < 0)
2143 return -ENOMEM;
2144
2145 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2146 * namespace. */
2147 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2148 if (unshare_ready_fd < 0)
2149 return -errno;
2150
2151 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2152 * failed. */
2153 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2154 return -errno;
2155
2156 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2157 if (r < 0)
2158 return r;
2159 if (r == 0) {
2160 _cleanup_close_ int fd = -1;
2161 const char *a;
2162 pid_t ppid;
2163
2164 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2165 * here, after the parent opened its own user namespace. */
2166
2167 ppid = getppid();
2168 errno_pipe[0] = safe_close(errno_pipe[0]);
2169
2170 /* Wait until the parent unshared the user namespace */
2171 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2172 r = -errno;
2173 goto child_fail;
2174 }
2175
2176 /* Disable the setgroups() system call in the child user namespace, for good. */
2177 a = procfs_file_alloca(ppid, "setgroups");
2178 fd = open(a, O_WRONLY|O_CLOEXEC);
2179 if (fd < 0) {
2180 if (errno != ENOENT) {
2181 r = -errno;
2182 goto child_fail;
2183 }
2184
2185 /* If the file is missing the kernel is too old, let's continue anyway. */
2186 } else {
2187 if (write(fd, "deny\n", 5) < 0) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191
2192 fd = safe_close(fd);
2193 }
2194
2195 /* First write the GID map */
2196 a = procfs_file_alloca(ppid, "gid_map");
2197 fd = open(a, O_WRONLY|O_CLOEXEC);
2198 if (fd < 0) {
2199 r = -errno;
2200 goto child_fail;
2201 }
2202 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2203 r = -errno;
2204 goto child_fail;
2205 }
2206 fd = safe_close(fd);
2207
2208 /* The write the UID map */
2209 a = procfs_file_alloca(ppid, "uid_map");
2210 fd = open(a, O_WRONLY|O_CLOEXEC);
2211 if (fd < 0) {
2212 r = -errno;
2213 goto child_fail;
2214 }
2215 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2216 r = -errno;
2217 goto child_fail;
2218 }
2219
2220 _exit(EXIT_SUCCESS);
2221
2222 child_fail:
2223 (void) write(errno_pipe[1], &r, sizeof(r));
2224 _exit(EXIT_FAILURE);
2225 }
2226
2227 errno_pipe[1] = safe_close(errno_pipe[1]);
2228
2229 if (unshare(CLONE_NEWUSER) < 0)
2230 return -errno;
2231
2232 /* Let the child know that the namespace is ready now */
2233 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2234 return -errno;
2235
2236 /* Try to read an error code from the child */
2237 n = read(errno_pipe[0], &r, sizeof(r));
2238 if (n < 0)
2239 return -errno;
2240 if (n == sizeof(r)) { /* an error code was sent to us */
2241 if (r < 0)
2242 return r;
2243 return -EIO;
2244 }
2245 if (n != 0) /* on success we should have read 0 bytes */
2246 return -EIO;
2247
2248 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2249 if (r < 0)
2250 return r;
2251 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2252 return -EIO;
2253
2254 return 0;
2255 }
2256
2257 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2258 if (!context->dynamic_user)
2259 return false;
2260
2261 if (type == EXEC_DIRECTORY_CONFIGURATION)
2262 return false;
2263
2264 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2265 return false;
2266
2267 return true;
2268 }
2269
2270 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271 _cleanup_free_ char *src_abs = NULL;
2272 int r;
2273
2274 assert(source);
2275
2276 src_abs = path_join(root, source);
2277 if (!src_abs)
2278 return -ENOMEM;
2279
2280 STRV_FOREACH(dst, symlinks) {
2281 _cleanup_free_ char *dst_abs = NULL;
2282
2283 dst_abs = path_join(root, *dst);
2284 if (!dst_abs)
2285 return -ENOMEM;
2286
2287 r = mkdir_parents_label(dst_abs, 0755);
2288 if (r < 0)
2289 return r;
2290
2291 r = symlink_idempotent(src_abs, dst_abs, true);
2292 if (r < 0)
2293 return r;
2294 }
2295
2296 return 0;
2297 }
2298
/* Creates the directories configured for the given directory type below params->prefix[type], and
 * adjusts their access mode and ownership.
 *
 * context               – execution context carrying the directory list and mode for each type
 * params                – execution parameters; params->prefix[type] is the base directory, and if it
 *                         is NULL this function is a no-op returning 0
 * uid, gid              – owner to assign to the directory trees; if EXEC_CHOWN_DIRECTORIES is set in
 *                         params->flags, invalid values are replaced by 0
 * type                  – which kind of directory to set up
 * needs_mount_namespace – if false, the additional symlinks configured for each directory are
 *                         created right here via create_many_symlinks()
 * exit_status           – on failure, set to the exit status matching the directory type
 *
 * Returns 0 on success, a negative errno-style error on failure. */
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                bool needs_mount_namespace,
                int *exit_status) {

        /* Maps each directory type to the exit status reported when setting it up fails */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        /* No prefix configured for this type? Then there's nothing to do. */
        if (!params->prefix[type])
                return 0;

        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        for (size_t i = 0; i < context->directories[type].n_items; i++) {
                /* p is the public path of the directory; pp is the path below "private/" and is only
                 * set in the private (DynamicUser=) case, otherwise it stays NULL. */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], context->directories[type].items[i].path);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
                         * for sharing files or sockets with other services. */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the directory inside of the private root */
                        if (!path_extend(&pp, context->directories[type].items[i].path)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place. Note that if a mount namespace is going to be
                         * used, then this symlink remains on the host, and a new one for the child namespace will
                         * be created later. */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                /* q is where the directory would live if it had been set up as private */
                                q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Drop the symlink first, so that the directory can take its place */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), context->directories[type].items[i].path,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for this pre-existing directory */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        /* If we are not going to run in a namespace, set up the symlinks - otherwise
         * they are set up later, to allow configuring empty var/run/etc. */
        if (!needs_mount_namespace)
                for (size_t i = 0; i < context->directories[type].n_items; i++) {
                        r = create_many_symlinks(params->prefix[type],
                                                 context->directories[type].items[i].path,
                                                 context->directories[type].items[i].symlinks);
                        if (r < 0)
                                goto fail;
                }

        return 0;

fail:
        /* Tell the caller which exit status to report for a failure of this directory type */
        *exit_status = exit_status_table[type];
        return r;
}
2533
2534 static int write_credential(
2535 int dfd,
2536 const char *id,
2537 const void *data,
2538 size_t size,
2539 uid_t uid,
2540 bool ownership_ok) {
2541
2542 _cleanup_(unlink_and_freep) char *tmp = NULL;
2543 _cleanup_close_ int fd = -1;
2544 int r;
2545
2546 r = tempfn_random_child("", "cred", &tmp);
2547 if (r < 0)
2548 return r;
2549
2550 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2551 if (fd < 0) {
2552 tmp = mfree(tmp);
2553 return -errno;
2554 }
2555
2556 r = loop_write(fd, data, size, /* do_poll = */ false);
2557 if (r < 0)
2558 return r;
2559
2560 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2561 return -errno;
2562
2563 if (uid_is_valid(uid) && uid != getuid()) {
2564 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2565 if (r < 0) {
2566 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2567 return r;
2568
2569 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2570 * to express: that the user gets read access and nothing
2571 * else. But if the backing fs can't support that (e.g. ramfs)
2572 * then we can use file ownership instead. But that's only safe if
2573 * we can then re-mount the whole thing read-only, so that the
2574 * user can no longer chmod() the file to gain write access. */
2575 return r;
2576
2577 if (fchown(fd, uid, GID_INVALID) < 0)
2578 return -errno;
2579 }
2580 }
2581
2582 if (renameat(dfd, tmp, dfd, id) < 0)
2583 return -errno;
2584
2585 tmp = mfree(tmp);
2586 return 0;
2587 }
2588
2589 static char **credential_search_path(
2590 const ExecParameters *params,
2591 bool encrypted) {
2592
2593 _cleanup_strv_free_ char **l = NULL;
2594
2595 assert(params);
2596
2597 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2598 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2599 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2600
2601 if (encrypted) {
2602 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2603 return NULL;
2604
2605 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2606 return NULL;
2607 }
2608
2609 if (params->received_credentials_directory)
2610 if (strv_extend(&l, params->received_credentials_directory) < 0)
2611 return NULL;
2612
2613 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2614 return NULL;
2615
2616 if (DEBUG_LOGGING) {
2617 _cleanup_free_ char *t = strv_join(l, ":");
2618
2619 log_debug("Credential search path is: %s", t);
2620 }
2621
2622 return TAKE_PTR(l);
2623 }
2624
2625 static int load_credential(
2626 const ExecContext *context,
2627 const ExecParameters *params,
2628 const char *id,
2629 const char *path,
2630 bool encrypted,
2631 const char *unit,
2632 int read_dfd,
2633 int write_dfd,
2634 uid_t uid,
2635 bool ownership_ok,
2636 uint64_t *left) {
2637
2638 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2639 _cleanup_strv_free_ char **search_path = NULL;
2640 _cleanup_(erase_and_freep) char *data = NULL;
2641 _cleanup_free_ char *bindname = NULL;
2642 const char *source = NULL;
2643 bool missing_ok = true;
2644 size_t size, add, maxsz;
2645 int r;
2646
2647 assert(context);
2648 assert(params);
2649 assert(id);
2650 assert(path);
2651 assert(unit);
2652 assert(write_dfd >= 0);
2653 assert(left);
2654
2655 if (read_dfd >= 0) {
2656 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2657 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2658 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2659 * open it. */
2660
2661 if (!filename_is_valid(path)) /* safety check */
2662 return -EINVAL;
2663
2664 missing_ok = true;
2665 source = path;
2666
2667 } else if (path_is_absolute(path)) {
2668 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2669 * sockets */
2670
2671 if (!path_is_valid(path)) /* safety check */
2672 return -EINVAL;
2673
2674 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2675
2676 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2677 * via the source socket address in case we read off an AF_UNIX socket. */
2678 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2679 return -ENOMEM;
2680
2681 missing_ok = false;
2682 source = path;
2683
2684 } else if (credential_name_valid(path)) {
2685 /* If this is a relative path, take it as credential name relative to the credentials
2686 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2687 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2688
2689 search_path = credential_search_path(params, encrypted);
2690 if (!search_path)
2691 return -ENOMEM;
2692
2693 missing_ok = true;
2694 } else
2695 source = NULL;
2696
2697 if (encrypted)
2698 flags |= READ_FULL_FILE_UNBASE64;
2699
2700 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2701
2702 if (search_path) {
2703 STRV_FOREACH(d, search_path) {
2704 _cleanup_free_ char *j = NULL;
2705
2706 j = path_join(*d, path);
2707 if (!j)
2708 return -ENOMEM;
2709
2710 r = read_full_file_full(
2711 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2712 UINT64_MAX,
2713 maxsz,
2714 flags,
2715 NULL,
2716 &data, &size);
2717 if (r != -ENOENT)
2718 break;
2719 }
2720 } else if (source)
2721 r = read_full_file_full(
2722 read_dfd, source,
2723 UINT64_MAX,
2724 maxsz,
2725 flags,
2726 bindname,
2727 &data, &size);
2728 else
2729 r = -ENOENT;
2730
2731 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2732 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2733 * will get clear errors if we don't pass such a missing credential on as they
2734 * themselves will get ENOENT when trying to read them, which should not be much
2735 * worse than when we handle the error here and make it fatal.
2736 *
2737 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2738 * we are fine, too. */
2739 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2740 return 0;
2741 }
2742 if (r < 0)
2743 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2744
2745 if (encrypted) {
2746 _cleanup_free_ void *plaintext = NULL;
2747 size_t plaintext_size = 0;
2748
2749 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2750 if (r < 0)
2751 return r;
2752
2753 free_and_replace(data, plaintext);
2754 size = plaintext_size;
2755 }
2756
2757 add = strlen(id) + size;
2758 if (add > *left)
2759 return -E2BIG;
2760
2761 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2762 if (r < 0)
2763 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2764
2765 *left -= add;
2766 return 0;
2767 }
2768
/* Arguments handed to load_cred_recurse_dir_cb() via the userdata pointer when a whole directory of
 * credentials is loaded. Everything in here is passed on unmodified to load_credential(). */
struct load_cred_args {
        const ExecContext *context;
        const ExecParameters *params;
        bool encrypted;         /* whether the files in the directory are encrypted credentials */
        const char *unit;
        int dfd;                /* directory fd of the credential directory to write into */
        uid_t uid;              /* user that shall get read access to the credentials */
        bool ownership_ok;      /* whether changing file ownership is an acceptable fallback for ACLs */
        uint64_t *left;         /* remaining size budget, shared by all credentials of the unit */
};
2779
2780 static int load_cred_recurse_dir_cb(
2781 RecurseDirEvent event,
2782 const char *path,
2783 int dir_fd,
2784 int inode_fd,
2785 const struct dirent *de,
2786 const struct statx *sx,
2787 void *userdata) {
2788
2789 struct load_cred_args *args = ASSERT_PTR(userdata);
2790 _cleanup_free_ char *sub_id = NULL;
2791 int r;
2792
2793 if (event != RECURSE_DIR_ENTRY)
2794 return RECURSE_DIR_CONTINUE;
2795
2796 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2797 return RECURSE_DIR_CONTINUE;
2798
2799 sub_id = strreplace(path, "/", "_");
2800 if (!sub_id)
2801 return -ENOMEM;
2802
2803 if (!credential_name_valid(sub_id))
2804 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2805
2806 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2807 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2808 return RECURSE_DIR_CONTINUE;
2809 }
2810 if (errno != ENOENT)
2811 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2812
2813 r = load_credential(
2814 args->context,
2815 args->params,
2816 sub_id,
2817 de->d_name,
2818 args->encrypted,
2819 args->unit,
2820 dir_fd,
2821 args->dfd,
2822 args->uid,
2823 args->ownership_ok,
2824 args->left);
2825 if (r < 0)
2826 return r;
2827
2828 return RECURSE_DIR_CONTINUE;
2829 }
2830
2831 static int acquire_credentials(
2832 const ExecContext *context,
2833 const ExecParameters *params,
2834 const char *unit,
2835 const char *p,
2836 uid_t uid,
2837 bool ownership_ok) {
2838
2839 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2840 _cleanup_close_ int dfd = -1;
2841 ExecLoadCredential *lc;
2842 ExecSetCredential *sc;
2843 int r;
2844
2845 assert(context);
2846 assert(p);
2847
2848 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2849 if (dfd < 0)
2850 return -errno;
2851
2852 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2853 HASHMAP_FOREACH(lc, context->load_credentials) {
2854 _cleanup_close_ int sub_fd = -1;
2855
2856 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2857 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2858 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2859 * propagate a credential passed to us from further up. */
2860
2861 if (path_is_absolute(lc->path)) {
2862 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2863 if (sub_fd < 0 && !IN_SET(errno,
2864 ENOTDIR, /* Not a directory */
2865 ENOENT)) /* Doesn't exist? */
2866 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2867 }
2868
2869 if (sub_fd < 0)
2870 /* Regular file (incl. a credential passed in from higher up) */
2871 r = load_credential(
2872 context,
2873 params,
2874 lc->id,
2875 lc->path,
2876 lc->encrypted,
2877 unit,
2878 -1,
2879 dfd,
2880 uid,
2881 ownership_ok,
2882 &left);
2883 else
2884 /* Directory */
2885 r = recurse_dir(
2886 sub_fd,
2887 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2888 /* statx_mask= */ 0,
2889 /* n_depth_max= */ UINT_MAX,
2890 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2891 load_cred_recurse_dir_cb,
2892 &(struct load_cred_args) {
2893 .context = context,
2894 .params = params,
2895 .encrypted = lc->encrypted,
2896 .unit = unit,
2897 .dfd = dfd,
2898 .uid = uid,
2899 .ownership_ok = ownership_ok,
2900 .left = &left,
2901 });
2902 if (r < 0)
2903 return r;
2904 }
2905
2906 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2907 * them, so that they can act as a "default" if the same credential is specified multiple times. */
2908 HASHMAP_FOREACH(sc, context->set_credentials) {
2909 _cleanup_(erase_and_freep) void *plaintext = NULL;
2910 const char *data;
2911 size_t size, add;
2912
2913 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2914 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2915 * slow and involved, hence it's nice to be able to skip that if the credential already
2916 * exists anyway. */
2917 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2918 continue;
2919 if (errno != ENOENT)
2920 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2921
2922 if (sc->encrypted) {
2923 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2924 if (r < 0)
2925 return r;
2926
2927 data = plaintext;
2928 } else {
2929 data = sc->data;
2930 size = sc->size;
2931 }
2932
2933 add = strlen(sc->id) + size;
2934 if (add > left)
2935 return -E2BIG;
2936
2937 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2938 if (r < 0)
2939 return r;
2940
2941 left -= add;
2942 }
2943
2944 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2945 return -errno;
2946
2947 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2948 * accessible */
2949
2950 if (uid_is_valid(uid) && uid != getuid()) {
2951 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2952 if (r < 0) {
2953 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2954 return r;
2955
2956 if (!ownership_ok)
2957 return r;
2958
2959 if (fchown(dfd, uid, GID_INVALID) < 0)
2960 return -errno;
2961 }
2962 }
2963
2964 return 0;
2965 }
2966
/* Prepares the credential store of a unit and makes it available at 'final'.
 *
 * The store is prepared on 'workspace' if we manage to mount something there: a fresh ramfs, or as
 * fallback a tmpfs, or a writable bind mount of 'final'. Once acquire_credentials() has populated it,
 * the workspace is remounted read-only and either moved to 'final', or simply unmounted again if
 * 'final' had something mounted on it already. If nothing can be mounted for lack of privileges, and
 * must_mount is false, the credentials are written directly into the plain directory 'final'.
 *
 * Returns 0 on success, negative errno-style error on failure. */
static int setup_credentials_internal(
                const ExecContext *context,
                const ExecParameters *params,
                const char *unit,
                const char *final,        /* This is where the credential store shall eventually end up at */
                const char *workspace,    /* This is where we can prepare it before moving it to the final place */
                bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
                bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
                uid_t uid) {

        int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
                                   * if we mounted something; false if we definitely can't mount anything */
        bool final_mounted;
        const char *where;

        assert(context);
        assert(final);
        assert(workspace);

        if (reuse_workspace) {
                r = path_is_mount_point(workspace, NULL, 0);
                if (r < 0)
                        return r;
                if (r > 0)
                        workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
                else
                        workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
        } else
                workspace_mounted = -1; /* ditto */

        r = path_is_mount_point(final, NULL, 0);
        if (r < 0)
                return r;
        if (r > 0) {
                /* If the final place already has something mounted, we use that. If the workspace also has
                 * something mounted we assume it's actually the same mount (but with MS_RDONLY
                 * different). */
                final_mounted = true;

                if (workspace_mounted < 0) {
                        /* If the final place is mounted, but the workspace isn't, then let's bind mount
                         * the final version to the workspace, and make it writable, so that we can make
                         * changes */

                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;

                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                        if (r < 0)
                                return r;

                        workspace_mounted = true;
                }
        } else
                final_mounted = false;

        if (workspace_mounted < 0) {
                /* Nothing is mounted on the workspace yet, let's try to mount something now. Every
                 * iteration either leaves the loop with workspace_mounted decided, returns an error, or
                 * moves on to the next fallback. */
                for (int try = 0;; try++) {

                        if (try == 0) {
                                /* Try "ramfs" first, since it's not swap backed */
                                r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
                                if (r >= 0) {
                                        workspace_mounted = true;
                                        break;
                                }

                        } else if (try == 1) {
                                _cleanup_free_ char *opts = NULL;

                                if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
                                        return -ENOMEM;

                                /* Fall back to "tmpfs" otherwise */
                                r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
                                if (r >= 0) {
                                        workspace_mounted = true;
                                        break;
                                }

                        } else {
                                /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
                                r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
                                if (r < 0) {
                                        if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
                                                return r;

                                        if (must_mount) /* If it's not OK to use the plain directory
                                                         * fallback, propagate all errors too */
                                                return r;

                                        /* If we lack privileges to bind mount stuff, then let's gracefully
                                         * proceed for compat with container envs, and just use the final dir
                                         * as is. */

                                        workspace_mounted = false;
                                        break;
                                }

                                /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
                                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                                if (r < 0)
                                        return r;

                                workspace_mounted = true;
                                break;
                        }
                }
        }

        assert(!must_mount || workspace_mounted > 0);
        /* Write into the workspace if we have a mount there, into the final directory otherwise */
        where = workspace_mounted ? workspace : final;

        (void) label_fix_full(AT_FDCWD, where, final, 0);

        /* Changing file ownership is only acceptable if we have a mount of our own, which we make
         * read-only below */
        r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
        if (r < 0)
                return r;

        if (workspace_mounted) {
                /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                if (r < 0)
                        return r;

                /* And mount it to the final place, read-only */
                if (final_mounted)
                        r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
                else
                        r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
                if (r < 0)
                        return r;
        } else {
                _cleanup_free_ char *parent = NULL;

                /* If we do not have our own mount but use the plain directory fallback, then we need to
                 * open access to the top-level credential directory and the per-service directory now */

                r = path_extract_directory(final, &parent);
                if (r < 0)
                        return r;
                if (chmod(parent, 0755) < 0)
                        return -errno;
        }

        return 0;
}
3116
3117 static int setup_credentials(
3118 const ExecContext *context,
3119 const ExecParameters *params,
3120 const char *unit,
3121 uid_t uid) {
3122
3123 _cleanup_free_ char *p = NULL, *q = NULL;
3124 int r;
3125
3126 assert(context);
3127 assert(params);
3128
3129 if (!exec_context_has_credentials(context))
3130 return 0;
3131
3132 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3133 return -EINVAL;
3134
3135 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3136 * and the subdir we mount over with a read-only file system readable by the service's user */
3137 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3138 if (!q)
3139 return -ENOMEM;
3140
3141 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3142 if (r < 0 && r != -EEXIST)
3143 return r;
3144
3145 p = path_join(q, unit);
3146 if (!p)
3147 return -ENOMEM;
3148
3149 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3150 if (r < 0 && r != -EEXIST)
3151 return r;
3152
3153 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3154 if (r < 0) {
3155 _cleanup_free_ char *t = NULL, *u = NULL;
3156
3157 /* If this is not a privilege or support issue then propagate the error */
3158 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3159 return r;
3160
3161 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3162 * it into place, so that users can't access half-initialized credential stores. */
3163 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3164 if (!t)
3165 return -ENOMEM;
3166
3167 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3168 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3169 * after it is fully set up */
3170 u = path_join(t, unit);
3171 if (!u)
3172 return -ENOMEM;
3173
3174 FOREACH_STRING(i, t, u) {
3175 r = mkdir_label(i, 0700);
3176 if (r < 0 && r != -EEXIST)
3177 return r;
3178 }
3179
3180 r = setup_credentials_internal(
3181 context,
3182 params,
3183 unit,
3184 p, /* final mount point */
3185 u, /* temporary workspace to overmount */
3186 true, /* reuse the workspace if it is already a mount */
3187 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3188 uid);
3189
3190 (void) rmdir(u); /* remove the workspace again if we can. */
3191
3192 if (r < 0)
3193 return r;
3194
3195 } else if (r == 0) {
3196
3197 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3198 * we can use the same directory for all cases, after turning off propagation. Question
3199 * though is: where do we turn off propagation exactly, and where do we place the workspace
3200 * directory? We need some place that is guaranteed to be a mount point in the host, and
3201 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3202 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3203 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3204 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3205 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3206 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3207 * propagation on the former, and then overmount the latter.
3208 *
3209 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3210 * for this purpose, but there are few other candidates that work equally well for us, and
3211 * given that the we do this in a privately namespaced short-lived single-threaded process
3212 * that no one else sees this should be OK to do. */
3213
3214 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3215 if (r < 0)
3216 goto child_fail;
3217
3218 r = setup_credentials_internal(
3219 context,
3220 params,
3221 unit,
3222 p, /* final mount point */
3223 "/dev/shm", /* temporary workspace to overmount */
3224 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3225 true, /* insist that something is mounted, do not allow fallback to plain directory */
3226 uid);
3227 if (r < 0)
3228 goto child_fail;
3229
3230 _exit(EXIT_SUCCESS);
3231
3232 child_fail:
3233 _exit(EXIT_FAILURE);
3234 }
3235
3236 return 0;
3237 }
3238
#if ENABLE_SMACK
/* Applies the SMACK label our own process (PID 0 = self) shall run under. In order of preference
 * that is: the label configured in the execution context; the SMACK_ATTR_EXEC label read off the
 * executable; the manager's default label. The latter two are only considered if the manager has a
 * default label configured, otherwise nothing is applied.
 *
 * Returns 0 on success, negative errno-style error on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                label = context->smack_process_label;
        else if (manager->default_smack_process_label) {
                /* An executable that carries no label, or a file system that cannot carry one, is fine */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                label = exec_label ?: manager->default_smack_process_label;
        } else
                return 0;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
3268
3269 static int compile_bind_mounts(
3270 const ExecContext *context,
3271 const ExecParameters *params,
3272 BindMount **ret_bind_mounts,
3273 size_t *ret_n_bind_mounts,
3274 char ***ret_empty_directories) {
3275
3276 _cleanup_strv_free_ char **empty_directories = NULL;
3277 BindMount *bind_mounts;
3278 size_t n, h = 0;
3279 int r;
3280
3281 assert(context);
3282 assert(params);
3283 assert(ret_bind_mounts);
3284 assert(ret_n_bind_mounts);
3285 assert(ret_empty_directories);
3286
3287 n = context->n_bind_mounts;
3288 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3289 if (!params->prefix[t])
3290 continue;
3291
3292 n += context->directories[t].n_items;
3293 }
3294
3295 if (n <= 0) {
3296 *ret_bind_mounts = NULL;
3297 *ret_n_bind_mounts = 0;
3298 *ret_empty_directories = NULL;
3299 return 0;
3300 }
3301
3302 bind_mounts = new(BindMount, n);
3303 if (!bind_mounts)
3304 return -ENOMEM;
3305
3306 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3307 BindMount *item = context->bind_mounts + i;
3308 char *s, *d;
3309
3310 s = strdup(item->source);
3311 if (!s) {
3312 r = -ENOMEM;
3313 goto finish;
3314 }
3315
3316 d = strdup(item->destination);
3317 if (!d) {
3318 free(s);
3319 r = -ENOMEM;
3320 goto finish;
3321 }
3322
3323 bind_mounts[h++] = (BindMount) {
3324 .source = s,
3325 .destination = d,
3326 .read_only = item->read_only,
3327 .recursive = item->recursive,
3328 .ignore_enoent = item->ignore_enoent,
3329 };
3330 }
3331
3332 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3333 if (!params->prefix[t])
3334 continue;
3335
3336 if (context->directories[t].n_items == 0)
3337 continue;
3338
3339 if (exec_directory_is_private(context, t) &&
3340 !exec_context_with_rootfs(context)) {
3341 char *private_root;
3342
3343 /* So this is for a dynamic user, and we need to make sure the process can access its own
3344 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3345 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3346
3347 private_root = path_join(params->prefix[t], "private");
3348 if (!private_root) {
3349 r = -ENOMEM;
3350 goto finish;
3351 }
3352
3353 r = strv_consume(&empty_directories, private_root);
3354 if (r < 0)
3355 goto finish;
3356 }
3357
3358 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3359 char *s, *d;
3360
3361 if (exec_directory_is_private(context, t))
3362 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3363 else
3364 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3365 if (!s) {
3366 r = -ENOMEM;
3367 goto finish;
3368 }
3369
3370 if (exec_directory_is_private(context, t) &&
3371 exec_context_with_rootfs(context))
3372 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3373 * directory is not created on the root directory. So, let's bind-mount the directory
3374 * on the 'non-private' place. */
3375 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3376 else
3377 d = strdup(s);
3378 if (!d) {
3379 free(s);
3380 r = -ENOMEM;
3381 goto finish;
3382 }
3383
3384 bind_mounts[h++] = (BindMount) {
3385 .source = s,
3386 .destination = d,
3387 .read_only = false,
3388 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3389 .recursive = true,
3390 .ignore_enoent = false,
3391 };
3392 }
3393 }
3394
3395 assert(h == n);
3396
3397 *ret_bind_mounts = bind_mounts;
3398 *ret_n_bind_mounts = n;
3399 *ret_empty_directories = TAKE_PTR(empty_directories);
3400
3401 return (int) n;
3402
3403 finish:
3404 bind_mount_free_many(bind_mounts, h);
3405 return r;
3406 }
3407
3408 /* ret_symlinks will contain a list of pairs src:dest that describes
3409 * the symlinks to create later on. For example, the symlinks needed
3410 * to safely give private directories to DynamicUser=1 users. */
3411 static int compile_symlinks(
3412 const ExecContext *context,
3413 const ExecParameters *params,
3414 char ***ret_symlinks) {
3415
3416 _cleanup_strv_free_ char **symlinks = NULL;
3417 int r;
3418
3419 assert(context);
3420 assert(params);
3421 assert(ret_symlinks);
3422
3423 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3424 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3425 _cleanup_free_ char *private_path = NULL, *path = NULL;
3426
3427 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3428 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3429
3430 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3431 dst_abs = path_join(params->prefix[dt], *symlink);
3432 if (!src_abs || !dst_abs)
3433 return -ENOMEM;
3434
3435 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3436 if (r < 0)
3437 return r;
3438 }
3439
3440 if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
3441 continue;
3442
3443 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3444 if (!private_path)
3445 return -ENOMEM;
3446
3447 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3448 if (!path)
3449 return -ENOMEM;
3450
3451 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3452 if (r < 0)
3453 return r;
3454 }
3455 }
3456
3457 *ret_symlinks = TAKE_PTR(symlinks);
3458
3459 return 0;
3460 }
3461
3462 static bool insist_on_sandboxing(
3463 const ExecContext *context,
3464 const char *root_dir,
3465 const char *root_image,
3466 const BindMount *bind_mounts,
3467 size_t n_bind_mounts) {
3468
3469 assert(context);
3470 assert(n_bind_mounts == 0 || bind_mounts);
3471
3472 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3473 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3474 * rearrange stuff in a way we cannot ignore gracefully. */
3475
3476 if (context->n_temporary_filesystems > 0)
3477 return true;
3478
3479 if (root_dir || root_image)
3480 return true;
3481
3482 if (context->n_mount_images > 0)
3483 return true;
3484
3485 if (context->dynamic_user)
3486 return true;
3487
3488 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3489 return true;
3490
3491 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3492 * essential. */
3493 for (size_t i = 0; i < n_bind_mounts; i++)
3494 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3495 return true;
3496
3497 if (context->log_namespace)
3498 return true;
3499
3500 return false;
3501 }
3502
3503 static int apply_mount_namespace(
3504 const Unit *u,
3505 ExecCommandFlags command_flags,
3506 const ExecContext *context,
3507 const ExecParameters *params,
3508 const ExecRuntime *runtime,
3509 char **error_path) {
3510
3511 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3512 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3513 const char *root_dir = NULL, *root_image = NULL;
3514 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3515 *extension_dir = NULL;
3516 NamespaceInfo ns_info;
3517 bool needs_sandboxing;
3518 BindMount *bind_mounts = NULL;
3519 size_t n_bind_mounts = 0;
3520 int r;
3521
3522 assert(context);
3523
3524 if (params->flags & EXEC_APPLY_CHROOT) {
3525 root_image = context->root_image;
3526
3527 if (!root_image)
3528 root_dir = context->root_directory;
3529 }
3530
3531 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3532 if (r < 0)
3533 return r;
3534
3535 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3536 r = compile_symlinks(context, params, &symlinks);
3537 if (r < 0)
3538 goto finalize;
3539
3540 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3541 if (needs_sandboxing) {
3542 /* The runtime struct only contains the parent of the private /tmp,
3543 * which is non-accessible to world users. Inside of it there's a /tmp
3544 * that is sticky, and that's the one we want to use here.
3545 * This does not apply when we are using /run/systemd/empty as fallback. */
3546
3547 if (context->private_tmp && runtime) {
3548 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3549 tmp_dir = runtime->tmp_dir;
3550 else if (runtime->tmp_dir)
3551 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3552
3553 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3554 var_tmp_dir = runtime->var_tmp_dir;
3555 else if (runtime->var_tmp_dir)
3556 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3557 }
3558
3559 ns_info = (NamespaceInfo) {
3560 .ignore_protect_paths = false,
3561 .private_dev = context->private_devices,
3562 .protect_control_groups = context->protect_control_groups,
3563 .protect_kernel_tunables = context->protect_kernel_tunables,
3564 .protect_kernel_modules = context->protect_kernel_modules,
3565 .protect_kernel_logs = context->protect_kernel_logs,
3566 .protect_hostname = context->protect_hostname,
3567 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3568 .private_mounts = context->private_mounts,
3569 .protect_home = context->protect_home,
3570 .protect_system = context->protect_system,
3571 .protect_proc = context->protect_proc,
3572 .proc_subset = context->proc_subset,
3573 .private_ipc = context->private_ipc || context->ipc_namespace_path,
3574 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3575 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3576 };
3577 } else if (!context->dynamic_user && root_dir)
3578 /*
3579 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3580 * sandbox info, otherwise enforce it, don't ignore protected paths and
3581 * fail if we are enable to apply the sandbox inside the mount namespace.
3582 */
3583 ns_info = (NamespaceInfo) {
3584 .ignore_protect_paths = true,
3585 };
3586 else
3587 ns_info = (NamespaceInfo) {};
3588
3589 if (context->mount_flags == MS_SHARED)
3590 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3591
3592 if (exec_context_has_credentials(context) &&
3593 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3594 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3595 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3596 if (!creds_path) {
3597 r = -ENOMEM;
3598 goto finalize;
3599 }
3600 }
3601
3602 if (MANAGER_IS_SYSTEM(u->manager)) {
3603 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3604 if (!propagate_dir) {
3605 r = -ENOMEM;
3606 goto finalize;
3607 }
3608
3609 incoming_dir = strdup("/run/systemd/incoming");
3610 if (!incoming_dir) {
3611 r = -ENOMEM;
3612 goto finalize;
3613 }
3614
3615 extension_dir = strdup("/run/systemd/unit-extensions");
3616 if (!extension_dir) {
3617 r = -ENOMEM;
3618 goto finalize;
3619 }
3620 } else
3621 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3622 r = -ENOMEM;
3623 goto finalize;
3624 }
3625
3626 r = setup_namespace(root_dir, root_image, context->root_image_options,
3627 &ns_info, context->read_write_paths,
3628 needs_sandboxing ? context->read_only_paths : NULL,
3629 needs_sandboxing ? context->inaccessible_paths : NULL,
3630 needs_sandboxing ? context->exec_paths : NULL,
3631 needs_sandboxing ? context->no_exec_paths : NULL,
3632 empty_directories,
3633 symlinks,
3634 bind_mounts,
3635 n_bind_mounts,
3636 context->temporary_filesystems,
3637 context->n_temporary_filesystems,
3638 context->mount_images,
3639 context->n_mount_images,
3640 tmp_dir,
3641 var_tmp_dir,
3642 creds_path,
3643 context->log_namespace,
3644 context->mount_flags,
3645 context->root_hash, context->root_hash_size, context->root_hash_path,
3646 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3647 context->root_verity,
3648 context->extension_images,
3649 context->n_extension_images,
3650 context->extension_directories,
3651 propagate_dir,
3652 incoming_dir,
3653 extension_dir,
3654 root_dir || root_image ? params->notify_socket : NULL,
3655 error_path);
3656
3657 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3658 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3659 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3660 * completely different execution environment. */
3661 if (r == -ENOANO) {
3662 if (insist_on_sandboxing(
3663 context,
3664 root_dir, root_image,
3665 bind_mounts,
3666 n_bind_mounts)) {
3667 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3668 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3669 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3670
3671 r = -EOPNOTSUPP;
3672 } else {
3673 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3674 r = 0;
3675 }
3676 }
3677
3678 finalize:
3679 bind_mount_free_many(bind_mounts, n_bind_mounts);
3680 return r;
3681 }
3682
3683 static int apply_working_directory(
3684 const ExecContext *context,
3685 const ExecParameters *params,
3686 const char *home,
3687 int *exit_status) {
3688
3689 const char *d, *wd;
3690
3691 assert(context);
3692 assert(exit_status);
3693
3694 if (context->working_directory_home) {
3695
3696 if (!home) {
3697 *exit_status = EXIT_CHDIR;
3698 return -ENXIO;
3699 }
3700
3701 wd = home;
3702
3703 } else
3704 wd = empty_to_root(context->working_directory);
3705
3706 if (params->flags & EXEC_APPLY_CHROOT)
3707 d = wd;
3708 else
3709 d = prefix_roota(context->root_directory, wd);
3710
3711 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3712 *exit_status = EXIT_CHDIR;
3713 return -errno;
3714 }
3715
3716 return 0;
3717 }
3718
3719 static int apply_root_directory(
3720 const ExecContext *context,
3721 const ExecParameters *params,
3722 const bool needs_mount_ns,
3723 int *exit_status) {
3724
3725 assert(context);
3726 assert(exit_status);
3727
3728 if (params->flags & EXEC_APPLY_CHROOT)
3729 if (!needs_mount_ns && context->root_directory)
3730 if (chroot(context->root_directory) < 0) {
3731 *exit_status = EXIT_CHROOT;
3732 return -errno;
3733 }
3734
3735 return 0;
3736 }
3737
3738 static int setup_keyring(
3739 const Unit *u,
3740 const ExecContext *context,
3741 const ExecParameters *p,
3742 uid_t uid, gid_t gid) {
3743
3744 key_serial_t keyring;
3745 int r = 0;
3746 uid_t saved_uid;
3747 gid_t saved_gid;
3748
3749 assert(u);
3750 assert(context);
3751 assert(p);
3752
3753 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3754 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3755 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3756 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3757 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3758 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3759
3760 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3761 return 0;
3762
3763 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3764 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3765 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3766 * & group is just as nasty as acquiring a reference to the user keyring. */
3767
3768 saved_uid = getuid();
3769 saved_gid = getgid();
3770
3771 if (gid_is_valid(gid) && gid != saved_gid) {
3772 if (setregid(gid, -1) < 0)
3773 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3774 }
3775
3776 if (uid_is_valid(uid) && uid != saved_uid) {
3777 if (setreuid(uid, -1) < 0) {
3778 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3779 goto out;
3780 }
3781 }
3782
3783 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3784 if (keyring == -1) {
3785 if (errno == ENOSYS)
3786 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3787 else if (ERRNO_IS_PRIVILEGE(errno))
3788 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3789 else if (errno == EDQUOT)
3790 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3791 else
3792 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3793
3794 goto out;
3795 }
3796
3797 /* When requested link the user keyring into the session keyring. */
3798 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3799
3800 if (keyctl(KEYCTL_LINK,
3801 KEY_SPEC_USER_KEYRING,
3802 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3803 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3804 goto out;
3805 }
3806 }
3807
3808 /* Restore uid/gid back */
3809 if (uid_is_valid(uid) && uid != saved_uid) {
3810 if (setreuid(saved_uid, -1) < 0) {
3811 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3812 goto out;
3813 }
3814 }
3815
3816 if (gid_is_valid(gid) && gid != saved_gid) {
3817 if (setregid(saved_gid, -1) < 0)
3818 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3819 }
3820
3821 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3822 if (!sd_id128_is_null(u->invocation_id)) {
3823 key_serial_t key;
3824
3825 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3826 if (key == -1)
3827 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3828 else {
3829 if (keyctl(KEYCTL_SETPERM, key,
3830 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3831 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3832 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3833 }
3834 }
3835
3836 out:
3837 /* Revert back uid & gid for the last time, and exit */
3838 /* no extra logging, as only the first already reported error matters */
3839 if (getuid() != saved_uid)
3840 (void) setreuid(saved_uid, -1);
3841
3842 if (getgid() != saved_gid)
3843 (void) setregid(saved_gid, -1);
3844
3845 return r;
3846 }
3847
/* Appends each non-negative entry of the two-element fd pair to array, advancing the element counter *n
 * once per appended entry. Negative entries are skipped. The caller must make sure array has room for up
 * to two more elements. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++)
                if (pair[k] >= 0) {
                        array[*n] = pair[k];
                        (*n)++;
                }
}
3858
3859 static int close_remaining_fds(
3860 const ExecParameters *params,
3861 const ExecRuntime *runtime,
3862 const DynamicCreds *dcreds,
3863 int user_lookup_fd,
3864 int socket_fd,
3865 const int *fds, size_t n_fds) {
3866
3867 size_t n_dont_close = 0;
3868 int dont_close[n_fds + 12];
3869
3870 assert(params);
3871
3872 if (params->stdin_fd >= 0)
3873 dont_close[n_dont_close++] = params->stdin_fd;
3874 if (params->stdout_fd >= 0)
3875 dont_close[n_dont_close++] = params->stdout_fd;
3876 if (params->stderr_fd >= 0)
3877 dont_close[n_dont_close++] = params->stderr_fd;
3878
3879 if (socket_fd >= 0)
3880 dont_close[n_dont_close++] = socket_fd;
3881 if (n_fds > 0) {
3882 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3883 n_dont_close += n_fds;
3884 }
3885
3886 if (runtime) {
3887 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3888 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3889 }
3890
3891 if (dcreds) {
3892 if (dcreds->user)
3893 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3894 if (dcreds->group)
3895 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3896 }
3897
3898 if (user_lookup_fd >= 0)
3899 dont_close[n_dont_close++] = user_lookup_fd;
3900
3901 return close_all_fds(dont_close, n_dont_close);
3902 }
3903
3904 static int send_user_lookup(
3905 Unit *unit,
3906 int user_lookup_fd,
3907 uid_t uid,
3908 gid_t gid) {
3909
3910 assert(unit);
3911
3912 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3913 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3914 * specified. */
3915
3916 if (user_lookup_fd < 0)
3917 return 0;
3918
3919 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3920 return 0;
3921
3922 if (writev(user_lookup_fd,
3923 (struct iovec[]) {
3924 IOVEC_INIT(&uid, sizeof(uid)),
3925 IOVEC_INIT(&gid, sizeof(gid)),
3926 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3927 return -errno;
3928
3929 return 0;
3930 }
3931
3932 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3933 int r;
3934
3935 assert(c);
3936 assert(home);
3937 assert(buf);
3938
3939 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3940
3941 if (*home)
3942 return 0;
3943
3944 if (!c->working_directory_home)
3945 return 0;
3946
3947 r = get_home_dir(buf);
3948 if (r < 0)
3949 return r;
3950
3951 *home = *buf;
3952 return 1;
3953 }
3954
3955 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3956 _cleanup_strv_free_ char ** list = NULL;
3957 int r;
3958
3959 assert(c);
3960 assert(p);
3961 assert(ret);
3962
3963 assert(c->dynamic_user);
3964
3965 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3966 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3967 * directories. */
3968
3969 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3970 if (t == EXEC_DIRECTORY_CONFIGURATION)
3971 continue;
3972
3973 if (!p->prefix[t])
3974 continue;
3975
3976 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3977 char *e;
3978
3979 if (exec_directory_is_private(c, t))
3980 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3981 else
3982 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3983 if (!e)
3984 return -ENOMEM;
3985
3986 r = strv_consume(&list, e);
3987 if (r < 0)
3988 return r;
3989 }
3990 }
3991
3992 *ret = TAKE_PTR(list);
3993
3994 return 0;
3995 }
3996
3997 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3998 bool using_subcgroup;
3999 char *p;
4000
4001 assert(params);
4002 assert(ret);
4003
4004 if (!params->cgroup_path)
4005 return -EINVAL;
4006
4007 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4008 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4009 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4010 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4011 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4012 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4013 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4014 * flag, which is only passed for the former statements, not for the latter. */
4015
4016 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4017 if (using_subcgroup)
4018 p = path_join(params->cgroup_path, ".control");
4019 else
4020 p = strdup(params->cgroup_path);
4021 if (!p)
4022 return -ENOMEM;
4023
4024 *ret = p;
4025 return using_subcgroup;
4026 }
4027
4028 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4029 _cleanup_(cpu_set_reset) CPUSet s = {};
4030 int r;
4031
4032 assert(c);
4033 assert(ret);
4034
4035 if (!c->numa_policy.nodes.set) {
4036 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4037 return 0;
4038 }
4039
4040 r = numa_to_cpu_set(&c->numa_policy, &s);
4041 if (r < 0)
4042 return r;
4043
4044 cpu_set_reset(ret);
4045
4046 return cpu_set_add_all(ret, &s);
4047 }
4048
4049 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4050 assert(c);
4051
4052 return c->cpu_affinity_from_numa;
4053 }
4054
4055 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4056 int r;
4057
4058 assert(fds);
4059 assert(n_fds);
4060 assert(*n_fds < fds_size);
4061 assert(ret_fd);
4062
4063 if (fd < 0) {
4064 *ret_fd = -1;
4065 return 0;
4066 }
4067
4068 if (fd < 3 + (int) *n_fds) {
4069 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4070 * the fds we pass to the process (or which are closed only during execve). */
4071
4072 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4073 if (r < 0)
4074 return -errno;
4075
4076 close_and_replace(fd, r);
4077 }
4078
4079 *ret_fd = fds[*n_fds] = fd;
4080 (*n_fds) ++;
4081 return 1;
4082 }
4083
4084 static int exec_child(
4085 Unit *unit,
4086 const ExecCommand *command,
4087 const ExecContext *context,
4088 const ExecParameters *params,
4089 ExecRuntime *runtime,
4090 DynamicCreds *dcreds,
4091 int socket_fd,
4092 const int named_iofds[static 3],
4093 int *fds,
4094 size_t n_socket_fds,
4095 size_t n_storage_fds,
4096 char **files_env,
4097 int user_lookup_fd,
4098 int *exit_status) {
4099
4100 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4101 int r, ngids = 0, exec_fd;
4102 _cleanup_free_ gid_t *supplementary_gids = NULL;
4103 const char *username = NULL, *groupname = NULL;
4104 _cleanup_free_ char *home_buffer = NULL;
4105 const char *home = NULL, *shell = NULL;
4106 char **final_argv = NULL;
4107 dev_t journal_stream_dev = 0;
4108 ino_t journal_stream_ino = 0;
4109 bool userns_set_up = false;
4110 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4111 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4112 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4113 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4114 #if HAVE_SELINUX
4115 _cleanup_free_ char *mac_selinux_context_net = NULL;
4116 bool use_selinux = false;
4117 #endif
4118 #if ENABLE_SMACK
4119 bool use_smack = false;
4120 #endif
4121 #if HAVE_APPARMOR
4122 bool use_apparmor = false;
4123 #endif
4124 uid_t saved_uid = getuid();
4125 gid_t saved_gid = getgid();
4126 uid_t uid = UID_INVALID;
4127 gid_t gid = GID_INVALID;
4128 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4129 n_keep_fds; /* total number of fds not to close */
4130 int secure_bits;
4131 _cleanup_free_ gid_t *gids_after_pam = NULL;
4132 int ngids_after_pam = 0;
4133
4134 assert(unit);
4135 assert(command);
4136 assert(context);
4137 assert(params);
4138 assert(exit_status);
4139
4140 /* Explicitly test for CVE-2021-4034 inspired invocations */
4141 assert(command->path);
4142 assert(!strv_isempty(command->argv));
4143
4144 rename_process_from_path(command->path);
4145
4146 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4147 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4148 * both of which will be demoted to SIG_DFL. */
4149 (void) default_signals(SIGNALS_CRASH_HANDLER,
4150 SIGNALS_IGNORE);
4151
4152 if (context->ignore_sigpipe)
4153 (void) ignore_signals(SIGPIPE);
4154
4155 r = reset_signal_mask();
4156 if (r < 0) {
4157 *exit_status = EXIT_SIGNAL_MASK;
4158 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4159 }
4160
4161 if (params->idle_pipe)
4162 do_idle_pipe_dance(params->idle_pipe);
4163
4164 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4165 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4166 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4167 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4168
4169 log_forget_fds();
4170 log_set_open_when_needed(true);
4171
4172 /* In case anything used libc syslog(), close this here, too */
4173 closelog();
4174
4175 int keep_fds[n_fds + 3];
4176 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4177 n_keep_fds = n_fds;
4178
4179 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4180 if (r < 0) {
4181 *exit_status = EXIT_FDS;
4182 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4183 }
4184
4185 #if HAVE_LIBBPF
4186 if (unit->manager->restrict_fs) {
4187 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4188 if (bpf_map_fd < 0) {
4189 *exit_status = EXIT_FDS;
4190 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4191 }
4192
4193 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4194 if (r < 0) {
4195 *exit_status = EXIT_FDS;
4196 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4197 }
4198 }
4199 #endif
4200
4201 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4202 if (r < 0) {
4203 *exit_status = EXIT_FDS;
4204 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4205 }
4206
4207 if (!context->same_pgrp &&
4208 setsid() < 0) {
4209 *exit_status = EXIT_SETSID;
4210 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4211 }
4212
4213 exec_context_tty_reset(context, params);
4214
4215 if (unit_shall_confirm_spawn(unit)) {
4216 _cleanup_free_ char *cmdline = NULL;
4217
4218 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4219 if (!cmdline) {
4220 *exit_status = EXIT_MEMORY;
4221 return log_oom();
4222 }
4223
4224 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4225 if (r != CONFIRM_EXECUTE) {
4226 if (r == CONFIRM_PRETEND_SUCCESS) {
4227 *exit_status = EXIT_SUCCESS;
4228 return 0;
4229 }
4230 *exit_status = EXIT_CONFIRM;
4231 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4232 "Execution cancelled by the user");
4233 }
4234 }
4235
4236 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4237 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4238 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4239 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4240 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4241 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4242 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4243 *exit_status = EXIT_MEMORY;
4244 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4245 }
4246
4247 if (context->dynamic_user && dcreds) {
4248 _cleanup_strv_free_ char **suggested_paths = NULL;
4249
4250 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4251 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4252 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4253 *exit_status = EXIT_USER;
4254 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4255 }
4256
4257 r = compile_suggested_paths(context, params, &suggested_paths);
4258 if (r < 0) {
4259 *exit_status = EXIT_MEMORY;
4260 return log_oom();
4261 }
4262
4263 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4264 if (r < 0) {
4265 *exit_status = EXIT_USER;
4266 if (r == -EILSEQ)
4267 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4268 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4269 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4270 }
4271
4272 if (!uid_is_valid(uid)) {
4273 *exit_status = EXIT_USER;
4274 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4275 }
4276
4277 if (!gid_is_valid(gid)) {
4278 *exit_status = EXIT_USER;
4279 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4280 }
4281
4282 if (dcreds->user)
4283 username = dcreds->user->name;
4284
4285 } else {
4286 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4287 if (r < 0) {
4288 *exit_status = EXIT_USER;
4289 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4290 }
4291
4292 r = get_fixed_group(context, &groupname, &gid);
4293 if (r < 0) {
4294 *exit_status = EXIT_GROUP;
4295 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4296 }
4297 }
4298
4299 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4300 r = get_supplementary_groups(context, username, groupname, gid,
4301 &supplementary_gids, &ngids);
4302 if (r < 0) {
4303 *exit_status = EXIT_GROUP;
4304 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4305 }
4306
4307 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4308 if (r < 0) {
4309 *exit_status = EXIT_USER;
4310 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4311 }
4312
4313 user_lookup_fd = safe_close(user_lookup_fd);
4314
4315 r = acquire_home(context, uid, &home, &home_buffer);
4316 if (r < 0) {
4317 *exit_status = EXIT_CHDIR;
4318 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4319 }
4320
4321 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4322 * must make sure to drop O_NONBLOCK */
4323 if (socket_fd >= 0)
4324 (void) fd_nonblock(socket_fd, false);
4325
4326 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4327 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4328 if (params->cgroup_path) {
4329 _cleanup_free_ char *p = NULL;
4330
4331 r = exec_parameters_get_cgroup_path(params, &p);
4332 if (r < 0) {
4333 *exit_status = EXIT_CGROUP;
4334 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4335 }
4336
4337 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4338 if (r == -EUCLEAN) {
4339 *exit_status = EXIT_CGROUP;
4340 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4341 "because the cgroup or one of its parents or "
4342 "siblings is in the threaded mode: %m", p);
4343 }
4344 if (r < 0) {
4345 *exit_status = EXIT_CGROUP;
4346 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4347 }
4348 }
4349
4350 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4351 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4352 if (r < 0) {
4353 *exit_status = EXIT_NETWORK;
4354 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4355 }
4356 }
4357
4358 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4359 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4360 if (r < 0) {
4361 *exit_status = EXIT_NAMESPACE;
4362 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4363 }
4364 }
4365
4366 r = setup_input(context, params, socket_fd, named_iofds);
4367 if (r < 0) {
4368 *exit_status = EXIT_STDIN;
4369 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4370 }
4371
4372 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4373 if (r < 0) {
4374 *exit_status = EXIT_STDOUT;
4375 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4376 }
4377
4378 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4379 if (r < 0) {
4380 *exit_status = EXIT_STDERR;
4381 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4382 }
4383
4384 if (context->oom_score_adjust_set) {
4385 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4386 * prohibit write access to this file, and we shouldn't trip up over that. */
4387 r = set_oom_score_adjust(context->oom_score_adjust);
4388 if (ERRNO_IS_PRIVILEGE(r))
4389 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4390 else if (r < 0) {
4391 *exit_status = EXIT_OOM_ADJUST;
4392 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4393 }
4394 }
4395
4396 if (context->coredump_filter_set) {
4397 r = set_coredump_filter(context->coredump_filter);
4398 if (ERRNO_IS_PRIVILEGE(r))
4399 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4400 else if (r < 0)
4401 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4402 }
4403
4404 if (context->nice_set) {
4405 r = setpriority_closest(context->nice);
4406 if (r < 0)
4407 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4408 }
4409
4410 if (context->cpu_sched_set) {
4411 struct sched_param param = {
4412 .sched_priority = context->cpu_sched_priority,
4413 };
4414
4415 r = sched_setscheduler(0,
4416 context->cpu_sched_policy |
4417 (context->cpu_sched_reset_on_fork ?
4418 SCHED_RESET_ON_FORK : 0),
4419 &param);
4420 if (r < 0) {
4421 *exit_status = EXIT_SETSCHEDULER;
4422 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4423 }
4424 }
4425
4426 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4427 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4428 const CPUSet *cpu_set;
4429
4430 if (context->cpu_affinity_from_numa) {
4431 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4432 if (r < 0) {
4433 *exit_status = EXIT_CPUAFFINITY;
4434 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4435 }
4436
4437 cpu_set = &converted_cpu_set;
4438 } else
4439 cpu_set = &context->cpu_set;
4440
4441 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4442 *exit_status = EXIT_CPUAFFINITY;
4443 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4444 }
4445 }
4446
4447 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4448 r = apply_numa_policy(&context->numa_policy);
4449 if (r == -EOPNOTSUPP)
4450 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4451 else if (r < 0) {
4452 *exit_status = EXIT_NUMA_POLICY;
4453 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4454 }
4455 }
4456
4457 if (context->ioprio_set)
4458 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4459 *exit_status = EXIT_IOPRIO;
4460 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4461 }
4462
4463 if (context->timer_slack_nsec != NSEC_INFINITY)
4464 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4465 *exit_status = EXIT_TIMERSLACK;
4466 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4467 }
4468
4469 if (context->personality != PERSONALITY_INVALID) {
4470 r = safe_personality(context->personality);
4471 if (r < 0) {
4472 *exit_status = EXIT_PERSONALITY;
4473 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4474 }
4475 }
4476
4477 if (context->utmp_id) {
4478 const char *line = context->tty_path ?
4479 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4480 NULL;
4481 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4482 line,
4483 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4484 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4485 USER_PROCESS,
4486 username);
4487 }
4488
4489 if (uid_is_valid(uid)) {
4490 r = chown_terminal(STDIN_FILENO, uid);
4491 if (r < 0) {
4492 *exit_status = EXIT_STDIN;
4493 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4494 }
4495 }
4496
4497 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4498 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4499 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4500 * touch a single hierarchy too. */
4501 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4502 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4503 if (r < 0) {
4504 *exit_status = EXIT_CGROUP;
4505 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4506 }
4507 }
4508
4509 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4510
4511 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4512 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4513 if (r < 0)
4514 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4515 }
4516
4517 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4518 r = setup_credentials(context, params, unit->id, uid);
4519 if (r < 0) {
4520 *exit_status = EXIT_CREDENTIALS;
4521 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4522 }
4523 }
4524
4525 r = build_environment(
4526 unit,
4527 context,
4528 params,
4529 n_fds,
4530 home,
4531 username,
4532 shell,
4533 journal_stream_dev,
4534 journal_stream_ino,
4535 &our_env);
4536 if (r < 0) {
4537 *exit_status = EXIT_MEMORY;
4538 return log_oom();
4539 }
4540
4541 r = build_pass_environment(context, &pass_env);
4542 if (r < 0) {
4543 *exit_status = EXIT_MEMORY;
4544 return log_oom();
4545 }
4546
4547 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4548 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4549 * not specify PATH but the unit has ExecSearchPath. */
4550 if (!strv_isempty(context->exec_search_path)) {
4551 _cleanup_free_ char *joined = NULL;
4552
4553 joined = strv_join(context->exec_search_path, ":");
4554 if (!joined) {
4555 *exit_status = EXIT_MEMORY;
4556 return log_oom();
4557 }
4558
4559 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4560 if (r < 0) {
4561 *exit_status = EXIT_MEMORY;
4562 return log_oom();
4563 }
4564 }
4565
4566 accum_env = strv_env_merge(params->environment,
4567 our_env,
4568 joined_exec_search_path,
4569 pass_env,
4570 context->environment,
4571 files_env);
4572 if (!accum_env) {
4573 *exit_status = EXIT_MEMORY;
4574 return log_oom();
4575 }
4576 accum_env = strv_env_clean(accum_env);
4577
4578 (void) umask(context->umask);
4579
4580 r = setup_keyring(unit, context, params, uid, gid);
4581 if (r < 0) {
4582 *exit_status = EXIT_KEYRING;
4583 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4584 }
4585
4586 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4587 * from it. */
4588 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4589
4590 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4591 * for it, and the kernel doesn't actually support ambient caps. */
4592 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4593
4594 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4595 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4596 * desired. */
4597 if (needs_ambient_hack)
4598 needs_setuid = false;
4599 else
4600 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4601
4602 if (needs_sandboxing) {
4603 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4604 * /sys being present. The actual MAC context application will happen later, as late as
4605 * possible, to avoid impacting our own code paths. */
4606
4607 #if HAVE_SELINUX
4608 use_selinux = mac_selinux_use();
4609 #endif
4610 #if ENABLE_SMACK
4611 use_smack = mac_smack_use();
4612 #endif
4613 #if HAVE_APPARMOR
4614 use_apparmor = mac_apparmor_use();
4615 #endif
4616 }
4617
4618 if (needs_sandboxing) {
4619 int which_failed;
4620
4621 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4622 * is set here. (See below.) */
4623
4624 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4625 if (r < 0) {
4626 *exit_status = EXIT_LIMITS;
4627 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4628 }
4629 }
4630
4631 if (needs_setuid && context->pam_name && username) {
4632 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4633 * wins here. (See above.) */
4634
4635 /* All fds passed in the fds array will be closed in the pam child process. */
4636 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4637 if (r < 0) {
4638 *exit_status = EXIT_PAM;
4639 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4640 }
4641
4642 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4643 if (ngids_after_pam < 0) {
4644 *exit_status = EXIT_MEMORY;
4645 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4646 }
4647 }
4648
4649 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
4650 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4651 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4652 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4653
4654 userns_set_up = true;
4655 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4656 if (r < 0) {
4657 *exit_status = EXIT_USER;
4658 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4659 }
4660 }
4661
4662 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4663
4664 if (ns_type_supported(NAMESPACE_NET)) {
4665 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4666 if (r == -EPERM)
4667 log_unit_warning_errno(unit, r,
4668 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4669 else if (r < 0) {
4670 *exit_status = EXIT_NETWORK;
4671 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4672 }
4673 } else if (context->network_namespace_path) {
4674 *exit_status = EXIT_NETWORK;
4675 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4676 "NetworkNamespacePath= is not supported, refusing.");
4677 } else
4678 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4679 }
4680
4681 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4682
4683 if (ns_type_supported(NAMESPACE_IPC)) {
4684 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4685 if (r == -EPERM)
4686 log_unit_warning_errno(unit, r,
4687 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4688 else if (r < 0) {
4689 *exit_status = EXIT_NAMESPACE;
4690 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4691 }
4692 } else if (context->ipc_namespace_path) {
4693 *exit_status = EXIT_NAMESPACE;
4694 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4695 "IPCNamespacePath= is not supported, refusing.");
4696 } else
4697 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4698 }
4699
4700 if (needs_mount_namespace) {
4701 _cleanup_free_ char *error_path = NULL;
4702
4703 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4704 if (r < 0) {
4705 *exit_status = EXIT_NAMESPACE;
4706 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4707 error_path ? ": " : "", strempty(error_path));
4708 }
4709 }
4710
4711 if (needs_sandboxing) {
4712 r = apply_protect_hostname(unit, context, exit_status);
4713 if (r < 0)
4714 return r;
4715 }
4716
4717 /* Drop groups as early as possible.
4718 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4719 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4720 if (needs_setuid) {
4721 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4722 int ngids_to_enforce = 0;
4723
4724 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4725 ngids,
4726 gids_after_pam,
4727 ngids_after_pam,
4728 &gids_to_enforce);
4729 if (ngids_to_enforce < 0) {
4730 *exit_status = EXIT_MEMORY;
4731 return log_unit_error_errno(unit,
4732 ngids_to_enforce,
4733 "Failed to merge group lists. Group membership might be incorrect: %m");
4734 }
4735
4736 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4737 if (r < 0) {
4738 *exit_status = EXIT_GROUP;
4739 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4740 }
4741 }
4742
4743 /* If the user namespace was not set up above, try to do it now.
4744 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4745 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4746 * case of mount namespaces being less privileged when the mount point list is copied from a
4747 * different user namespace). */
4748
4749 if (needs_sandboxing && context->private_users && !userns_set_up) {
4750 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4751 if (r < 0) {
4752 *exit_status = EXIT_USER;
4753 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4754 }
4755 }
4756
4757 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4758 * shall execute. */
4759
4760 _cleanup_free_ char *executable = NULL;
4761 _cleanup_close_ int executable_fd = -1;
4762 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4763 if (r < 0) {
4764 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4765 log_unit_struct_errno(unit, LOG_INFO, r,
4766 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4767 LOG_UNIT_INVOCATION_ID(unit),
4768 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4769 command->path),
4770 "EXECUTABLE=%s", command->path);
4771 return 0;
4772 }
4773
4774 *exit_status = EXIT_EXEC;
4775
4776 return log_unit_struct_errno(unit, LOG_INFO, r,
4777 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4778 LOG_UNIT_INVOCATION_ID(unit),
4779 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4780 command->path),
4781 "EXECUTABLE=%s", command->path);
4782 }
4783
4784 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4785 if (r < 0) {
4786 *exit_status = EXIT_FDS;
4787 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4788 }
4789
4790 #if HAVE_SELINUX
4791 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4792 int fd = -1;
4793
4794 if (socket_fd >= 0)
4795 fd = socket_fd;
4796 else if (params->n_socket_fds == 1)
4797 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4798 * use context from that fd to compute the label. */
4799 fd = params->fds[0];
4800
4801 if (fd >= 0) {
4802 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4803 if (r < 0) {
4804 if (!context->selinux_context_ignore) {
4805 *exit_status = EXIT_SELINUX_CONTEXT;
4806 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4807 }
4808 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4809 }
4810 }
4811 }
4812 #endif
4813
4814 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4815 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4816 * however if we have it as we want to keep it open until the final execve(). */
4817
4818 r = close_all_fds(keep_fds, n_keep_fds);
4819 if (r >= 0)
4820 r = shift_fds(fds, n_fds);
4821 if (r >= 0)
4822 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
4823 if (r < 0) {
4824 *exit_status = EXIT_FDS;
4825 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4826 }
4827
4828 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4829 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4830 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4831 * came this far. */
4832
4833 secure_bits = context->secure_bits;
4834
4835 if (needs_sandboxing) {
4836 uint64_t bset;
4837
4838 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4839 * requested. (Note this is placed after the general resource limit initialization, see
4840 * above, in order to take precedence.) */
4841 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4842 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4843 *exit_status = EXIT_LIMITS;
4844 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4845 }
4846 }
4847
4848 #if ENABLE_SMACK
4849 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4850 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4851 if (use_smack) {
4852 r = setup_smack(unit->manager, context, executable_fd);
4853 if (r < 0 && !context->smack_process_label_ignore) {
4854 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4855 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4856 }
4857 }
4858 #endif
4859
4860 bset = context->capability_bounding_set;
4861 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4862 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4863 * instead of us doing that */
4864 if (needs_ambient_hack)
4865 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4866 (UINT64_C(1) << CAP_SETUID) |
4867 (UINT64_C(1) << CAP_SETGID);
4868
4869 if (!cap_test_all(bset)) {
4870 r = capability_bounding_set_drop(bset, false);
4871 if (r < 0) {
4872 *exit_status = EXIT_CAPABILITIES;
4873 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4874 }
4875 }
4876
4877 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4878 * keep-caps set.
4879 * To be able to raise the ambient capabilities after setresuid() they have to be
4880 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4881 * After setresuid() the ambient capabilities can be raised as they are present in
4882 * the permitted and inheritable set. However it is possible that someone wants to
4883 * set ambient capabilities without changing the user, so we also set the ambient
4884 * capabilities here.
4885 * The requested ambient capabilities are raised in the inheritable set if the
4886 * second argument is true. */
4887 if (!needs_ambient_hack) {
4888 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4889 if (r < 0) {
4890 *exit_status = EXIT_CAPABILITIES;
4891 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4892 }
4893 }
4894 }
4895
4896 /* chroot to root directory first, before we lose the ability to chroot */
4897 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4898 if (r < 0)
4899 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4900
4901 if (needs_setuid) {
4902 if (uid_is_valid(uid)) {
4903 r = enforce_user(context, uid);
4904 if (r < 0) {
4905 *exit_status = EXIT_USER;
4906 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4907 }
4908
4909 if (!needs_ambient_hack &&
4910 context->capability_ambient_set != 0) {
4911
4912 /* Raise the ambient capabilities after user change. */
4913 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4914 if (r < 0) {
4915 *exit_status = EXIT_CAPABILITIES;
4916 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4917 }
4918 }
4919 }
4920 }
4921
4922 /* Apply working directory here, because the working directory might be on NFS and only the user running
4923 * this service might have the correct privilege to change to the working directory */
4924 r = apply_working_directory(context, params, home, exit_status);
4925 if (r < 0)
4926 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4927
4928 if (needs_sandboxing) {
4929 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4930 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4931 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4932 * are restricted. */
4933
4934 #if HAVE_SELINUX
4935 if (use_selinux) {
4936 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4937
4938 if (exec_context) {
4939 r = setexeccon(exec_context);
4940 if (r < 0) {
4941 if (!context->selinux_context_ignore) {
4942 *exit_status = EXIT_SELINUX_CONTEXT;
4943 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4944 }
4945 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4946 }
4947 }
4948 }
4949 #endif
4950
4951 #if HAVE_APPARMOR
4952 if (use_apparmor && context->apparmor_profile) {
4953 r = aa_change_onexec(context->apparmor_profile);
4954 if (r < 0 && !context->apparmor_profile_ignore) {
4955 *exit_status = EXIT_APPARMOR_PROFILE;
4956 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4957 }
4958 }
4959 #endif
4960
4961 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
4962 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4963 * CAP_SETPCAP. */
4964 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4965 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4966 * effective set here.
4967 * The effective set is overwritten during execve with the following values:
4968 * - ambient set (for non-root processes)
4969 * - (inheritable | bounding) set for root processes)
4970 *
4971 * Hence there is no security impact to raise it in the effective set before execve
4972 */
4973 r = capability_gain_cap_setpcap(NULL);
4974 if (r < 0) {
4975 *exit_status = EXIT_CAPABILITIES;
4976 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4977 }
4978 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4979 *exit_status = EXIT_SECUREBITS;
4980 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4981 }
4982 }
4983
4984 if (context_has_no_new_privileges(context))
4985 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4986 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4987 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4988 }
4989
4990 #if HAVE_SECCOMP
4991 r = apply_address_families(unit, context);
4992 if (r < 0) {
4993 *exit_status = EXIT_ADDRESS_FAMILIES;
4994 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4995 }
4996
4997 r = apply_memory_deny_write_execute(unit, context);
4998 if (r < 0) {
4999 *exit_status = EXIT_SECCOMP;
5000 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5001 }
5002
5003 r = apply_restrict_realtime(unit, context);
5004 if (r < 0) {
5005 *exit_status = EXIT_SECCOMP;
5006 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5007 }
5008
5009 r = apply_restrict_suid_sgid(unit, context);
5010 if (r < 0) {
5011 *exit_status = EXIT_SECCOMP;
5012 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5013 }
5014
5015 r = apply_restrict_namespaces(unit, context);
5016 if (r < 0) {
5017 *exit_status = EXIT_SECCOMP;
5018 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5019 }
5020
5021 r = apply_protect_sysctl(unit, context);
5022 if (r < 0) {
5023 *exit_status = EXIT_SECCOMP;
5024 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5025 }
5026
5027 r = apply_protect_kernel_modules(unit, context);
5028 if (r < 0) {
5029 *exit_status = EXIT_SECCOMP;
5030 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5031 }
5032
5033 r = apply_protect_kernel_logs(unit, context);
5034 if (r < 0) {
5035 *exit_status = EXIT_SECCOMP;
5036 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5037 }
5038
5039 r = apply_protect_clock(unit, context);
5040 if (r < 0) {
5041 *exit_status = EXIT_SECCOMP;
5042 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5043 }
5044
5045 r = apply_private_devices(unit, context);
5046 if (r < 0) {
5047 *exit_status = EXIT_SECCOMP;
5048 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5049 }
5050
5051 r = apply_syscall_archs(unit, context);
5052 if (r < 0) {
5053 *exit_status = EXIT_SECCOMP;
5054 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5055 }
5056
5057 r = apply_lock_personality(unit, context);
5058 if (r < 0) {
5059 *exit_status = EXIT_SECCOMP;
5060 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5061 }
5062
5063 r = apply_syscall_log(unit, context);
5064 if (r < 0) {
5065 *exit_status = EXIT_SECCOMP;
5066 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5067 }
5068
5069 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5070 * by the filter as little as possible. */
5071 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5072 if (r < 0) {
5073 *exit_status = EXIT_SECCOMP;
5074 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5075 }
5076 #endif
5077
5078 #if HAVE_LIBBPF
5079 r = apply_restrict_filesystems(unit, context);
5080 if (r < 0) {
5081 *exit_status = EXIT_BPF;
5082 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5083 }
5084 #endif
5085
5086 }
5087
5088 if (!strv_isempty(context->unset_environment)) {
5089 char **ee = NULL;
5090
5091 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5092 if (!ee) {
5093 *exit_status = EXIT_MEMORY;
5094 return log_oom();
5095 }
5096
5097 strv_free_and_replace(accum_env, ee);
5098 }
5099
5100 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5101 replaced_argv = replace_env_argv(command->argv, accum_env);
5102 if (!replaced_argv) {
5103 *exit_status = EXIT_MEMORY;
5104 return log_oom();
5105 }
5106 final_argv = replaced_argv;
5107 } else
5108 final_argv = command->argv;
5109
5110 if (DEBUG_LOGGING) {
5111 _cleanup_free_ char *line = NULL;
5112
5113 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5114 if (!line) {
5115 *exit_status = EXIT_MEMORY;
5116 return log_oom();
5117 }
5118
5119 log_unit_struct(unit, LOG_DEBUG,
5120 "EXECUTABLE=%s", executable,
5121 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5122 }
5123
5124 if (exec_fd >= 0) {
5125 uint8_t hot = 1;
5126
5127 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5128 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5129
5130 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5131 *exit_status = EXIT_EXEC;
5132 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5133 }
5134 }
5135
5136 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5137
5138 if (exec_fd >= 0) {
5139 uint8_t hot = 0;
5140
5141 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5142 * that POLLHUP on it no longer means execve() succeeded. */
5143
5144 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5145 *exit_status = EXIT_EXEC;
5146 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5147 }
5148 }
5149
5150 *exit_status = EXIT_EXEC;
5151 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5152 }
5153
5154 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5155 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5156
5157 int exec_spawn(Unit *unit,
5158 ExecCommand *command,
5159 const ExecContext *context,
5160 const ExecParameters *params,
5161 ExecRuntime *runtime,
5162 DynamicCreds *dcreds,
5163 pid_t *ret) {
5164
5165 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5166 _cleanup_free_ char *subcgroup_path = NULL;
5167 _cleanup_strv_free_ char **files_env = NULL;
5168 size_t n_storage_fds = 0, n_socket_fds = 0;
5169 _cleanup_free_ char *line = NULL;
5170 pid_t pid;
5171
5172 assert(unit);
5173 assert(command);
5174 assert(context);
5175 assert(ret);
5176 assert(params);
5177 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5178
5179 if (context->std_input == EXEC_INPUT_SOCKET ||
5180 context->std_output == EXEC_OUTPUT_SOCKET ||
5181 context->std_error == EXEC_OUTPUT_SOCKET) {
5182
5183 if (params->n_socket_fds > 1)
5184 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5185
5186 if (params->n_socket_fds == 0)
5187 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5188
5189 socket_fd = params->fds[0];
5190 } else {
5191 socket_fd = -1;
5192 fds = params->fds;
5193 n_socket_fds = params->n_socket_fds;
5194 n_storage_fds = params->n_storage_fds;
5195 }
5196
5197 r = exec_context_named_iofds(context, params, named_iofds);
5198 if (r < 0)
5199 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5200
5201 r = exec_context_load_environment(unit, context, &files_env);
5202 if (r < 0)
5203 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5204
5205 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5206 if (!line)
5207 return log_oom();
5208
5209 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5210 and, until the next SELinux policy changes, we save further reloads in future children. */
5211 mac_selinux_maybe_reload();
5212
5213 log_unit_struct(unit, LOG_DEBUG,
5214 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5215 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5216 the mount namespace in the child, but we want to log
5217 from the parent, so we need to use the (possibly
5218 inaccurate) path here. */
5219 LOG_UNIT_INVOCATION_ID(unit));
5220
5221 if (params->cgroup_path) {
5222 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5223 if (r < 0)
5224 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5225 if (r > 0) { /* We are using a child cgroup */
5226 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5227 if (r < 0)
5228 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5229
5230 /* Normally we would not propagate the oomd xattrs to children but since we created this
5231 * sub-cgroup internally we should do it. */
5232 cgroup_oomd_xattr_apply(unit, subcgroup_path);
5233 }
5234 }
5235
5236 pid = fork();
5237 if (pid < 0)
5238 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5239
5240 if (pid == 0) {
5241 int exit_status = EXIT_SUCCESS;
5242
5243 r = exec_child(unit,
5244 command,
5245 context,
5246 params,
5247 runtime,
5248 dcreds,
5249 socket_fd,
5250 named_iofds,
5251 fds,
5252 n_socket_fds,
5253 n_storage_fds,
5254 files_env,
5255 unit->manager->user_lookup_fds[1],
5256 &exit_status);
5257
5258 if (r < 0) {
5259 const char *status =
5260 exit_status_to_string(exit_status,
5261 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5262
5263 log_unit_struct_errno(unit, LOG_ERR, r,
5264 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5265 LOG_UNIT_INVOCATION_ID(unit),
5266 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5267 status, command->path),
5268 "EXECUTABLE=%s", command->path);
5269 }
5270
5271 _exit(exit_status);
5272 }
5273
5274 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5275
5276 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5277 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5278 * process will be killed too). */
5279 if (subcgroup_path)
5280 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5281
5282 exec_status_start(&command->exec_status, pid);
5283
5284 *ret = pid;
5285 return 0;
5286 }
5287
5288 void exec_context_init(ExecContext *c) {
5289 assert(c);
5290
5291 c->umask = 0022;
5292 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5293 c->cpu_sched_policy = SCHED_OTHER;
5294 c->syslog_priority = LOG_DAEMON|LOG_INFO;
5295 c->syslog_level_prefix = true;
5296 c->ignore_sigpipe = true;
5297 c->timer_slack_nsec = NSEC_INFINITY;
5298 c->personality = PERSONALITY_INVALID;
5299 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5300 c->directories[t].mode = 0755;
5301 c->timeout_clean_usec = USEC_INFINITY;
5302 c->capability_bounding_set = CAP_ALL;
5303 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5304 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5305 c->log_level_max = -1;
5306 #if HAVE_SECCOMP
5307 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5308 #endif
5309 c->tty_rows = UINT_MAX;
5310 c->tty_cols = UINT_MAX;
5311 numa_policy_reset(&c->numa_policy);
5312 }
5313
5314 void exec_context_done(ExecContext *c) {
5315 assert(c);
5316
5317 c->environment = strv_free(c->environment);
5318 c->environment_files = strv_free(c->environment_files);
5319 c->pass_environment = strv_free(c->pass_environment);
5320 c->unset_environment = strv_free(c->unset_environment);
5321
5322 rlimit_free_all(c->rlimit);
5323
5324 for (size_t l = 0; l < 3; l++) {
5325 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5326 c->stdio_file[l] = mfree(c->stdio_file[l]);
5327 }
5328
5329 c->working_directory = mfree(c->working_directory);
5330 c->root_directory = mfree(c->root_directory);
5331 c->root_image = mfree(c->root_image);
5332 c->root_image_options = mount_options_free_all(c->root_image_options);
5333 c->root_hash = mfree(c->root_hash);
5334 c->root_hash_size = 0;
5335 c->root_hash_path = mfree(c->root_hash_path);
5336 c->root_hash_sig = mfree(c->root_hash_sig);
5337 c->root_hash_sig_size = 0;
5338 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5339 c->root_verity = mfree(c->root_verity);
5340 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5341 c->extension_directories = strv_free(c->extension_directories);
5342 c->tty_path = mfree(c->tty_path);
5343 c->syslog_identifier = mfree(c->syslog_identifier);
5344 c->user = mfree(c->user);
5345 c->group = mfree(c->group);
5346
5347 c->supplementary_groups = strv_free(c->supplementary_groups);
5348
5349 c->pam_name = mfree(c->pam_name);
5350
5351 c->read_only_paths = strv_free(c->read_only_paths);
5352 c->read_write_paths = strv_free(c->read_write_paths);
5353 c->inaccessible_paths = strv_free(c->inaccessible_paths);
5354 c->exec_paths = strv_free(c->exec_paths);
5355 c->no_exec_paths = strv_free(c->no_exec_paths);
5356 c->exec_search_path = strv_free(c->exec_search_path);
5357
5358 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5359 c->bind_mounts = NULL;
5360 c->n_bind_mounts = 0;
5361 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5362 c->temporary_filesystems = NULL;
5363 c->n_temporary_filesystems = 0;
5364 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5365
5366 cpu_set_reset(&c->cpu_set);
5367 numa_policy_reset(&c->numa_policy);
5368
5369 c->utmp_id = mfree(c->utmp_id);
5370 c->selinux_context = mfree(c->selinux_context);
5371 c->apparmor_profile = mfree(c->apparmor_profile);
5372 c->smack_process_label = mfree(c->smack_process_label);
5373
5374 c->restrict_filesystems = set_free(c->restrict_filesystems);
5375
5376 c->syscall_filter = hashmap_free(c->syscall_filter);
5377 c->syscall_archs = set_free(c->syscall_archs);
5378 c->address_families = set_free(c->address_families);
5379
5380 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5381 exec_directory_done(&c->directories[t]);
5382
5383 c->log_level_max = -1;
5384
5385 exec_context_free_log_extra_fields(c);
5386
5387 c->log_ratelimit_interval_usec = 0;
5388 c->log_ratelimit_burst = 0;
5389
5390 c->stdin_data = mfree(c->stdin_data);
5391 c->stdin_data_size = 0;
5392
5393 c->network_namespace_path = mfree(c->network_namespace_path);
5394 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5395
5396 c->log_namespace = mfree(c->log_namespace);
5397
5398 c->load_credentials = hashmap_free(c->load_credentials);
5399 c->set_credentials = hashmap_free(c->set_credentials);
5400 }
5401
5402 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5403 assert(c);
5404
5405 if (!runtime_prefix)
5406 return 0;
5407
5408 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5409 _cleanup_free_ char *p = NULL;
5410
5411 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5412 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5413 else
5414 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5415 if (!p)
5416 return -ENOMEM;
5417
5418 /* We execute this synchronously, since we need to be sure this is gone when we start the
5419 * service next. */
5420 (void) rm_rf(p, REMOVE_ROOT);
5421
5422 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5423 _cleanup_free_ char *symlink_abs = NULL;
5424
5425 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5426 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5427 else
5428 symlink_abs = path_join(runtime_prefix, *symlink);
5429 if (!symlink_abs)
5430 return -ENOMEM;
5431
5432 (void) unlink(symlink_abs);
5433 }
5434
5435 }
5436
5437 return 0;
5438 }
5439
5440 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5441 _cleanup_free_ char *p = NULL;
5442
5443 assert(c);
5444
5445 if (!runtime_prefix || !unit)
5446 return 0;
5447
5448 p = path_join(runtime_prefix, "credentials", unit);
5449 if (!p)
5450 return -ENOMEM;
5451
5452 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5453 * unmount it, and afterwards remove the mount point */
5454 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5455 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5456
5457 return 0;
5458 }
5459
5460 static void exec_command_done(ExecCommand *c) {
5461 assert(c);
5462
5463 c->path = mfree(c->path);
5464 c->argv = strv_free(c->argv);
5465 }
5466
5467 void exec_command_done_array(ExecCommand *c, size_t n) {
5468 for (size_t i = 0; i < n; i++)
5469 exec_command_done(c+i);
5470 }
5471
5472 ExecCommand* exec_command_free_list(ExecCommand *c) {
5473 ExecCommand *i;
5474
5475 while ((i = c)) {
5476 LIST_REMOVE(command, c, i);
5477 exec_command_done(i);
5478 free(i);
5479 }
5480
5481 return NULL;
5482 }
5483
5484 void exec_command_free_array(ExecCommand **c, size_t n) {
5485 for (size_t i = 0; i < n; i++)
5486 c[i] = exec_command_free_list(c[i]);
5487 }
5488
5489 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5490 for (size_t i = 0; i < n; i++)
5491 exec_status_reset(&c[i].exec_status);
5492 }
5493
5494 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5495 for (size_t i = 0; i < n; i++)
5496 LIST_FOREACH(command, z, c[i])
5497 exec_status_reset(&z->exec_status);
5498 }
5499
5500 typedef struct InvalidEnvInfo {
5501 const Unit *unit;
5502 const char *path;
5503 } InvalidEnvInfo;
5504
5505 static void invalid_env(const char *p, void *userdata) {
5506 InvalidEnvInfo *info = userdata;
5507
5508 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5509 }
5510
5511 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5512 assert(c);
5513
5514 switch (fd_index) {
5515
5516 case STDIN_FILENO:
5517 if (c->std_input != EXEC_INPUT_NAMED_FD)
5518 return NULL;
5519
5520 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5521
5522 case STDOUT_FILENO:
5523 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5524 return NULL;
5525
5526 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5527
5528 case STDERR_FILENO:
5529 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5530 return NULL;
5531
5532 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5533
5534 default:
5535 return NULL;
5536 }
5537 }
5538
5539 static int exec_context_named_iofds(
5540 const ExecContext *c,
5541 const ExecParameters *p,
5542 int named_iofds[static 3]) {
5543
5544 size_t targets;
5545 const char* stdio_fdname[3];
5546 size_t n_fds;
5547
5548 assert(c);
5549 assert(p);
5550 assert(named_iofds);
5551
5552 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5553 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5554 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5555
5556 for (size_t i = 0; i < 3; i++)
5557 stdio_fdname[i] = exec_context_fdname(c, i);
5558
5559 n_fds = p->n_storage_fds + p->n_socket_fds;
5560
5561 for (size_t i = 0; i < n_fds && targets > 0; i++)
5562 if (named_iofds[STDIN_FILENO] < 0 &&
5563 c->std_input == EXEC_INPUT_NAMED_FD &&
5564 stdio_fdname[STDIN_FILENO] &&
5565 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5566
5567 named_iofds[STDIN_FILENO] = p->fds[i];
5568 targets--;
5569
5570 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5571 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5572 stdio_fdname[STDOUT_FILENO] &&
5573 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5574
5575 named_iofds[STDOUT_FILENO] = p->fds[i];
5576 targets--;
5577
5578 } else if (named_iofds[STDERR_FILENO] < 0 &&
5579 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5580 stdio_fdname[STDERR_FILENO] &&
5581 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5582
5583 named_iofds[STDERR_FILENO] = p->fds[i];
5584 targets--;
5585 }
5586
5587 return targets == 0 ? 0 : -ENOENT;
5588 }
5589
5590 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5591 _cleanup_strv_free_ char **v = NULL;
5592 int r;
5593
5594 assert(c);
5595 assert(ret);
5596
5597 STRV_FOREACH(i, c->environment_files) {
5598 _cleanup_globfree_ glob_t pglob = {};
5599 bool ignore = false;
5600 char *fn = *i;
5601
5602 if (fn[0] == '-') {
5603 ignore = true;
5604 fn++;
5605 }
5606
5607 if (!path_is_absolute(fn)) {
5608 if (ignore)
5609 continue;
5610 return -EINVAL;
5611 }
5612
5613 /* Filename supports globbing, take all matching files */
5614 r = safe_glob(fn, 0, &pglob);
5615 if (r < 0) {
5616 if (ignore)
5617 continue;
5618 return r;
5619 }
5620
5621 /* When we don't match anything, -ENOENT should be returned */
5622 assert(pglob.gl_pathc > 0);
5623
5624 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5625 _cleanup_strv_free_ char **p = NULL;
5626
5627 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5628 if (r < 0) {
5629 if (ignore)
5630 continue;
5631 return r;
5632 }
5633
5634 /* Log invalid environment variables with filename */
5635 if (p) {
5636 InvalidEnvInfo info = {
5637 .unit = unit,
5638 .path = pglob.gl_pathv[n]
5639 };
5640
5641 p = strv_env_clean_with_callback(p, invalid_env, &info);
5642 }
5643
5644 if (!v)
5645 v = TAKE_PTR(p);
5646 else {
5647 char **m = strv_env_merge(v, p);
5648 if (!m)
5649 return -ENOMEM;
5650
5651 strv_free_and_replace(v, m);
5652 }
5653 }
5654 }
5655
5656 *ret = TAKE_PTR(v);
5657
5658 return 0;
5659 }
5660
5661 static bool tty_may_match_dev_console(const char *tty) {
5662 _cleanup_free_ char *resolved = NULL;
5663
5664 if (!tty)
5665 return true;
5666
5667 tty = skip_dev_prefix(tty);
5668
5669 /* trivial identity? */
5670 if (streq(tty, "console"))
5671 return true;
5672
5673 if (resolve_dev_console(&resolved) < 0)
5674 return true; /* if we could not resolve, assume it may */
5675
5676 /* "tty0" means the active VC, so it may be the same sometimes */
5677 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5678 }
5679
5680 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5681 assert(ec);
5682
5683 return ec->tty_reset ||
5684 ec->tty_vhangup ||
5685 ec->tty_vt_disallocate ||
5686 is_terminal_input(ec->std_input) ||
5687 is_terminal_output(ec->std_output) ||
5688 is_terminal_output(ec->std_error);
5689 }
5690
5691 bool exec_context_may_touch_console(const ExecContext *ec) {
5692
5693 return exec_context_may_touch_tty(ec) &&
5694 tty_may_match_dev_console(exec_context_tty_path(ec));
5695 }
5696
5697 static void strv_fprintf(FILE *f, char **l) {
5698 assert(f);
5699
5700 STRV_FOREACH(g, l)
5701 fprintf(f, " %s", *g);
5702 }
5703
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Writes nothing at all if the
 * string list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5715
5716 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5717 int r;
5718
5719 assert(c);
5720 assert(f);
5721
5722 prefix = strempty(prefix);
5723
5724 fprintf(f,
5725 "%sUMask: %04o\n"
5726 "%sWorkingDirectory: %s\n"
5727 "%sRootDirectory: %s\n"
5728 "%sNonBlocking: %s\n"
5729 "%sPrivateTmp: %s\n"
5730 "%sPrivateDevices: %s\n"
5731 "%sProtectKernelTunables: %s\n"
5732 "%sProtectKernelModules: %s\n"
5733 "%sProtectKernelLogs: %s\n"
5734 "%sProtectClock: %s\n"
5735 "%sProtectControlGroups: %s\n"
5736 "%sPrivateNetwork: %s\n"
5737 "%sPrivateUsers: %s\n"
5738 "%sProtectHome: %s\n"
5739 "%sProtectSystem: %s\n"
5740 "%sMountAPIVFS: %s\n"
5741 "%sIgnoreSIGPIPE: %s\n"
5742 "%sMemoryDenyWriteExecute: %s\n"
5743 "%sRestrictRealtime: %s\n"
5744 "%sRestrictSUIDSGID: %s\n"
5745 "%sKeyringMode: %s\n"
5746 "%sProtectHostname: %s\n"
5747 "%sProtectProc: %s\n"
5748 "%sProcSubset: %s\n",
5749 prefix, c->umask,
5750 prefix, empty_to_root(c->working_directory),
5751 prefix, empty_to_root(c->root_directory),
5752 prefix, yes_no(c->non_blocking),
5753 prefix, yes_no(c->private_tmp),
5754 prefix, yes_no(c->private_devices),
5755 prefix, yes_no(c->protect_kernel_tunables),
5756 prefix, yes_no(c->protect_kernel_modules),
5757 prefix, yes_no(c->protect_kernel_logs),
5758 prefix, yes_no(c->protect_clock),
5759 prefix, yes_no(c->protect_control_groups),
5760 prefix, yes_no(c->private_network),
5761 prefix, yes_no(c->private_users),
5762 prefix, protect_home_to_string(c->protect_home),
5763 prefix, protect_system_to_string(c->protect_system),
5764 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5765 prefix, yes_no(c->ignore_sigpipe),
5766 prefix, yes_no(c->memory_deny_write_execute),
5767 prefix, yes_no(c->restrict_realtime),
5768 prefix, yes_no(c->restrict_suid_sgid),
5769 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5770 prefix, yes_no(c->protect_hostname),
5771 prefix, protect_proc_to_string(c->protect_proc),
5772 prefix, proc_subset_to_string(c->proc_subset));
5773
5774 if (c->root_image)
5775 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5776
5777 if (c->root_image_options) {
5778 fprintf(f, "%sRootImageOptions:", prefix);
5779 LIST_FOREACH(mount_options, o, c->root_image_options)
5780 if (!isempty(o->options))
5781 fprintf(f, " %s:%s",
5782 partition_designator_to_string(o->partition_designator),
5783 o->options);
5784 fprintf(f, "\n");
5785 }
5786
5787 if (c->root_hash) {
5788 _cleanup_free_ char *encoded = NULL;
5789 encoded = hexmem(c->root_hash, c->root_hash_size);
5790 if (encoded)
5791 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5792 }
5793
5794 if (c->root_hash_path)
5795 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5796
5797 if (c->root_hash_sig) {
5798 _cleanup_free_ char *encoded = NULL;
5799 ssize_t len;
5800 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5801 if (len)
5802 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5803 }
5804
5805 if (c->root_hash_sig_path)
5806 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5807
5808 if (c->root_verity)
5809 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5810
5811 STRV_FOREACH(e, c->environment)
5812 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5813
5814 STRV_FOREACH(e, c->environment_files)
5815 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5816
5817 STRV_FOREACH(e, c->pass_environment)
5818 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5819
5820 STRV_FOREACH(e, c->unset_environment)
5821 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5822
5823 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5824
5825 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5826 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5827
5828 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5829 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5830
5831 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5832 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5833 }
5834 }
5835
5836 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5837
5838 if (c->nice_set)
5839 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5840
5841 if (c->oom_score_adjust_set)
5842 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5843
5844 if (c->coredump_filter_set)
5845 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5846
5847 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5848 if (c->rlimit[i]) {
5849 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5850 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5851 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5852 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5853 }
5854
5855 if (c->ioprio_set) {
5856 _cleanup_free_ char *class_str = NULL;
5857
5858 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5859 if (r >= 0)
5860 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5861
5862 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5863 }
5864
5865 if (c->cpu_sched_set) {
5866 _cleanup_free_ char *policy_str = NULL;
5867
5868 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5869 if (r >= 0)
5870 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5871
5872 fprintf(f,
5873 "%sCPUSchedulingPriority: %i\n"
5874 "%sCPUSchedulingResetOnFork: %s\n",
5875 prefix, c->cpu_sched_priority,
5876 prefix, yes_no(c->cpu_sched_reset_on_fork));
5877 }
5878
5879 if (c->cpu_set.set) {
5880 _cleanup_free_ char *affinity = NULL;
5881
5882 affinity = cpu_set_to_range_string(&c->cpu_set);
5883 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5884 }
5885
5886 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5887 _cleanup_free_ char *nodes = NULL;
5888
5889 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5890 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5891 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5892 }
5893
5894 if (c->timer_slack_nsec != NSEC_INFINITY)
5895 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5896
5897 fprintf(f,
5898 "%sStandardInput: %s\n"
5899 "%sStandardOutput: %s\n"
5900 "%sStandardError: %s\n",
5901 prefix, exec_input_to_string(c->std_input),
5902 prefix, exec_output_to_string(c->std_output),
5903 prefix, exec_output_to_string(c->std_error));
5904
5905 if (c->std_input == EXEC_INPUT_NAMED_FD)
5906 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5907 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5908 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5909 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5910 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5911
5912 if (c->std_input == EXEC_INPUT_FILE)
5913 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5914 if (c->std_output == EXEC_OUTPUT_FILE)
5915 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5916 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5917 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5918 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5919 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5920 if (c->std_error == EXEC_OUTPUT_FILE)
5921 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5922 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5923 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5924 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5925 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5926
5927 if (c->tty_path)
5928 fprintf(f,
5929 "%sTTYPath: %s\n"
5930 "%sTTYReset: %s\n"
5931 "%sTTYVHangup: %s\n"
5932 "%sTTYVTDisallocate: %s\n"
5933 "%sTTYRows: %u\n"
5934 "%sTTYColumns: %u\n",
5935 prefix, c->tty_path,
5936 prefix, yes_no(c->tty_reset),
5937 prefix, yes_no(c->tty_vhangup),
5938 prefix, yes_no(c->tty_vt_disallocate),
5939 prefix, c->tty_rows,
5940 prefix, c->tty_cols);
5941
5942 if (IN_SET(c->std_output,
5943 EXEC_OUTPUT_KMSG,
5944 EXEC_OUTPUT_JOURNAL,
5945 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5946 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5947 IN_SET(c->std_error,
5948 EXEC_OUTPUT_KMSG,
5949 EXEC_OUTPUT_JOURNAL,
5950 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5951 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5952
5953 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5954
5955 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5956 if (r >= 0)
5957 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5958
5959 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5960 if (r >= 0)
5961 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5962 }
5963
5964 if (c->log_level_max >= 0) {
5965 _cleanup_free_ char *t = NULL;
5966
5967 (void) log_level_to_string_alloc(c->log_level_max, &t);
5968
5969 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5970 }
5971
5972 if (c->log_ratelimit_interval_usec > 0)
5973 fprintf(f,
5974 "%sLogRateLimitIntervalSec: %s\n",
5975 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5976
5977 if (c->log_ratelimit_burst > 0)
5978 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5979
5980 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5981 fprintf(f, "%sLogExtraFields: ", prefix);
5982 fwrite(c->log_extra_fields[j].iov_base,
5983 1, c->log_extra_fields[j].iov_len,
5984 f);
5985 fputc('\n', f);
5986 }
5987
5988 if (c->log_namespace)
5989 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5990
5991 if (c->secure_bits) {
5992 _cleanup_free_ char *str = NULL;
5993
5994 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5995 if (r >= 0)
5996 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5997 }
5998
5999 if (c->capability_bounding_set != CAP_ALL) {
6000 _cleanup_free_ char *str = NULL;
6001
6002 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6003 if (r >= 0)
6004 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6005 }
6006
6007 if (c->capability_ambient_set != 0) {
6008 _cleanup_free_ char *str = NULL;
6009
6010 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6011 if (r >= 0)
6012 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6013 }
6014
6015 if (c->user)
6016 fprintf(f, "%sUser: %s\n", prefix, c->user);
6017 if (c->group)
6018 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6019
6020 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6021
6022 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6023
6024 if (c->pam_name)
6025 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6026
6027 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6028 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6029 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6030 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6031 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6032 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6033
6034 for (size_t i = 0; i < c->n_bind_mounts; i++)
6035 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6036 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6037 c->bind_mounts[i].ignore_enoent ? "-": "",
6038 c->bind_mounts[i].source,
6039 c->bind_mounts[i].destination,
6040 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6041
6042 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6043 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6044
6045 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6046 t->path,
6047 isempty(t->options) ? "" : ":",
6048 strempty(t->options));
6049 }
6050
6051 if (c->utmp_id)
6052 fprintf(f,
6053 "%sUtmpIdentifier: %s\n",
6054 prefix, c->utmp_id);
6055
6056 if (c->selinux_context)
6057 fprintf(f,
6058 "%sSELinuxContext: %s%s\n",
6059 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6060
6061 if (c->apparmor_profile)
6062 fprintf(f,
6063 "%sAppArmorProfile: %s%s\n",
6064 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6065
6066 if (c->smack_process_label)
6067 fprintf(f,
6068 "%sSmackProcessLabel: %s%s\n",
6069 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6070
6071 if (c->personality != PERSONALITY_INVALID)
6072 fprintf(f,
6073 "%sPersonality: %s\n",
6074 prefix, strna(personality_to_string(c->personality)));
6075
6076 fprintf(f,
6077 "%sLockPersonality: %s\n",
6078 prefix, yes_no(c->lock_personality));
6079
6080 if (c->syscall_filter) {
6081 fprintf(f,
6082 "%sSystemCallFilter: ",
6083 prefix);
6084
6085 if (!c->syscall_allow_list)
6086 fputc('~', f);
6087
6088 #if HAVE_SECCOMP
6089 void *id, *val;
6090 bool first = true;
6091 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6092 _cleanup_free_ char *name = NULL;
6093 const char *errno_name = NULL;
6094 int num = PTR_TO_INT(val);
6095
6096 if (first)
6097 first = false;
6098 else
6099 fputc(' ', f);
6100
6101 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6102 fputs(strna(name), f);
6103
6104 if (num >= 0) {
6105 errno_name = seccomp_errno_or_action_to_string(num);
6106 if (errno_name)
6107 fprintf(f, ":%s", errno_name);
6108 else
6109 fprintf(f, ":%d", num);
6110 }
6111 }
6112 #endif
6113
6114 fputc('\n', f);
6115 }
6116
6117 if (c->syscall_archs) {
6118 fprintf(f,
6119 "%sSystemCallArchitectures:",
6120 prefix);
6121
6122 #if HAVE_SECCOMP
6123 void *id;
6124 SET_FOREACH(id, c->syscall_archs)
6125 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6126 #endif
6127 fputc('\n', f);
6128 }
6129
6130 if (exec_context_restrict_namespaces_set(c)) {
6131 _cleanup_free_ char *s = NULL;
6132
6133 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6134 if (r >= 0)
6135 fprintf(f, "%sRestrictNamespaces: %s\n",
6136 prefix, strna(s));
6137 }
6138
6139 #if HAVE_LIBBPF
6140 if (exec_context_restrict_filesystems_set(c)) {
6141 char *fs;
6142 SET_FOREACH(fs, c->restrict_filesystems)
6143 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6144 }
6145 #endif
6146
6147 if (c->network_namespace_path)
6148 fprintf(f,
6149 "%sNetworkNamespacePath: %s\n",
6150 prefix, c->network_namespace_path);
6151
6152 if (c->syscall_errno > 0) {
6153 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6154
6155 #if HAVE_SECCOMP
6156 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6157 if (errno_name)
6158 fputs(errno_name, f);
6159 else
6160 fprintf(f, "%d", c->syscall_errno);
6161 #endif
6162 fputc('\n', f);
6163 }
6164
6165 for (size_t i = 0; i < c->n_mount_images; i++) {
6166 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6167 c->mount_images[i].ignore_enoent ? "-": "",
6168 c->mount_images[i].source,
6169 c->mount_images[i].destination);
6170 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6171 fprintf(f, ":%s:%s",
6172 partition_designator_to_string(o->partition_designator),
6173 strempty(o->options));
6174 fprintf(f, "\n");
6175 }
6176
6177 for (size_t i = 0; i < c->n_extension_images; i++) {
6178 fprintf(f, "%sExtensionImages: %s%s", prefix,
6179 c->extension_images[i].ignore_enoent ? "-": "",
6180 c->extension_images[i].source);
6181 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6182 fprintf(f, ":%s:%s",
6183 partition_designator_to_string(o->partition_designator),
6184 strempty(o->options));
6185 fprintf(f, "\n");
6186 }
6187
6188 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6189 }
6190
6191 bool exec_context_maintains_privileges(const ExecContext *c) {
6192 assert(c);
6193
6194 /* Returns true if the process forked off would run under
6195 * an unchanged UID or as root. */
6196
6197 if (!c->user)
6198 return true;
6199
6200 if (streq(c->user, "root") || streq(c->user, "0"))
6201 return true;
6202
6203 return false;
6204 }
6205
6206 int exec_context_get_effective_ioprio(const ExecContext *c) {
6207 int p;
6208
6209 assert(c);
6210
6211 if (c->ioprio_set)
6212 return c->ioprio;
6213
6214 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6215 if (p < 0)
6216 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6217
6218 return ioprio_normalize(p);
6219 }
6220
6221 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6222 assert(c);
6223
6224 /* Explicit setting wins */
6225 if (c->mount_apivfs_set)
6226 return c->mount_apivfs;
6227
6228 /* Default to "yes" if root directory or image are specified */
6229 if (exec_context_with_rootfs(c))
6230 return true;
6231
6232 return false;
6233 }
6234
6235 void exec_context_free_log_extra_fields(ExecContext *c) {
6236 assert(c);
6237
6238 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6239 free(c->log_extra_fields[l].iov_base);
6240 c->log_extra_fields = mfree(c->log_extra_fields);
6241 c->n_log_extra_fields = 0;
6242 }
6243
6244 void exec_context_revert_tty(ExecContext *c) {
6245 _cleanup_close_ int fd = -1;
6246 const char *path;
6247 struct stat st;
6248 int r;
6249
6250 assert(c);
6251
6252 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6253 exec_context_tty_reset(c, NULL);
6254
6255 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6256 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6257 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6258 if (!exec_context_may_touch_tty(c))
6259 return;
6260
6261 path = exec_context_tty_path(c);
6262 if (!path)
6263 return;
6264
6265 fd = open(path, O_PATH|O_CLOEXEC);
6266 if (fd < 0)
6267 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6268 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6269 path);
6270
6271 if (fstat(fd, &st) < 0)
6272 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6273
6274 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6275 * if things are a character device, since a proper check either means we'd have to open the TTY and
6276 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6277 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6278 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6279 if (!S_ISCHR(st.st_mode))
6280 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6281
6282 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6283 if (r < 0)
6284 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6285 }
6286
6287 int exec_context_get_clean_directories(
6288 ExecContext *c,
6289 char **prefix,
6290 ExecCleanMask mask,
6291 char ***ret) {
6292
6293 _cleanup_strv_free_ char **l = NULL;
6294 int r;
6295
6296 assert(c);
6297 assert(prefix);
6298 assert(ret);
6299
6300 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6301 if (!FLAGS_SET(mask, 1U << t))
6302 continue;
6303
6304 if (!prefix[t])
6305 continue;
6306
6307 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6308 char *j;
6309
6310 j = path_join(prefix[t], c->directories[t].items[i].path);
6311 if (!j)
6312 return -ENOMEM;
6313
6314 r = strv_consume(&l, j);
6315 if (r < 0)
6316 return r;
6317
6318 /* Also remove private directories unconditionally. */
6319 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6320 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6321 if (!j)
6322 return -ENOMEM;
6323
6324 r = strv_consume(&l, j);
6325 if (r < 0)
6326 return r;
6327 }
6328
6329 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6330 j = path_join(prefix[t], *symlink);
6331 if (!j)
6332 return -ENOMEM;
6333
6334 r = strv_consume(&l, j);
6335 if (r < 0)
6336 return r;
6337 }
6338 }
6339 }
6340
6341 *ret = TAKE_PTR(l);
6342 return 0;
6343 }
6344
6345 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6346 ExecCleanMask mask = 0;
6347
6348 assert(c);
6349 assert(ret);
6350
6351 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6352 if (c->directories[t].n_items > 0)
6353 mask |= 1U << t;
6354
6355 *ret = mask;
6356 return 0;
6357 }
6358
6359 void exec_status_start(ExecStatus *s, pid_t pid) {
6360 assert(s);
6361
6362 *s = (ExecStatus) {
6363 .pid = pid,
6364 };
6365
6366 dual_timestamp_get(&s->start_timestamp);
6367 }
6368
6369 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6370 assert(s);
6371
6372 if (s->pid != pid)
6373 *s = (ExecStatus) {
6374 .pid = pid,
6375 };
6376
6377 dual_timestamp_get(&s->exit_timestamp);
6378
6379 s->code = code;
6380 s->status = status;
6381
6382 if (context && context->utmp_id)
6383 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6384 }
6385
6386 void exec_status_reset(ExecStatus *s) {
6387 assert(s);
6388
6389 *s = (ExecStatus) {};
6390 }
6391
6392 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6393 assert(s);
6394 assert(f);
6395
6396 if (s->pid <= 0)
6397 return;
6398
6399 prefix = strempty(prefix);
6400
6401 fprintf(f,
6402 "%sPID: "PID_FMT"\n",
6403 prefix, s->pid);
6404
6405 if (dual_timestamp_is_set(&s->start_timestamp))
6406 fprintf(f,
6407 "%sStart Timestamp: %s\n",
6408 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6409
6410 if (dual_timestamp_is_set(&s->exit_timestamp))
6411 fprintf(f,
6412 "%sExit Timestamp: %s\n"
6413 "%sExit Code: %s\n"
6414 "%sExit Status: %i\n",
6415 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6416 prefix, sigchld_code_to_string(s->code),
6417 prefix, s->status);
6418 }
6419
6420 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6421 _cleanup_free_ char *cmd = NULL;
6422 const char *prefix2;
6423
6424 assert(c);
6425 assert(f);
6426
6427 prefix = strempty(prefix);
6428 prefix2 = strjoina(prefix, "\t");
6429
6430 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6431
6432 fprintf(f,
6433 "%sCommand Line: %s\n",
6434 prefix, strnull(cmd));
6435
6436 exec_status_dump(&c->exec_status, f, prefix2);
6437 }
6438
6439 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6440 assert(f);
6441
6442 prefix = strempty(prefix);
6443
6444 LIST_FOREACH(command, i, c)
6445 exec_command_dump(i, f, prefix);
6446 }
6447
6448 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6449 ExecCommand *end;
6450
6451 assert(l);
6452 assert(e);
6453
6454 if (*l) {
6455 /* It's kind of important, that we keep the order here */
6456 LIST_FIND_TAIL(command, *l, end);
6457 LIST_INSERT_AFTER(command, *l, end, e);
6458 } else
6459 *l = e;
6460 }
6461
6462 int exec_command_set(ExecCommand *c, const char *path, ...) {
6463 va_list ap;
6464 char **l, *p;
6465
6466 assert(c);
6467 assert(path);
6468
6469 va_start(ap, path);
6470 l = strv_new_ap(path, ap);
6471 va_end(ap);
6472
6473 if (!l)
6474 return -ENOMEM;
6475
6476 p = strdup(path);
6477 if (!p) {
6478 strv_free(l);
6479 return -ENOMEM;
6480 }
6481
6482 free_and_replace(c->path, p);
6483
6484 return strv_free_and_replace(c->argv, l);
6485 }
6486
6487 int exec_command_append(ExecCommand *c, const char *path, ...) {
6488 _cleanup_strv_free_ char **l = NULL;
6489 va_list ap;
6490 int r;
6491
6492 assert(c);
6493 assert(path);
6494
6495 va_start(ap, path);
6496 l = strv_new_ap(path, ap);
6497 va_end(ap);
6498
6499 if (!l)
6500 return -ENOMEM;
6501
6502 r = strv_extend_strv(&c->argv, l, false);
6503 if (r < 0)
6504 return r;
6505
6506 return 0;
6507 }
6508
6509 static void *remove_tmpdir_thread(void *p) {
6510 _cleanup_free_ char *path = p;
6511
6512 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6513 return NULL;
6514 }
6515
6516 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6517 int r;
6518
6519 if (!rt)
6520 return NULL;
6521
6522 if (rt->manager)
6523 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6524
6525 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6526
6527 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6528 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6529
6530 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6531 if (r < 0)
6532 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6533 else
6534 rt->tmp_dir = NULL;
6535 }
6536
6537 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6538 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6539
6540 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6541 if (r < 0)
6542 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6543 else
6544 rt->var_tmp_dir = NULL;
6545 }
6546
6547 rt->id = mfree(rt->id);
6548 rt->tmp_dir = mfree(rt->tmp_dir);
6549 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6550 safe_close_pair(rt->netns_storage_socket);
6551 safe_close_pair(rt->ipcns_storage_socket);
6552 return mfree(rt);
6553 }
6554
6555 static void exec_runtime_freep(ExecRuntime **rt) {
6556 (void) exec_runtime_free(*rt, false);
6557 }
6558
6559 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6560 _cleanup_free_ char *id_copy = NULL;
6561 ExecRuntime *n;
6562
6563 assert(ret);
6564
6565 id_copy = strdup(id);
6566 if (!id_copy)
6567 return -ENOMEM;
6568
6569 n = new(ExecRuntime, 1);
6570 if (!n)
6571 return -ENOMEM;
6572
6573 *n = (ExecRuntime) {
6574 .id = TAKE_PTR(id_copy),
6575 .netns_storage_socket = { -1, -1 },
6576 .ipcns_storage_socket = { -1, -1 },
6577 };
6578
6579 *ret = n;
6580 return 0;
6581 }
6582
6583 static int exec_runtime_add(
6584 Manager *m,
6585 const char *id,
6586 char **tmp_dir,
6587 char **var_tmp_dir,
6588 int netns_storage_socket[2],
6589 int ipcns_storage_socket[2],
6590 ExecRuntime **ret) {
6591
6592 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6593 int r;
6594
6595 assert(m);
6596 assert(id);
6597
6598 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6599
6600 r = exec_runtime_allocate(&rt, id);
6601 if (r < 0)
6602 return r;
6603
6604 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6605 if (r < 0)
6606 return r;
6607
6608 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6609 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6610 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6611
6612 if (netns_storage_socket) {
6613 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6614 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6615 }
6616
6617 if (ipcns_storage_socket) {
6618 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6619 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6620 }
6621
6622 rt->manager = m;
6623
6624 if (ret)
6625 *ret = rt;
6626 /* do not remove created ExecRuntime object when the operation succeeds. */
6627 TAKE_PTR(rt);
6628 return 0;
6629 }
6630
6631 static int exec_runtime_make(
6632 Manager *m,
6633 const ExecContext *c,
6634 const char *id,
6635 ExecRuntime **ret) {
6636
6637 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6638 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
6639 int r;
6640
6641 assert(m);
6642 assert(c);
6643 assert(id);
6644
6645 /* It is not necessary to create ExecRuntime object. */
6646 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6647 *ret = NULL;
6648 return 0;
6649 }
6650
6651 if (c->private_tmp &&
6652 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6653 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6654 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6655 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6656 if (r < 0)
6657 return r;
6658 }
6659
6660 if (c->private_network || c->network_namespace_path) {
6661 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6662 return -errno;
6663 }
6664
6665 if (c->private_ipc || c->ipc_namespace_path) {
6666 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6667 return -errno;
6668 }
6669
6670 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6671 if (r < 0)
6672 return r;
6673
6674 return 1;
6675 }
6676
6677 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6678 ExecRuntime *rt;
6679 int r;
6680
6681 assert(m);
6682 assert(id);
6683 assert(ret);
6684
6685 rt = hashmap_get(m->exec_runtime_by_id, id);
6686 if (rt)
6687 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6688 goto ref;
6689
6690 if (!create) {
6691 *ret = NULL;
6692 return 0;
6693 }
6694
6695 /* If not found, then create a new object. */
6696 r = exec_runtime_make(m, c, id, &rt);
6697 if (r < 0)
6698 return r;
6699 if (r == 0) {
6700 /* When r == 0, it is not necessary to create ExecRuntime object. */
6701 *ret = NULL;
6702 return 0;
6703 }
6704
6705 ref:
6706 /* increment reference counter. */
6707 rt->n_ref++;
6708 *ret = rt;
6709 return 1;
6710 }
6711
6712 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6713 if (!rt)
6714 return NULL;
6715
6716 assert(rt->n_ref > 0);
6717
6718 rt->n_ref--;
6719 if (rt->n_ref > 0)
6720 return NULL;
6721
6722 return exec_runtime_free(rt, destroy);
6723 }
6724
6725 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6726 ExecRuntime *rt;
6727
6728 assert(m);
6729 assert(f);
6730 assert(fds);
6731
6732 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6733 fprintf(f, "exec-runtime=%s", rt->id);
6734
6735 if (rt->tmp_dir)
6736 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6737
6738 if (rt->var_tmp_dir)
6739 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6740
6741 if (rt->netns_storage_socket[0] >= 0) {
6742 int copy;
6743
6744 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6745 if (copy < 0)
6746 return copy;
6747
6748 fprintf(f, " netns-socket-0=%i", copy);
6749 }
6750
6751 if (rt->netns_storage_socket[1] >= 0) {
6752 int copy;
6753
6754 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6755 if (copy < 0)
6756 return copy;
6757
6758 fprintf(f, " netns-socket-1=%i", copy);
6759 }
6760
6761 if (rt->ipcns_storage_socket[0] >= 0) {
6762 int copy;
6763
6764 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6765 if (copy < 0)
6766 return copy;
6767
6768 fprintf(f, " ipcns-socket-0=%i", copy);
6769 }
6770
6771 if (rt->ipcns_storage_socket[1] >= 0) {
6772 int copy;
6773
6774 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6775 if (copy < 0)
6776 return copy;
6777
6778 fprintf(f, " ipcns-socket-1=%i", copy);
6779 }
6780
6781 fputc('\n', f);
6782 }
6783
6784 return 0;
6785 }
6786
6787 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6788 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6789 ExecRuntime *rt;
6790 int r;
6791
6792 /* This is for the migration from old (v237 or earlier) deserialization text.
6793 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6794 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6795 * so or not from the serialized text, then we always creates a new object owned by this. */
6796
6797 assert(u);
6798 assert(key);
6799 assert(value);
6800
6801 /* Manager manages ExecRuntime objects by the unit id.
6802 * So, we omit the serialized text when the unit does not have id (yet?)... */
6803 if (isempty(u->id)) {
6804 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6805 return 0;
6806 }
6807
6808 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6809 return log_oom();
6810
6811 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6812 if (!rt) {
6813 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6814 return log_oom();
6815
6816 rt = rt_create;
6817 }
6818
6819 if (streq(key, "tmp-dir")) {
6820 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6821 return -ENOMEM;
6822
6823 } else if (streq(key, "var-tmp-dir")) {
6824 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6825 return -ENOMEM;
6826
6827 } else if (streq(key, "netns-socket-0")) {
6828 int fd;
6829
6830 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6831 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6832 return 0;
6833 }
6834
6835 safe_close(rt->netns_storage_socket[0]);
6836 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6837
6838 } else if (streq(key, "netns-socket-1")) {
6839 int fd;
6840
6841 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
6842 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6843 return 0;
6844 }
6845
6846 safe_close(rt->netns_storage_socket[1]);
6847 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6848
6849 } else
6850 return 0;
6851
6852 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6853 if (rt_create) {
6854 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6855 if (r < 0) {
6856 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6857 return 0;
6858 }
6859
6860 rt_create->manager = u->manager;
6861
6862 /* Avoid cleanup */
6863 TAKE_PTR(rt_create);
6864 }
6865
6866 return 1;
6867 }
6868
6869 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6870 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6871 char *id = NULL;
6872 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6873 const char *p, *v = ASSERT_PTR(value);
6874 size_t n;
6875
6876 assert(m);
6877 assert(fds);
6878
6879 n = strcspn(v, " ");
6880 id = strndupa_safe(v, n);
6881 if (v[n] != ' ')
6882 goto finalize;
6883 p = v + n + 1;
6884
6885 v = startswith(p, "tmp-dir=");
6886 if (v) {
6887 n = strcspn(v, " ");
6888 tmp_dir = strndup(v, n);
6889 if (!tmp_dir)
6890 return log_oom();
6891 if (v[n] != ' ')
6892 goto finalize;
6893 p = v + n + 1;
6894 }
6895
6896 v = startswith(p, "var-tmp-dir=");
6897 if (v) {
6898 n = strcspn(v, " ");
6899 var_tmp_dir = strndup(v, n);
6900 if (!var_tmp_dir)
6901 return log_oom();
6902 if (v[n] != ' ')
6903 goto finalize;
6904 p = v + n + 1;
6905 }
6906
6907 v = startswith(p, "netns-socket-0=");
6908 if (v) {
6909 char *buf;
6910
6911 n = strcspn(v, " ");
6912 buf = strndupa_safe(v, n);
6913
6914 r = safe_atoi(buf, &netns_fdpair[0]);
6915 if (r < 0)
6916 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6917 if (!fdset_contains(fds, netns_fdpair[0]))
6918 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6919 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6920 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6921 if (v[n] != ' ')
6922 goto finalize;
6923 p = v + n + 1;
6924 }
6925
6926 v = startswith(p, "netns-socket-1=");
6927 if (v) {
6928 char *buf;
6929
6930 n = strcspn(v, " ");
6931 buf = strndupa_safe(v, n);
6932
6933 r = safe_atoi(buf, &netns_fdpair[1]);
6934 if (r < 0)
6935 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6936 if (!fdset_contains(fds, netns_fdpair[1]))
6937 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6938 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6939 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6940 if (v[n] != ' ')
6941 goto finalize;
6942 p = v + n + 1;
6943 }
6944
6945 v = startswith(p, "ipcns-socket-0=");
6946 if (v) {
6947 char *buf;
6948
6949 n = strcspn(v, " ");
6950 buf = strndupa_safe(v, n);
6951
6952 r = safe_atoi(buf, &ipcns_fdpair[0]);
6953 if (r < 0)
6954 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6955 if (!fdset_contains(fds, ipcns_fdpair[0]))
6956 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6957 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6958 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6959 if (v[n] != ' ')
6960 goto finalize;
6961 p = v + n + 1;
6962 }
6963
6964 v = startswith(p, "ipcns-socket-1=");
6965 if (v) {
6966 char *buf;
6967
6968 n = strcspn(v, " ");
6969 buf = strndupa_safe(v, n);
6970
6971 r = safe_atoi(buf, &ipcns_fdpair[1]);
6972 if (r < 0)
6973 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6974 if (!fdset_contains(fds, ipcns_fdpair[1]))
6975 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6976 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6977 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6978 }
6979
6980 finalize:
6981 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6982 if (r < 0)
6983 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6984 return 0;
6985 }
6986
6987 void exec_runtime_vacuum(Manager *m) {
6988 ExecRuntime *rt;
6989
6990 assert(m);
6991
6992 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6993
6994 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6995 if (rt->n_ref > 0)
6996 continue;
6997
6998 (void) exec_runtime_free(rt, false);
6999 }
7000 }
7001
7002 void exec_params_clear(ExecParameters *p) {
7003 if (!p)
7004 return;
7005
7006 p->environment = strv_free(p->environment);
7007 p->fd_names = strv_free(p->fd_names);
7008 p->fds = mfree(p->fds);
7009 p->exec_fd = safe_close(p->exec_fd);
7010 }
7011
7012 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7013 if (!sc)
7014 return NULL;
7015
7016 free(sc->id);
7017 free(sc->data);
7018 return mfree(sc);
7019 }
7020
7021 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7022 if (!lc)
7023 return NULL;
7024
7025 free(lc->id);
7026 free(lc->path);
7027 return mfree(lc);
7028 }
7029
7030 void exec_directory_done(ExecDirectory *d) {
7031 if (!d)
7032 return;
7033
7034 for (size_t i = 0; i < d->n_items; i++) {
7035 free(d->items[i].path);
7036 strv_free(d->items[i].symlinks);
7037 }
7038
7039 d->items = mfree(d->items);
7040 d->n_items = 0;
7041 d->mode = 0755;
7042 }
7043
7044 int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
7045 _cleanup_strv_free_ char **s = NULL;
7046 _cleanup_free_ char *p = NULL;
7047
7048 assert(d);
7049 assert(n);
7050 assert(path);
7051
7052 p = strdup(path);
7053 if (!p)
7054 return -ENOMEM;
7055
7056 if (symlinks) {
7057 s = strv_copy(symlinks);
7058 if (!s)
7059 return -ENOMEM;
7060 }
7061
7062 if (!GREEDY_REALLOC(*d, *n + 1))
7063 return -ENOMEM;
7064
7065 (*d)[(*n) ++] = (ExecDirectoryItem) {
7066 .path = TAKE_PTR(p),
7067 .symlinks = TAKE_PTR(s),
7068 };
7069
7070 return 0;
7071 }
7072
7073 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7074 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7075
7076 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7077 [EXEC_INPUT_NULL] = "null",
7078 [EXEC_INPUT_TTY] = "tty",
7079 [EXEC_INPUT_TTY_FORCE] = "tty-force",
7080 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7081 [EXEC_INPUT_SOCKET] = "socket",
7082 [EXEC_INPUT_NAMED_FD] = "fd",
7083 [EXEC_INPUT_DATA] = "data",
7084 [EXEC_INPUT_FILE] = "file",
7085 };
7086
7087 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7088
7089 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7090 [EXEC_OUTPUT_INHERIT] = "inherit",
7091 [EXEC_OUTPUT_NULL] = "null",
7092 [EXEC_OUTPUT_TTY] = "tty",
7093 [EXEC_OUTPUT_KMSG] = "kmsg",
7094 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7095 [EXEC_OUTPUT_JOURNAL] = "journal",
7096 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7097 [EXEC_OUTPUT_SOCKET] = "socket",
7098 [EXEC_OUTPUT_NAMED_FD] = "fd",
7099 [EXEC_OUTPUT_FILE] = "file",
7100 [EXEC_OUTPUT_FILE_APPEND] = "append",
7101 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7102 };
7103
7104 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7105
7106 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7107 [EXEC_UTMP_INIT] = "init",
7108 [EXEC_UTMP_LOGIN] = "login",
7109 [EXEC_UTMP_USER] = "user",
7110 };
7111
7112 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7113
7114 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7115 [EXEC_PRESERVE_NO] = "no",
7116 [EXEC_PRESERVE_YES] = "yes",
7117 [EXEC_PRESERVE_RESTART] = "restart",
7118 };
7119
7120 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7121
7122 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7123 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7124 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7125 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7126 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7127 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7128 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7129 };
7130
7131 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7132
7133 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7134 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7135 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7136 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7137 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7138 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7139 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7140 };
7141
7142 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7143
7144 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7145 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7146 * directories, specifically .timer units with their timestamp touch file. */
7147 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7148 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7149 [EXEC_DIRECTORY_STATE] = "state",
7150 [EXEC_DIRECTORY_CACHE] = "cache",
7151 [EXEC_DIRECTORY_LOGS] = "logs",
7152 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7153 };
7154
7155 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7156
7157 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7158 * the service payload in. */
7159 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7160 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7161 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7162 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7163 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7164 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7165 };
7166
7167 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7168
7169 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7170 [EXEC_KEYRING_INHERIT] = "inherit",
7171 [EXEC_KEYRING_PRIVATE] = "private",
7172 [EXEC_KEYRING_SHARED] = "shared",
7173 };
7174
7175 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);