]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: split out functions and definitions from execute.[ch] to credential.[ch]
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/file.h>
8 #include <sys/ioctl.h>
9 #include <sys/mman.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_APPARMOR
29 #include <sys/apparmor.h>
30 #endif
31
32 #include "sd-messages.h"
33
34 #include "af-list.h"
35 #include "alloc-util.h"
36 #if HAVE_APPARMOR
37 #include "apparmor-util.h"
38 #endif
39 #include "argv-util.h"
40 #include "async.h"
41 #include "barrier.h"
42 #include "bpf-lsm.h"
43 #include "btrfs-util.h"
44 #include "cap-list.h"
45 #include "capability-util.h"
46 #include "chattr-util.h"
47 #include "cgroup-setup.h"
48 #include "chase.h"
49 #include "chown-recursive.h"
50 #include "constants.h"
51 #include "cpu-set-util.h"
52 #include "credential.h"
53 #include "data-fd-util.h"
54 #include "env-file.h"
55 #include "env-util.h"
56 #include "errno-list.h"
57 #include "escape.h"
58 #include "execute.h"
59 #include "exit-status.h"
60 #include "fd-util.h"
61 #include "format-util.h"
62 #include "glob-util.h"
63 #include "hexdecoct.h"
64 #include "io-util.h"
65 #include "ioprio-util.h"
66 #include "lock-util.h"
67 #include "log.h"
68 #include "macro.h"
69 #include "manager.h"
70 #include "manager-dump.h"
71 #include "memory-util.h"
72 #include "missing_fs.h"
73 #include "missing_ioprio.h"
74 #include "missing_prctl.h"
75 #include "mkdir-label.h"
76 #include "namespace.h"
77 #include "parse-util.h"
78 #include "path-util.h"
79 #include "proc-cmdline.h"
80 #include "process-util.h"
81 #include "psi-util.h"
82 #include "rlimit-util.h"
83 #include "rm-rf.h"
84 #include "seccomp-util.h"
85 #include "securebits-util.h"
86 #include "selinux-util.h"
87 #include "signal-util.h"
88 #include "smack-util.h"
89 #include "socket-util.h"
90 #include "sort-util.h"
91 #include "special.h"
92 #include "stat-util.h"
93 #include "string-table.h"
94 #include "string-util.h"
95 #include "strv.h"
96 #include "syslog-util.h"
97 #include "terminal-util.h"
98 #include "tmpfile-util.h"
99 #include "umask-util.h"
100 #include "unit-serialize.h"
101 #include "user-util.h"
102 #include "utmp-wtmp.h"
103
104 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
105 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
106
107 #define SNDBUF_SIZE (8*1024*1024)
108
/* Relocates the passed file descriptors so that fds[i] ends up being fd number i + 3, i.e. the array
 * occupies the contiguous range starting right after stdin/stdout/stderr.
 *
 * The array is updated in place: each entry is replaced by the number the fd was moved to, and the old
 * fd is closed. Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first_pending = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Keep making passes until one completes without any fd landing on the wrong number. */
        while (first_pending >= 0) {
                int begin = first_pending;

                first_pending = -1;

                for (int idx = begin; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dupfd;

                        /* Nothing to do if the fd already sits where it belongs */
                        if (fds[idx] == target)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= target, which is not necessarily
                         * target itself */
                        dupfd = fcntl(fds[idx], F_DUPFD, target);
                        if (dupfd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dupfd;

                        /* The slot we wanted was still taken. Remember the first such index, so that
                         * the next pass restarts from there. */
                        if (dupfd != target && first_pending < 0)
                                first_pending = idx;
                }
        }

        return 0;
}
148
/* Prepares the fds that are about to be passed to the child: FD_CLOEXEC is cleared on all n_fds
 * entries, and O_NONBLOCK is set or cleared (according to 'nonblock') on the first n_socket_fds entries
 * only, since that flag is only relevant for socket activation.
 *
 * Returns 0 on success, or the negative error of the first helper that failed. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int r;

                /* Only the leading socket fds get their blocking mode adjusted */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally: these fds are supposed to survive into our
                 * children. */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
185 static const char *exec_context_tty_path(const ExecContext *context) {
186 assert(context);
187
188 if (context->stdio_as_fds)
189 return NULL;
190
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195 }
196
197 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
198 unsigned rows, cols;
199 const char *tty;
200
201 assert(context);
202 assert(ret_rows);
203 assert(ret_cols);
204
205 rows = context->tty_rows;
206 cols = context->tty_cols;
207
208 tty = exec_context_tty_path(context);
209 if (tty)
210 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
211
212 *ret_rows = rows;
213 *ret_cols = cols;
214
215 return 0;
216 }
217
218 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
219 _cleanup_close_ int fd = -EBADF;
220 const char *path = exec_context_tty_path(ASSERT_PTR(context));
221
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
225 */
226
227 if (p && p->stdin_fd >= 0) {
228 fd = xopenat_lock(p->stdin_fd, NULL,
229 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
230 if (fd < 0)
231 return;
232 } else if (path) {
233 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
234 if (fd < 0)
235 return;
236
237 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
238 return;
239 } else
240 return; /* nothing to do */
241
242 if (context->tty_vhangup)
243 (void) terminal_vhangup_fd(fd);
244
245 if (context->tty_reset)
246 (void) reset_terminal_fd(fd, true);
247
248 if (p && p->stdin_fd >= 0) {
249 unsigned rows = context->tty_rows, cols = context->tty_cols;
250
251 (void) exec_context_tty_size(context, &rows, &cols);
252 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
253 }
254
255 if (context->tty_vt_disallocate && path)
256 (void) vt_disallocate(path);
257 }
258
259 static bool is_terminal_input(ExecInput i) {
260 return IN_SET(i,
261 EXEC_INPUT_TTY,
262 EXEC_INPUT_TTY_FORCE,
263 EXEC_INPUT_TTY_FAIL);
264 }
265
266 static bool is_terminal_output(ExecOutput o) {
267 return IN_SET(o,
268 EXEC_OUTPUT_TTY,
269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
271 }
272
273 static bool is_kmsg_output(ExecOutput o) {
274 return IN_SET(o,
275 EXEC_OUTPUT_KMSG,
276 EXEC_OUTPUT_KMSG_AND_CONSOLE);
277 }
278
279 static bool exec_context_needs_term(const ExecContext *c) {
280 assert(c);
281
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
283
284 if (is_terminal_input(c->std_input))
285 return true;
286
287 if (is_terminal_output(c->std_output))
288 return true;
289
290 if (is_terminal_output(c->std_error))
291 return true;
292
293 return !!c->tty_path;
294 }
295
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the result to fd number 'nfd'.
 * Returns whatever move_fd() returns, or a negative errno if the open fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
307
308 static int connect_journal_socket(
309 int fd,
310 const char *log_namespace,
311 uid_t uid,
312 gid_t gid) {
313
314 uid_t olduid = UID_INVALID;
315 gid_t oldgid = GID_INVALID;
316 const char *j;
317 int r;
318
319 j = log_namespace ?
320 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
321 "/run/systemd/journal/stdout";
322
323 if (gid_is_valid(gid)) {
324 oldgid = getgid();
325
326 if (setegid(gid) < 0)
327 return -errno;
328 }
329
330 if (uid_is_valid(uid)) {
331 olduid = getuid();
332
333 if (seteuid(uid) < 0) {
334 r = -errno;
335 goto restore_gid;
336 }
337 }
338
339 r = connect_unix_path(fd, AT_FDCWD, j);
340
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
343
344 if (uid_is_valid(uid))
345 (void) seteuid(olduid);
346
347 restore_gid:
348 if (gid_is_valid(gid))
349 (void) setegid(oldgid);
350
351 return r;
352 }
353
354 static int connect_logger_as(
355 const Unit *unit,
356 const ExecContext *context,
357 const ExecParameters *params,
358 ExecOutput output,
359 const char *ident,
360 int nfd,
361 uid_t uid,
362 gid_t gid) {
363
364 _cleanup_close_ int fd = -EBADF;
365 int r;
366
367 assert(context);
368 assert(params);
369 assert(output < _EXEC_OUTPUT_MAX);
370 assert(ident);
371 assert(nfd >= 0);
372
373 fd = socket(AF_UNIX, SOCK_STREAM, 0);
374 if (fd < 0)
375 return -errno;
376
377 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
378 if (r < 0)
379 return r;
380
381 if (shutdown(fd, SHUT_RD) < 0)
382 return -errno;
383
384 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
385
386 if (dprintf(fd,
387 "%s\n"
388 "%s\n"
389 "%i\n"
390 "%i\n"
391 "%i\n"
392 "%i\n"
393 "%i\n",
394 context->syslog_identifier ?: ident,
395 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
396 context->syslog_priority,
397 !!context->syslog_level_prefix,
398 false,
399 is_kmsg_output(output),
400 is_terminal_output(output)) < 0)
401 return -errno;
402
403 return move_fd(TAKE_FD(fd), nfd, false);
404 }
405
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY, so that it does not become our
 * controlling terminal) and moves it to fd number 'nfd'. Returns the result of move_fd(), or the
 * negative error open_terminal() reported. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
418
419 static int acquire_path(const char *path, int flags, mode_t mode) {
420 _cleanup_close_ int fd = -EBADF;
421 int r;
422
423 assert(path);
424
425 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
426 flags |= O_CREAT;
427
428 fd = open(path, flags|O_NOCTTY, mode);
429 if (fd >= 0)
430 return TAKE_FD(fd);
431
432 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
433 return -errno;
434
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
436
437 fd = socket(AF_UNIX, SOCK_STREAM, 0);
438 if (fd < 0)
439 return -errno;
440
441 r = connect_unix_path(fd, AT_FDCWD, path);
442 if (IN_SET(r, -ENOTSOCK, -EINVAL))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
445 return -ENXIO;
446 if (r < 0)
447 return r;
448
449 if ((flags & O_ACCMODE) == O_RDONLY)
450 r = shutdown(fd, SHUT_WR);
451 else if ((flags & O_ACCMODE) == O_WRONLY)
452 r = shutdown(fd, SHUT_RD);
453 else
454 r = 0;
455 if (r < 0)
456 return -errno;
457
458 return TAKE_FD(fd);
459 }
460
461 static int fixup_input(
462 const ExecContext *context,
463 int socket_fd,
464 bool apply_tty_stdin) {
465
466 ExecInput std_input;
467
468 assert(context);
469
470 std_input = context->std_input;
471
472 if (is_terminal_input(std_input) && !apply_tty_stdin)
473 return EXEC_INPUT_NULL;
474
475 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
476 return EXEC_INPUT_NULL;
477
478 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
479 return EXEC_INPUT_NULL;
480
481 return std_input;
482 }
483
484 static int fixup_output(ExecOutput output, int socket_fd) {
485
486 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
487 return EXEC_OUTPUT_INHERIT;
488
489 return output;
490 }
491
492 static int setup_input(
493 const ExecContext *context,
494 const ExecParameters *params,
495 int socket_fd,
496 const int named_iofds[static 3]) {
497
498 ExecInput i;
499 int r;
500
501 assert(context);
502 assert(params);
503 assert(named_iofds);
504
505 if (params->stdin_fd >= 0) {
506 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
507 return -errno;
508
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
510 if (isatty(STDIN_FILENO)) {
511 unsigned rows = context->tty_rows, cols = context->tty_cols;
512
513 (void) exec_context_tty_size(context, &rows, &cols);
514 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
515 (void) reset_terminal_fd(STDIN_FILENO, true);
516 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
517 }
518
519 return STDIN_FILENO;
520 }
521
522 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
523
524 switch (i) {
525
526 case EXEC_INPUT_NULL:
527 return open_null_as(O_RDONLY, STDIN_FILENO);
528
529 case EXEC_INPUT_TTY:
530 case EXEC_INPUT_TTY_FORCE:
531 case EXEC_INPUT_TTY_FAIL: {
532 unsigned rows, cols;
533 int fd;
534
535 fd = acquire_terminal(exec_context_tty_path(context),
536 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
537 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
538 ACQUIRE_TERMINAL_WAIT,
539 USEC_INFINITY);
540 if (fd < 0)
541 return fd;
542
543 r = exec_context_tty_size(context, &rows, &cols);
544 if (r < 0)
545 return r;
546
547 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
548 if (r < 0)
549 return r;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
554 case EXEC_INPUT_SOCKET:
555 assert(socket_fd >= 0);
556
557 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
558
559 case EXEC_INPUT_NAMED_FD:
560 assert(named_iofds[STDIN_FILENO] >= 0);
561
562 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
563 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
564
565 case EXEC_INPUT_DATA: {
566 int fd;
567
568 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
569 if (fd < 0)
570 return fd;
571
572 return move_fd(fd, STDIN_FILENO, false);
573 }
574
575 case EXEC_INPUT_FILE: {
576 bool rw;
577 int fd;
578
579 assert(context->stdio_file[STDIN_FILENO]);
580
581 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
582 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
583
584 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
585 if (fd < 0)
586 return fd;
587
588 return move_fd(fd, STDIN_FILENO, false);
589 }
590
591 default:
592 assert_not_reached();
593 }
594 }
595
596 static bool can_inherit_stderr_from_stdout(
597 const ExecContext *context,
598 ExecOutput o,
599 ExecOutput e) {
600
601 assert(context);
602
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
604 * stderr fd */
605
606 if (e == EXEC_OUTPUT_INHERIT)
607 return true;
608 if (e != o)
609 return false;
610
611 if (e == EXEC_OUTPUT_NAMED_FD)
612 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
613
614 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
615 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
616
617 return true;
618 }
619
620 static int setup_output(
621 const Unit *unit,
622 const ExecContext *context,
623 const ExecParameters *params,
624 int fileno,
625 int socket_fd,
626 const int named_iofds[static 3],
627 const char *ident,
628 uid_t uid,
629 gid_t gid,
630 dev_t *journal_stream_dev,
631 ino_t *journal_stream_ino) {
632
633 ExecOutput o;
634 ExecInput i;
635 int r;
636
637 assert(unit);
638 assert(context);
639 assert(params);
640 assert(ident);
641 assert(journal_stream_dev);
642 assert(journal_stream_ino);
643
644 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
645
646 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
647 return -errno;
648
649 return STDOUT_FILENO;
650 }
651
652 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
653 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
654 return -errno;
655
656 return STDERR_FILENO;
657 }
658
659 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
660 o = fixup_output(context->std_output, socket_fd);
661
662 if (fileno == STDERR_FILENO) {
663 ExecOutput e;
664 e = fixup_output(context->std_error, socket_fd);
665
666 /* This expects the input and output are already set up */
667
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e == EXEC_OUTPUT_INHERIT &&
671 o == EXEC_OUTPUT_INHERIT &&
672 i == EXEC_INPUT_NULL &&
673 !is_terminal_input(context->std_input) &&
674 getppid() != 1)
675 return fileno;
676
677 /* Duplicate from stdout if possible */
678 if (can_inherit_stderr_from_stdout(context, o, e))
679 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
680
681 o = e;
682
683 } else if (o == EXEC_OUTPUT_INHERIT) {
684 /* If input got downgraded, inherit the original value */
685 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
686 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
687
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
690 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
691
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
693 if (getppid() != 1)
694 return fileno;
695
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY, fileno);
698 }
699
700 switch (o) {
701
702 case EXEC_OUTPUT_NULL:
703 return open_null_as(O_WRONLY, fileno);
704
705 case EXEC_OUTPUT_TTY:
706 if (is_terminal_input(i))
707 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
708
709 /* We don't reset the terminal if this is just about output */
710 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
711
712 case EXEC_OUTPUT_KMSG:
713 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
714 case EXEC_OUTPUT_JOURNAL:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
716 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
717 if (r < 0) {
718 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno == STDOUT_FILENO ? "stdout" : "stderr");
720 r = open_null_as(O_WRONLY, fileno);
721 } else {
722 struct stat st;
723
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
726 * services to detect whether they are connected to the journal or not.
727 *
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
730
731 if (fstat(fileno, &st) >= 0 &&
732 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
733 *journal_stream_dev = st.st_dev;
734 *journal_stream_ino = st.st_ino;
735 }
736 }
737 return r;
738
739 case EXEC_OUTPUT_SOCKET:
740 assert(socket_fd >= 0);
741
742 return RET_NERRNO(dup2(socket_fd, fileno));
743
744 case EXEC_OUTPUT_NAMED_FD:
745 assert(named_iofds[fileno] >= 0);
746
747 (void) fd_nonblock(named_iofds[fileno], false);
748 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
749
750 case EXEC_OUTPUT_FILE:
751 case EXEC_OUTPUT_FILE_APPEND:
752 case EXEC_OUTPUT_FILE_TRUNCATE: {
753 bool rw;
754 int fd, flags;
755
756 assert(context->stdio_file[fileno]);
757
758 rw = context->std_input == EXEC_INPUT_FILE &&
759 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
760
761 if (rw)
762 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
763
764 flags = O_WRONLY;
765 if (o == EXEC_OUTPUT_FILE_APPEND)
766 flags |= O_APPEND;
767 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
768 flags |= O_TRUNC;
769
770 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
771 if (fd < 0)
772 return fd;
773
774 return move_fd(fd, fileno, 0);
775 }
776
777 default:
778 assert_not_reached();
779 }
780 }
781
782 static int chown_terminal(int fd, uid_t uid) {
783 int r;
784
785 assert(fd >= 0);
786
787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
788 if (isatty(fd) < 1) {
789 if (IN_SET(errno, EINVAL, ENOTTY))
790 return 0; /* not a tty */
791
792 return -errno;
793 }
794
795 /* This might fail. What matters are the results. */
796 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
797 if (r < 0)
798 return r;
799
800 return 1;
801 }
802
803 static int setup_confirm_stdio(
804 const ExecContext *context,
805 const char *vc,
806 int *ret_saved_stdin,
807 int *ret_saved_stdout) {
808
809 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
810 unsigned rows, cols;
811 int r;
812
813 assert(ret_saved_stdin);
814 assert(ret_saved_stdout);
815
816 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
817 if (saved_stdin < 0)
818 return -errno;
819
820 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
821 if (saved_stdout < 0)
822 return -errno;
823
824 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
825 if (fd < 0)
826 return fd;
827
828 r = chown_terminal(fd, getuid());
829 if (r < 0)
830 return r;
831
832 r = reset_terminal_fd(fd, true);
833 if (r < 0)
834 return r;
835
836 r = exec_context_tty_size(context, &rows, &cols);
837 if (r < 0)
838 return r;
839
840 r = terminal_set_size_fd(fd, vc, rows, cols);
841 if (r < 0)
842 return r;
843
844 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
845 TAKE_FD(fd);
846 if (r < 0)
847 return r;
848
849 *ret_saved_stdin = TAKE_FD(saved_stdin);
850 *ret_saved_stdout = TAKE_FD(saved_stdout);
851 return 0;
852 }
853
854 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
855 assert(err < 0);
856
857 if (err == -ETIMEDOUT)
858 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
859 else {
860 errno = -err;
861 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
862 }
863 }
864
865 static void write_confirm_error(int err, const char *vc, const Unit *u) {
866 _cleanup_close_ int fd = -EBADF;
867
868 assert(vc);
869
870 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
871 if (fd < 0)
872 return;
873
874 write_confirm_error_fd(err, fd, u);
875 }
876
/* Undoes setup_confirm_stdio(): releases the terminal, moves the saved fds back onto stdin/stdout and
 * closes the saved copies. Both saved fds are closed and invalidated in any case.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
898
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
904
905 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
906 int saved_stdout = -1, saved_stdin = -1, r;
907 _cleanup_free_ char *e = NULL;
908 char c;
909
910 /* For any internal errors, assume a positive response. */
911 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
912 if (r < 0) {
913 write_confirm_error(r, vc, u);
914 return CONFIRM_EXECUTE;
915 }
916
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u->manager)) {
919 r = 1;
920 goto restore_stdio;
921 }
922
923 e = ellipsize(cmdline, 60, 100);
924 if (!e) {
925 log_oom();
926 r = CONFIRM_EXECUTE;
927 goto restore_stdio;
928 }
929
930 for (;;) {
931 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
932 if (r < 0) {
933 write_confirm_error_fd(r, STDOUT_FILENO, u);
934 r = CONFIRM_EXECUTE;
935 goto restore_stdio;
936 }
937
938 switch (c) {
939 case 'c':
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
942 r = 1;
943 break;
944 case 'D':
945 unit_dump(u, stdout, " ");
946 continue; /* ask again */
947 case 'f':
948 printf("Failing execution.\n");
949 r = CONFIRM_PRETEND_FAILURE;
950 break;
951 case 'h':
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
954 " f - fail, don't execute the command and pretend it failed\n"
955 " h - help\n"
956 " i - info, show a short summary of the unit\n"
957 " j - jobs, show jobs that are in progress\n"
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
960 continue; /* ask again */
961 case 'i':
962 printf(" Description: %s\n"
963 " Unit: %s\n"
964 " Command: %s\n",
965 u->id, u->description, cmdline);
966 continue; /* ask again */
967 case 'j':
968 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
969 continue; /* ask again */
970 case 'n':
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
974 case 's':
975 printf("Skipping execution.\n");
976 r = CONFIRM_PRETEND_SUCCESS;
977 break;
978 case 'y':
979 r = CONFIRM_EXECUTE;
980 break;
981 default:
982 assert_not_reached();
983 }
984 break;
985 }
986
987 restore_stdio:
988 restore_confirm_stdio(&saved_stdin, &saved_stdout);
989 return r;
990 }
991
992 static int get_fixed_user(const ExecContext *c, const char **user,
993 uid_t *uid, gid_t *gid,
994 const char **home, const char **shell) {
995 int r;
996 const char *name;
997
998 assert(c);
999
1000 if (!c->user)
1001 return 0;
1002
1003 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1004 * (i.e. are "/" or "/bin/nologin"). */
1005
1006 name = c->user;
1007 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1008 if (r < 0)
1009 return r;
1010
1011 *user = name;
1012 return 0;
1013 }
1014
1015 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1016 int r;
1017 const char *name;
1018
1019 assert(c);
1020
1021 if (!c->group)
1022 return 0;
1023
1024 name = c->group;
1025 r = get_group_creds(&name, gid, 0);
1026 if (r < 0)
1027 return r;
1028
1029 *group = name;
1030 return 0;
1031 }
1032
1033 static int get_supplementary_groups(const ExecContext *c, const char *user,
1034 const char *group, gid_t gid,
1035 gid_t **supplementary_gids, int *ngids) {
1036 int r, k = 0;
1037 int ngroups_max;
1038 bool keep_groups = false;
1039 gid_t *groups = NULL;
1040 _cleanup_free_ gid_t *l_gids = NULL;
1041
1042 assert(c);
1043
1044 /*
1045 * If user is given, then lookup GID and supplementary groups list.
1046 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1047 * here and as early as possible so we keep the list of supplementary
1048 * groups of the caller.
1049 */
1050 if (user && gid_is_valid(gid) && gid != 0) {
1051 /* First step, initialize groups from /etc/groups */
1052 if (initgroups(user, gid) < 0)
1053 return -errno;
1054
1055 keep_groups = true;
1056 }
1057
1058 if (strv_isempty(c->supplementary_groups))
1059 return 0;
1060
1061 /*
1062 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1063 * be positive, otherwise fail.
1064 */
1065 errno = 0;
1066 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1067 if (ngroups_max <= 0)
1068 return errno_or_else(EOPNOTSUPP);
1069
1070 l_gids = new(gid_t, ngroups_max);
1071 if (!l_gids)
1072 return -ENOMEM;
1073
1074 if (keep_groups) {
1075 /*
1076 * Lookup the list of groups that the user belongs to, we
1077 * avoid NSS lookups here too for gid=0.
1078 */
1079 k = ngroups_max;
1080 if (getgrouplist(user, gid, l_gids, &k) < 0)
1081 return -EINVAL;
1082 } else
1083 k = 0;
1084
1085 STRV_FOREACH(i, c->supplementary_groups) {
1086 const char *g;
1087
1088 if (k >= ngroups_max)
1089 return -E2BIG;
1090
1091 g = *i;
1092 r = get_group_creds(&g, l_gids+k, 0);
1093 if (r < 0)
1094 return r;
1095
1096 k++;
1097 }
1098
1099 /*
1100 * Sets ngids to zero to drop all supplementary groups, happens
1101 * when we are under root and SupplementaryGroups= is empty.
1102 */
1103 if (k == 0) {
1104 *ngids = 0;
1105 return 0;
1106 }
1107
1108 /* Otherwise get the final list of supplementary groups */
1109 groups = memdup(l_gids, sizeof(gid_t) * k);
1110 if (!groups)
1111 return -ENOMEM;
1112
1113 *supplementary_gids = groups;
1114 *ngids = k;
1115
1116 groups = NULL;
1117
1118 return 0;
1119 }
1120
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * 'ngids' is positive), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1139
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and a negative
 * errno if querying or setting them fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int cur;

        cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        wanted = ((unsigned) cur & ~mask) | bits;

        /* Avoid the (privileged) set operation if nothing would change */
        if (wanted == (unsigned) cur)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1158
1159 static int enforce_user(
1160 const ExecContext *context,
1161 uid_t uid,
1162 uint64_t capability_ambient_set) {
1163 assert(context);
1164 int r;
1165
1166 if (!uid_is_valid(uid))
1167 return 0;
1168
1169 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1170 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1171 * case. */
1172
1173 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1174
1175 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1176 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1177 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1178 if (r < 0)
1179 return r;
1180 }
1181
1182 /* Second step: actually set the uids */
1183 if (setresuid(uid, uid, uid) < 0)
1184 return -errno;
1185
1186 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1187 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1188 * outside of this call. */
1189 return 0;
1190 }
1191
#if HAVE_PAM

/* PAM conversation callback, installed via the pam_conv structure in setup_pam(). All arguments are
 * ignored: conversations are not supported, hence every request is refused with PAM_CONV_ERR. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1206
/* Opens a PAM session for service 'name' and user 'user', then forks off a "(sd-pam)" helper process
 * that deletes the PAM credentials (and closes the session if our process is gone) once it is told to
 * terminate.
 *
 * name, user: passed to pam_start()
 * uid, gid:   identity the helper process tries to switch to
 * tty:        value for PAM_TTY; if NULL, the TTY name of stdin is used when one can be determined
 * env:        entries are exported to PAM; replaced by PAM's environment list on success
 * fds, n_fds: file descriptors that are closed inside the helper process
 *
 * Returns the result of replacing *env on success, -EPERM on PAM-level failures, and another negative
 * errno-style value if a non-PAM step failed. When built without PAM support this is a no-op
 * returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **pam_env = NULL;
        pam_handle_t *handle = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, r;
        bool session_opened = false;
        pid_t pam_pid = 0, parent_pid;
        int pam_flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is initialized in this (the parent) process, which afterwards forks. The forked child
         * lingers until it is killed, either through PR_SET_PDEATHSIG or by systemd's cgroup logic,
         * and then takes the PAM session down again. The parent is the one that exec()s the actual
         * daemon, so that the daemon's main PID stays the PID that was originally fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                pam_flags = PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *tty_name = NULL;

                /* No TTY was named explicitly, but stdin might still be one: if so, use its name. */

                if (getttyname_malloc(STDIN_FILENO, &tty_name) >= 0)
                        tty = strjoina("/dev/", tty_name);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the current environment block to PAM */
        STRV_FOREACH(entry, *env) {
                pam_code = pam_putenv(handle, *entry);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is logged but not treated as fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | pam_flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_opened = true;

        pam_env = pam_getenvlist(handle);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM before forking, so that the signal cannot get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, exit_code = EXIT_PAM;

                /* This is the helper: its job is to undo the PAM setup on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Don't hold on to the fds handed to us. Anything else still open here is assumed to
                 * have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges: none are needed for pam_close_session(), and doing so makes
                 * PR_SET_PDEATHSIG work in most cases. Failures are only warned about, but then
                 * sd-pam threads should be expected not to exit normally. */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Ask for SIGTERM when the parent dies. This only works if setresuid() above
                 * succeeded, since the kernel does not let unprivileged parents kill privileged
                 * children this way. The control group kill logic takes care of the remaining
                 * cases. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Report to the parent that setup (in particular the privilege drop) is complete, as
                 * unit setup could otherwise race against our setresuid(2) call.
                 *
                 * A parent that aborted is detected below, hence the result is ignored here. */
                (void) barrier_place(&barrier);

                /* Only wait for the signal if the parent is still around */
                if (getppid() == parent_pid) {
                        sigset_t wait_set;

                        assert_se(sigemptyset(&wait_set) >= 0);
                        assert_se(sigaddset(&wait_set, SIGTERM) >= 0);

                        /* Retry on EINTR, give up on any other failure */
                        while (sigwait(&wait_set, &sig) < 0)
                                if (errno != EINTR)
                                        goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | pam_flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* The session is only ended if the parent is gone */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, pam_flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                exit_code = 0;

        child_finish:
                /* NB: in child processes pam_end() should be called with PAM_DATA_SILENT set, to let
                 * the modules know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | pam_flags | PAM_DATA_SILENT);
                _exit(exit_code);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The helper was forked off successfully and does all the cleanup, hence drop our reference
         * to the handle. */
        handle = NULL;

        /* Restore the signal mask (i.e. unblock SIGTERM) in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* PAM modules might have opened the log; we don't want that fd to stay around, hence close
         * it explicitly. */
        closelog();

        /* Wait synchronously until the helper finished its initialization. There is no way to
         * recover from an error here, but at least complain loudly. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code == PAM_SUCCESS)
                log_error_errno(r, "PAM failed: %m");
        else {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        }

        if (handle) {
                if (session_opened)
                        pam_code = pam_close_session(handle, pam_flags);

                (void) pam_end(handle, pam_code | pam_flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1423
1424 static void rename_process_from_path(const char *path) {
1425 _cleanup_free_ char *buf = NULL;
1426 const char *p;
1427
1428 assert(path);
1429
1430 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1431 * /bin/ps */
1432
1433 if (path_extract_filename(path, &buf) < 0) {
1434 rename_process("(...)");
1435 return;
1436 }
1437
1438 size_t l = strlen(buf);
1439 if (l > 8) {
1440 /* The end of the process name is usually more interesting, since the first bit might just be
1441 * "systemd-" */
1442 p = buf + l - 8;
1443 l = 8;
1444 } else
1445 p = buf;
1446
1447 char process_name[11];
1448 process_name[0] = '(';
1449 memcpy(process_name+1, p, l);
1450 process_name[1+l] = ')';
1451 process_name[1+l+1] = 0;
1452
1453 rename_process(process_name);
1454 }
1455
1456 static bool context_has_address_families(const ExecContext *c) {
1457 assert(c);
1458
1459 return c->address_families_allow_list ||
1460 !set_isempty(c->address_families);
1461 }
1462
1463 static bool context_has_syscall_filters(const ExecContext *c) {
1464 assert(c);
1465
1466 return c->syscall_allow_list ||
1467 !hashmap_isempty(c->syscall_filter);
1468 }
1469
1470 static bool context_has_syscall_logs(const ExecContext *c) {
1471 assert(c);
1472
1473 return c->syscall_log_allow_list ||
1474 !hashmap_isempty(c->syscall_log);
1475 }
1476
1477 static bool context_has_no_new_privileges(const ExecContext *c) {
1478 assert(c);
1479
1480 if (c->no_new_privileges)
1481 return true;
1482
1483 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1484 return false;
1485
1486 /* We need NNP if we have any form of seccomp and are unprivileged */
1487 return c->lock_personality ||
1488 c->memory_deny_write_execute ||
1489 c->private_devices ||
1490 c->protect_clock ||
1491 c->protect_hostname ||
1492 c->protect_kernel_tunables ||
1493 c->protect_kernel_modules ||
1494 c->protect_kernel_logs ||
1495 context_has_address_families(c) ||
1496 exec_context_restrict_namespaces_set(c) ||
1497 c->restrict_realtime ||
1498 c->restrict_suid_sgid ||
1499 !set_isempty(c->syscall_archs) ||
1500 context_has_syscall_filters(c) ||
1501 context_has_syscall_logs(c);
1502 }
1503
1504 #if HAVE_SECCOMP
1505
1506 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1507
1508 if (is_seccomp_available())
1509 return false;
1510
1511 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1512 return true;
1513 }
1514
1515 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1516 uint32_t negative_action, default_action, action;
1517 int r;
1518
1519 assert(u);
1520 assert(c);
1521
1522 if (!context_has_syscall_filters(c))
1523 return 0;
1524
1525 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1526 return 0;
1527
1528 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1529
1530 if (c->syscall_allow_list) {
1531 default_action = negative_action;
1532 action = SCMP_ACT_ALLOW;
1533 } else {
1534 default_action = SCMP_ACT_ALLOW;
1535 action = negative_action;
1536 }
1537
1538 if (needs_ambient_hack) {
1539 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1540 if (r < 0)
1541 return r;
1542 }
1543
1544 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1545 }
1546
1547 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1548 #ifdef SCMP_ACT_LOG
1549 uint32_t default_action, action;
1550 #endif
1551
1552 assert(u);
1553 assert(c);
1554
1555 if (!context_has_syscall_logs(c))
1556 return 0;
1557
1558 #ifdef SCMP_ACT_LOG
1559 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1560 return 0;
1561
1562 if (c->syscall_log_allow_list) {
1563 /* Log nothing but the ones listed */
1564 default_action = SCMP_ACT_ALLOW;
1565 action = SCMP_ACT_LOG;
1566 } else {
1567 /* Log everything but the ones listed */
1568 default_action = SCMP_ACT_LOG;
1569 action = SCMP_ACT_ALLOW;
1570 }
1571
1572 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1573 #else
1574 /* old libseccomp */
1575 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1576 return 0;
1577 #endif
1578 }
1579
1580 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1581 assert(u);
1582 assert(c);
1583
1584 if (set_isempty(c->syscall_archs))
1585 return 0;
1586
1587 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1588 return 0;
1589
1590 return seccomp_restrict_archs(c->syscall_archs);
1591 }
1592
1593 static int apply_address_families(const Unit* u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
1596
1597 if (!context_has_address_families(c))
1598 return 0;
1599
1600 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1601 return 0;
1602
1603 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1604 }
1605
1606 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1607 int r;
1608
1609 assert(u);
1610 assert(c);
1611
1612 if (!c->memory_deny_write_execute)
1613 return 0;
1614
1615 /* use prctl() if kernel supports it (6.3) */
1616 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1617 if (r == 0) {
1618 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1619 return 0;
1620 }
1621 if (r < 0 && errno != EINVAL)
1622 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1623 /* else use seccomp */
1624 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1625
1626 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1627 return 0;
1628
1629 return seccomp_memory_deny_write_execute();
1630 }
1631
1632 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1633 assert(u);
1634 assert(c);
1635
1636 if (!c->restrict_realtime)
1637 return 0;
1638
1639 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1640 return 0;
1641
1642 return seccomp_restrict_realtime();
1643 }
1644
1645 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1646 assert(u);
1647 assert(c);
1648
1649 if (!c->restrict_suid_sgid)
1650 return 0;
1651
1652 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1653 return 0;
1654
1655 return seccomp_restrict_suid_sgid();
1656 }
1657
1658 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1659 assert(u);
1660 assert(c);
1661
1662 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1663 * let's protect even those systems where this is left on in the kernel. */
1664
1665 if (!c->protect_kernel_tunables)
1666 return 0;
1667
1668 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1669 return 0;
1670
1671 return seccomp_protect_sysctl();
1672 }
1673
1674 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1675 assert(u);
1676 assert(c);
1677
1678 /* Turn off module syscalls on ProtectKernelModules=yes */
1679
1680 if (!c->protect_kernel_modules)
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1684 return 0;
1685
1686 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1687 }
1688
1689 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1690 assert(u);
1691 assert(c);
1692
1693 if (!c->protect_kernel_logs)
1694 return 0;
1695
1696 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1697 return 0;
1698
1699 return seccomp_protect_syslog();
1700 }
1701
1702 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1703 assert(u);
1704 assert(c);
1705
1706 if (!c->protect_clock)
1707 return 0;
1708
1709 if (skip_seccomp_unavailable(u, "ProtectClock="))
1710 return 0;
1711
1712 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1713 }
1714
1715 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1716 assert(u);
1717 assert(c);
1718
1719 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1720
1721 if (!c->private_devices)
1722 return 0;
1723
1724 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1725 return 0;
1726
1727 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1728 }
1729
1730 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1731 assert(u);
1732 assert(c);
1733
1734 if (!exec_context_restrict_namespaces_set(c))
1735 return 0;
1736
1737 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1738 return 0;
1739
1740 return seccomp_restrict_namespaces(c->restrict_namespaces);
1741 }
1742
1743 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1744 unsigned long personality;
1745 int r;
1746
1747 assert(u);
1748 assert(c);
1749
1750 if (!c->lock_personality)
1751 return 0;
1752
1753 if (skip_seccomp_unavailable(u, "LockPersonality="))
1754 return 0;
1755
1756 personality = c->personality;
1757
1758 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1759 if (personality == PERSONALITY_INVALID) {
1760
1761 r = opinionated_personality(&personality);
1762 if (r < 0)
1763 return r;
1764 }
1765
1766 return seccomp_lock_personality(personality);
1767 }
1768
1769 #endif
1770
1771 #if HAVE_LIBBPF
1772 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1773 assert(u);
1774 assert(c);
1775
1776 if (!exec_context_restrict_filesystems_set(c))
1777 return 0;
1778
1779 if (!u->manager->restrict_fs) {
1780 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1781 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1782 return 0;
1783 }
1784
1785 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1786 }
1787 #endif
1788
1789 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1790 assert(u);
1791 assert(c);
1792
1793 if (!c->protect_hostname)
1794 return 0;
1795
1796 if (ns_type_supported(NAMESPACE_UTS)) {
1797 if (unshare(CLONE_NEWUTS) < 0) {
1798 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1799 *ret_exit_status = EXIT_NAMESPACE;
1800 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1801 }
1802
1803 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1804 }
1805 } else
1806 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1807
1808 #if HAVE_SECCOMP
1809 int r;
1810
1811 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1812 return 0;
1813
1814 r = seccomp_protect_hostname();
1815 if (r < 0) {
1816 *ret_exit_status = EXIT_SECCOMP;
1817 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1818 }
1819 #endif
1820
1821 return 0;
1822 }
1823
1824 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1825 assert(idle_pipe);
1826
1827 idle_pipe[1] = safe_close(idle_pipe[1]);
1828 idle_pipe[2] = safe_close(idle_pipe[2]);
1829
1830 if (idle_pipe[0] >= 0) {
1831 int r;
1832
1833 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1834
1835 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1836 ssize_t n;
1837
1838 /* Signal systemd that we are bored and want to continue. */
1839 n = write(idle_pipe[3], "x", 1);
1840 if (n > 0)
1841 /* Wait for systemd to react to the signal above. */
1842 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1843 }
1844
1845 idle_pipe[0] = safe_close(idle_pipe[0]);
1846
1847 }
1848
1849 idle_pipe[3] = safe_close(idle_pipe[3]);
1850 }
1851
1852 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1853
1854 static int build_environment(
1855 const Unit *u,
1856 const ExecContext *c,
1857 const ExecParameters *p,
1858 const CGroupContext *cgroup_context,
1859 size_t n_fds,
1860 char **fdnames,
1861 const char *home,
1862 const char *username,
1863 const char *shell,
1864 dev_t journal_stream_dev,
1865 ino_t journal_stream_ino,
1866 const char *memory_pressure_path,
1867 char ***ret) {
1868
1869 _cleanup_strv_free_ char **our_env = NULL;
1870 size_t n_env = 0;
1871 char *x;
1872 int r;
1873
1874 assert(u);
1875 assert(c);
1876 assert(p);
1877 assert(ret);
1878
1879 #define N_ENV_VARS 19
1880 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1881 if (!our_env)
1882 return -ENOMEM;
1883
1884 if (n_fds > 0) {
1885 _cleanup_free_ char *joined = NULL;
1886
1887 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1888 return -ENOMEM;
1889 our_env[n_env++] = x;
1890
1891 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1892 return -ENOMEM;
1893 our_env[n_env++] = x;
1894
1895 joined = strv_join(fdnames, ":");
1896 if (!joined)
1897 return -ENOMEM;
1898
1899 x = strjoin("LISTEN_FDNAMES=", joined);
1900 if (!x)
1901 return -ENOMEM;
1902 our_env[n_env++] = x;
1903 }
1904
1905 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1906 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1907 return -ENOMEM;
1908 our_env[n_env++] = x;
1909
1910 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1911 return -ENOMEM;
1912 our_env[n_env++] = x;
1913 }
1914
1915 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1916 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1917 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1918 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1919 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
1925 if (home) {
1926 x = strjoin("HOME=", home);
1927 if (!x)
1928 return -ENOMEM;
1929
1930 path_simplify(x + 5);
1931 our_env[n_env++] = x;
1932 }
1933
1934 if (username) {
1935 x = strjoin("LOGNAME=", username);
1936 if (!x)
1937 return -ENOMEM;
1938 our_env[n_env++] = x;
1939
1940 x = strjoin("USER=", username);
1941 if (!x)
1942 return -ENOMEM;
1943 our_env[n_env++] = x;
1944 }
1945
1946 if (shell) {
1947 x = strjoin("SHELL=", shell);
1948 if (!x)
1949 return -ENOMEM;
1950
1951 path_simplify(x + 6);
1952 our_env[n_env++] = x;
1953 }
1954
1955 if (!sd_id128_is_null(u->invocation_id)) {
1956 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1957 return -ENOMEM;
1958
1959 our_env[n_env++] = x;
1960 }
1961
1962 if (exec_context_needs_term(c)) {
1963 _cleanup_free_ char *cmdline = NULL;
1964 const char *tty_path, *term = NULL;
1965
1966 tty_path = exec_context_tty_path(c);
1967
1968 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1969 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1970 * container manager passes to PID 1 ends up all the way in the console login shown. */
1971
1972 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1973 term = getenv("TERM");
1974 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1975 _cleanup_free_ char *key = NULL;
1976
1977 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1978 if (!key)
1979 return -ENOMEM;
1980
1981 r = proc_cmdline_get_key(key, 0, &cmdline);
1982 if (r < 0)
1983 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1984 else if (r > 0)
1985 term = cmdline;
1986 }
1987
1988 if (!term)
1989 term = default_term_for_tty(tty_path);
1990
1991 x = strjoin("TERM=", term);
1992 if (!x)
1993 return -ENOMEM;
1994 our_env[n_env++] = x;
1995 }
1996
1997 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1998 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1999 return -ENOMEM;
2000
2001 our_env[n_env++] = x;
2002 }
2003
2004 if (c->log_namespace) {
2005 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2006 if (!x)
2007 return -ENOMEM;
2008
2009 our_env[n_env++] = x;
2010 }
2011
2012 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2013 _cleanup_free_ char *joined = NULL;
2014 const char *n;
2015
2016 if (!p->prefix[t])
2017 continue;
2018
2019 if (c->directories[t].n_items == 0)
2020 continue;
2021
2022 n = exec_directory_env_name_to_string(t);
2023 if (!n)
2024 continue;
2025
2026 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2027 _cleanup_free_ char *prefixed = NULL;
2028
2029 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2030 if (!prefixed)
2031 return -ENOMEM;
2032
2033 if (!strextend_with_separator(&joined, ":", prefixed))
2034 return -ENOMEM;
2035 }
2036
2037 x = strjoin(n, "=", joined);
2038 if (!x)
2039 return -ENOMEM;
2040
2041 our_env[n_env++] = x;
2042 }
2043
2044 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2045 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2046 if (!x)
2047 return -ENOMEM;
2048
2049 our_env[n_env++] = x;
2050 }
2051
2052 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2053 return -ENOMEM;
2054
2055 our_env[n_env++] = x;
2056
2057 if (memory_pressure_path) {
2058 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2059 if (!x)
2060 return -ENOMEM;
2061
2062 our_env[n_env++] = x;
2063
2064 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2065 _cleanup_free_ char *b = NULL, *e = NULL;
2066
2067 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2068 MEMORY_PRESSURE_DEFAULT_TYPE,
2069 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2070 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2071 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2072 return -ENOMEM;
2073
2074 if (base64mem(b, strlen(b) + 1, &e) < 0)
2075 return -ENOMEM;
2076
2077 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2078 if (!x)
2079 return -ENOMEM;
2080
2081 our_env[n_env++] = x;
2082 }
2083 }
2084
2085 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2086 #undef N_ENV_VARS
2087
2088 *ret = TAKE_PTR(our_env);
2089
2090 return 0;
2091 }
2092
2093 static int build_pass_environment(const ExecContext *c, char ***ret) {
2094 _cleanup_strv_free_ char **pass_env = NULL;
2095 size_t n_env = 0;
2096
2097 STRV_FOREACH(i, c->pass_environment) {
2098 _cleanup_free_ char *x = NULL;
2099 char *v;
2100
2101 v = getenv(*i);
2102 if (!v)
2103 continue;
2104 x = strjoin(*i, "=", v);
2105 if (!x)
2106 return -ENOMEM;
2107
2108 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2109 return -ENOMEM;
2110
2111 pass_env[n_env++] = TAKE_PTR(x);
2112 pass_env[n_env] = NULL;
2113 }
2114
2115 *ret = TAKE_PTR(pass_env);
2116
2117 return 0;
2118 }
2119
2120 bool exec_needs_network_namespace(const ExecContext *context) {
2121 assert(context);
2122
2123 return context->private_network || context->network_namespace_path;
2124 }
2125
2126 static bool exec_needs_ephemeral(const ExecContext *context) {
2127 return (context->root_image || context->root_directory) && context->root_ephemeral;
2128 }
2129
2130 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2131 assert(context);
2132
2133 return context->private_ipc || context->ipc_namespace_path;
2134 }
2135
2136 bool exec_needs_mount_namespace(
2137 const ExecContext *context,
2138 const ExecParameters *params,
2139 const ExecRuntime *runtime) {
2140
2141 assert(context);
2142
2143 if (context->root_image)
2144 return true;
2145
2146 if (!strv_isempty(context->read_write_paths) ||
2147 !strv_isempty(context->read_only_paths) ||
2148 !strv_isempty(context->inaccessible_paths) ||
2149 !strv_isempty(context->exec_paths) ||
2150 !strv_isempty(context->no_exec_paths))
2151 return true;
2152
2153 if (context->n_bind_mounts > 0)
2154 return true;
2155
2156 if (context->n_temporary_filesystems > 0)
2157 return true;
2158
2159 if (context->n_mount_images > 0)
2160 return true;
2161
2162 if (context->n_extension_images > 0)
2163 return true;
2164
2165 if (!strv_isempty(context->extension_directories))
2166 return true;
2167
2168 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2169 return true;
2170
2171 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2172 return true;
2173
2174 if (context->private_devices ||
2175 context->private_mounts > 0 ||
2176 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2177 context->protect_system != PROTECT_SYSTEM_NO ||
2178 context->protect_home != PROTECT_HOME_NO ||
2179 context->protect_kernel_tunables ||
2180 context->protect_kernel_modules ||
2181 context->protect_kernel_logs ||
2182 context->protect_control_groups ||
2183 context->protect_proc != PROTECT_PROC_DEFAULT ||
2184 context->proc_subset != PROC_SUBSET_ALL ||
2185 exec_needs_ipc_namespace(context))
2186 return true;
2187
2188 if (context->root_directory) {
2189 if (exec_context_get_effective_mount_apivfs(context))
2190 return true;
2191
2192 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2193 if (params && !params->prefix[t])
2194 continue;
2195
2196 if (context->directories[t].n_items > 0)
2197 return true;
2198 }
2199 }
2200
2201 if (context->dynamic_user &&
2202 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2203 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2204 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2205 return true;
2206
2207 if (context->log_namespace)
2208 return true;
2209
2210 return false;
2211 }
2212
2213 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2214 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2215 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2216 _cleanup_close_ int unshare_ready_fd = -EBADF;
2217 _cleanup_(sigkill_waitp) pid_t pid = 0;
2218 uint64_t c = 1;
2219 ssize_t n;
2220 int r;
2221
2222 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2223 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2224 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2225 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2226 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2227 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2228 * continues execution normally.
2229 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2230 * does not need CAP_SETUID to write the single line mapping to itself. */
2231
2232 /* Can only set up multiple mappings with CAP_SETUID. */
2233 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2234 r = asprintf(&uid_map,
2235 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2236 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2237 ouid, ouid, uid, uid);
2238 else
2239 r = asprintf(&uid_map,
2240 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2241 ouid, ouid);
2242
2243 if (r < 0)
2244 return -ENOMEM;
2245
2246 /* Can only set up multiple mappings with CAP_SETGID. */
2247 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2248 r = asprintf(&gid_map,
2249 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2250 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2251 ogid, ogid, gid, gid);
2252 else
2253 r = asprintf(&gid_map,
2254 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2255 ogid, ogid);
2256
2257 if (r < 0)
2258 return -ENOMEM;
2259
2260 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2261 * namespace. */
2262 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2263 if (unshare_ready_fd < 0)
2264 return -errno;
2265
2266 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2267 * failed. */
2268 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2269 return -errno;
2270
2271 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2272 if (r < 0)
2273 return r;
2274 if (r == 0) {
2275 _cleanup_close_ int fd = -EBADF;
2276 const char *a;
2277 pid_t ppid;
2278
2279 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2280 * here, after the parent opened its own user namespace. */
2281
2282 ppid = getppid();
2283 errno_pipe[0] = safe_close(errno_pipe[0]);
2284
2285 /* Wait until the parent unshared the user namespace */
2286 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2287 r = -errno;
2288 goto child_fail;
2289 }
2290
2291 /* Disable the setgroups() system call in the child user namespace, for good. */
2292 a = procfs_file_alloca(ppid, "setgroups");
2293 fd = open(a, O_WRONLY|O_CLOEXEC);
2294 if (fd < 0) {
2295 if (errno != ENOENT) {
2296 r = -errno;
2297 goto child_fail;
2298 }
2299
2300 /* If the file is missing the kernel is too old, let's continue anyway. */
2301 } else {
2302 if (write(fd, "deny\n", 5) < 0) {
2303 r = -errno;
2304 goto child_fail;
2305 }
2306
2307 fd = safe_close(fd);
2308 }
2309
2310 /* First write the GID map */
2311 a = procfs_file_alloca(ppid, "gid_map");
2312 fd = open(a, O_WRONLY|O_CLOEXEC);
2313 if (fd < 0) {
2314 r = -errno;
2315 goto child_fail;
2316 }
2317 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2318 r = -errno;
2319 goto child_fail;
2320 }
2321 fd = safe_close(fd);
2322
2323 /* The write the UID map */
2324 a = procfs_file_alloca(ppid, "uid_map");
2325 fd = open(a, O_WRONLY|O_CLOEXEC);
2326 if (fd < 0) {
2327 r = -errno;
2328 goto child_fail;
2329 }
2330 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2331 r = -errno;
2332 goto child_fail;
2333 }
2334
2335 _exit(EXIT_SUCCESS);
2336
2337 child_fail:
2338 (void) write(errno_pipe[1], &r, sizeof(r));
2339 _exit(EXIT_FAILURE);
2340 }
2341
2342 errno_pipe[1] = safe_close(errno_pipe[1]);
2343
2344 if (unshare(CLONE_NEWUSER) < 0)
2345 return -errno;
2346
2347 /* Let the child know that the namespace is ready now */
2348 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2349 return -errno;
2350
2351 /* Try to read an error code from the child */
2352 n = read(errno_pipe[0], &r, sizeof(r));
2353 if (n < 0)
2354 return -errno;
2355 if (n == sizeof(r)) { /* an error code was sent to us */
2356 if (r < 0)
2357 return r;
2358 return -EIO;
2359 }
2360 if (n != 0) /* on success we should have read 0 bytes */
2361 return -EIO;
2362
2363 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2364 if (r < 0)
2365 return r;
2366 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2367 return -EIO;
2368
2369 return 0;
2370 }
2371
2372 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2373 assert(context);
2374
2375 if (!context->dynamic_user)
2376 return false;
2377
2378 if (type == EXEC_DIRECTORY_CONFIGURATION)
2379 return false;
2380
2381 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2382 return false;
2383
2384 return true;
2385 }
2386
2387 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2388 _cleanup_free_ char *src_abs = NULL;
2389 int r;
2390
2391 assert(source);
2392
2393 src_abs = path_join(root, source);
2394 if (!src_abs)
2395 return -ENOMEM;
2396
2397 STRV_FOREACH(dst, symlinks) {
2398 _cleanup_free_ char *dst_abs = NULL;
2399
2400 dst_abs = path_join(root, *dst);
2401 if (!dst_abs)
2402 return -ENOMEM;
2403
2404 r = mkdir_parents_label(dst_abs, 0755);
2405 if (r < 0)
2406 return r;
2407
2408 r = symlink_idempotent(src_abs, dst_abs, true);
2409 if (r < 0)
2410 return r;
2411 }
2412
2413 return 0;
2414 }
2415
2416 static int setup_exec_directory(
2417 Unit *u,
2418 const ExecContext *context,
2419 const ExecParameters *params,
2420 uid_t uid,
2421 gid_t gid,
2422 ExecDirectoryType type,
2423 bool needs_mount_namespace,
2424 int *exit_status) {
2425
2426 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2427 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2428 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2429 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2430 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2431 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2432 };
2433 int r;
2434
2435 assert(context);
2436 assert(params);
2437 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2438 assert(exit_status);
2439
2440 if (!params->prefix[type])
2441 return 0;
2442
2443 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2444 if (!uid_is_valid(uid))
2445 uid = 0;
2446 if (!gid_is_valid(gid))
2447 gid = 0;
2448 }
2449
2450 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2451 _cleanup_free_ char *p = NULL, *pp = NULL;
2452
2453 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2454 if (!p) {
2455 r = -ENOMEM;
2456 goto fail;
2457 }
2458
2459 r = mkdir_parents_label(p, 0755);
2460 if (r < 0)
2461 goto fail;
2462
2463 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2464
2465 /* If we are in user mode, and a configuration directory exists but a state directory
2466 * doesn't exist, then we likely are upgrading from an older systemd version that
2467 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2468 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2469 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2470 * separated. If a service has both dirs configured but only the configuration dir
2471 * exists and the state dir does not, we assume we are looking at an update
2472 * situation. Hence, create a compatibility symlink, so that all expectations are
2473 * met.
2474 *
2475 * (We also do something similar with the log directory, which still doesn't exist in
2476 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2477
2478 /* this assumes the state dir is always created before the configuration dir */
2479 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2480 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2481
2482 r = laccess(p, F_OK);
2483 if (r == -ENOENT) {
2484 _cleanup_free_ char *q = NULL;
2485
2486 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2487 * under the configuration hierarchy. */
2488
2489 if (type == EXEC_DIRECTORY_STATE)
2490 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2491 else if (type == EXEC_DIRECTORY_LOGS)
2492 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2493 else
2494 assert_not_reached();
2495 if (!q) {
2496 r = -ENOMEM;
2497 goto fail;
2498 }
2499
2500 r = laccess(q, F_OK);
2501 if (r >= 0) {
2502 /* It does exist! This hence looks like an update. Symlink the
2503 * configuration directory into the state directory. */
2504
2505 r = symlink_idempotent(q, p, /* make_relative= */ true);
2506 if (r < 0)
2507 goto fail;
2508
2509 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2510 continue;
2511 } else if (r != -ENOENT)
2512 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2513
2514 } else if (r < 0)
2515 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2516 }
2517
2518 if (exec_directory_is_private(context, type)) {
2519 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2520 * case we want to avoid leaving a directory around fully accessible that is owned by
2521 * a dynamic user whose UID is later on reused. To lock this down we use the same
2522 * trick used by container managers to prohibit host users to get access to files of
2523 * the same UID in containers: we place everything inside a directory that has an
2524 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2525 * for unprivileged host code. We then use fs namespacing to make this directory
2526 * permeable for the service itself.
2527 *
2528 * Specifically: for a service which wants a special directory "foo/" we first create
2529 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2530 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2531 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2532 * unprivileged host users can't look into it. Inside of the namespace of the unit
2533 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2534 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2535 * for the service and making sure it only gets access to the dirs it needs but no
2536 * others. Tricky? Yes, absolutely, but it works!
2537 *
2538 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2539 * to be owned by the service itself.
2540 *
2541 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2542 * for sharing files or sockets with other services. */
2543
2544 pp = path_join(params->prefix[type], "private");
2545 if (!pp) {
2546 r = -ENOMEM;
2547 goto fail;
2548 }
2549
2550 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2551 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2552 if (r < 0)
2553 goto fail;
2554
2555 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2556 r = -ENOMEM;
2557 goto fail;
2558 }
2559
2560 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2561 r = mkdir_parents_label(pp, 0755);
2562 if (r < 0)
2563 goto fail;
2564
2565 if (is_dir(p, false) > 0 &&
2566 (laccess(pp, F_OK) == -ENOENT)) {
2567
2568 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2569 * it over. Most likely the service has been upgraded from one that didn't use
2570 * DynamicUser=1, to one that does. */
2571
2572 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2573 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2574 exec_directory_type_to_string(type), p, pp);
2575
2576 r = RET_NERRNO(rename(p, pp));
2577 if (r < 0)
2578 goto fail;
2579 } else {
2580 /* Otherwise, create the actual directory for the service */
2581
2582 r = mkdir_label(pp, context->directories[type].mode);
2583 if (r < 0 && r != -EEXIST)
2584 goto fail;
2585 }
2586
2587 if (!context->directories[type].items[i].only_create) {
2588 /* And link it up from the original place.
2589 * Notes
2590 * 1) If a mount namespace is going to be used, then this symlink remains on
2591 * the host, and a new one for the child namespace will be created later.
2592 * 2) It is not necessary to create this symlink when one of its parent
2593 * directories is specified and already created. E.g.
2594 * StateDirectory=foo foo/bar
2595 * In that case, the inode points to pp and p for "foo/bar" are the same:
2596 * pp = "/var/lib/private/foo/bar"
2597 * p = "/var/lib/foo/bar"
2598 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2599 * we do not need to create the symlink, but we cannot create the symlink.
2600 * See issue #24783. */
2601 r = symlink_idempotent(pp, p, true);
2602 if (r < 0)
2603 goto fail;
2604 }
2605
2606 } else {
2607 _cleanup_free_ char *target = NULL;
2608
2609 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2610 readlink_and_make_absolute(p, &target) >= 0) {
2611 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2612
2613 /* This already exists and is a symlink? Interesting. Maybe it's one created
2614 * by DynamicUser=1 (see above)?
2615 *
2616 * We do this for all directory types except for ConfigurationDirectory=,
2617 * since they all support the private/ symlink logic at least in some
2618 * configurations, see above. */
2619
2620 r = chase(target, NULL, 0, &target_resolved, NULL);
2621 if (r < 0)
2622 goto fail;
2623
2624 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2625 if (!q) {
2626 r = -ENOMEM;
2627 goto fail;
2628 }
2629
2630 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2631 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2632 if (r < 0)
2633 goto fail;
2634
2635 if (path_equal(q_resolved, target_resolved)) {
2636
2637 /* Hmm, apparently DynamicUser= was once turned on for this service,
2638 * but is no longer. Let's move the directory back up. */
2639
2640 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2641 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2642 exec_directory_type_to_string(type), q, p);
2643
2644 r = RET_NERRNO(unlink(p));
2645 if (r < 0)
2646 goto fail;
2647
2648 r = RET_NERRNO(rename(q, p));
2649 if (r < 0)
2650 goto fail;
2651 }
2652 }
2653
2654 r = mkdir_label(p, context->directories[type].mode);
2655 if (r < 0) {
2656 if (r != -EEXIST)
2657 goto fail;
2658
2659 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2660 struct stat st;
2661
2662 /* Don't change the owner/access mode of the configuration directory,
2663 * as in the common case it is not written to by a service, and shall
2664 * not be writable. */
2665
2666 r = RET_NERRNO(stat(p, &st));
2667 if (r < 0)
2668 goto fail;
2669
2670 /* Still complain if the access mode doesn't match */
2671 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2672 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2673 "(File system: %o %sMode: %o)",
2674 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2675 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2676
2677 continue;
2678 }
2679 }
2680 }
2681
2682 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2683 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2684 * current UID/GID ownership.) */
2685 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2686 if (r < 0)
2687 goto fail;
2688
2689 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2690 * available to user code anyway */
2691 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2692 continue;
2693
2694 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2695 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2696 * assignments to exist. */
2697 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2698 if (r < 0)
2699 goto fail;
2700 }
2701
2702 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2703 * they are set up later, to allow configuring empty var/run/etc. */
2704 if (!needs_mount_namespace)
2705 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2706 r = create_many_symlinks(params->prefix[type],
2707 context->directories[type].items[i].path,
2708 context->directories[type].items[i].symlinks);
2709 if (r < 0)
2710 goto fail;
2711 }
2712
2713 return 0;
2714
2715 fail:
2716 *exit_status = exit_status_table[type];
2717 return r;
2718 }
2719
#if ENABLE_SMACK
/* Applies a SMACK process label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * The label is chosen as follows:
 *   1. context->smack_process_label, if set;
 *   2. otherwise, if the manager has a default label: the SMACK_ATTR_EXEC label read from
 *      executable_fd, falling back to the manager's default if the executable carries none;
 *   3. otherwise nothing is applied.
 *
 * Returns 0 on success (or if there was nothing to do), negative errno-style value on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                label = context->smack_process_label;
        else if (manager->default_smack_process_label) {
                /* A missing xattr is not an error, it just means we use the default label. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = exec_label ?: manager->default_smack_process_label;
        } else
                return 0;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
2749
2750 static int compile_bind_mounts(
2751 const ExecContext *context,
2752 const ExecParameters *params,
2753 BindMount **ret_bind_mounts,
2754 size_t *ret_n_bind_mounts,
2755 char ***ret_empty_directories) {
2756
2757 _cleanup_strv_free_ char **empty_directories = NULL;
2758 BindMount *bind_mounts = NULL;
2759 size_t n, h = 0;
2760 int r;
2761
2762 assert(context);
2763 assert(params);
2764 assert(ret_bind_mounts);
2765 assert(ret_n_bind_mounts);
2766 assert(ret_empty_directories);
2767
2768 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2769
2770 n = context->n_bind_mounts;
2771 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2772 if (!params->prefix[t])
2773 continue;
2774
2775 for (size_t i = 0; i < context->directories[t].n_items; i++)
2776 n += !context->directories[t].items[i].only_create;
2777 }
2778
2779 if (n <= 0) {
2780 *ret_bind_mounts = NULL;
2781 *ret_n_bind_mounts = 0;
2782 *ret_empty_directories = NULL;
2783 return 0;
2784 }
2785
2786 bind_mounts = new(BindMount, n);
2787 if (!bind_mounts)
2788 return -ENOMEM;
2789
2790 for (size_t i = 0; i < context->n_bind_mounts; i++) {
2791 BindMount *item = context->bind_mounts + i;
2792 _cleanup_free_ char *s = NULL, *d = NULL;
2793
2794 s = strdup(item->source);
2795 if (!s)
2796 return -ENOMEM;
2797
2798 d = strdup(item->destination);
2799 if (!d)
2800 return -ENOMEM;
2801
2802 bind_mounts[h++] = (BindMount) {
2803 .source = TAKE_PTR(s),
2804 .destination = TAKE_PTR(d),
2805 .read_only = item->read_only,
2806 .recursive = item->recursive,
2807 .ignore_enoent = item->ignore_enoent,
2808 };
2809 }
2810
2811 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2812 if (!params->prefix[t])
2813 continue;
2814
2815 if (context->directories[t].n_items == 0)
2816 continue;
2817
2818 if (exec_directory_is_private(context, t) &&
2819 !exec_context_with_rootfs(context)) {
2820 char *private_root;
2821
2822 /* So this is for a dynamic user, and we need to make sure the process can access its own
2823 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2824 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2825
2826 private_root = path_join(params->prefix[t], "private");
2827 if (!private_root)
2828 return -ENOMEM;
2829
2830 r = strv_consume(&empty_directories, private_root);
2831 if (r < 0)
2832 return r;
2833 }
2834
2835 for (size_t i = 0; i < context->directories[t].n_items; i++) {
2836 _cleanup_free_ char *s = NULL, *d = NULL;
2837
2838 /* When one of the parent directories is in the list, we cannot create the symlink
2839 * for the child directory. See also the comments in setup_exec_directory(). */
2840 if (context->directories[t].items[i].only_create)
2841 continue;
2842
2843 if (exec_directory_is_private(context, t))
2844 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
2845 else
2846 s = path_join(params->prefix[t], context->directories[t].items[i].path);
2847 if (!s)
2848 return -ENOMEM;
2849
2850 if (exec_directory_is_private(context, t) &&
2851 exec_context_with_rootfs(context))
2852 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2853 * directory is not created on the root directory. So, let's bind-mount the directory
2854 * on the 'non-private' place. */
2855 d = path_join(params->prefix[t], context->directories[t].items[i].path);
2856 else
2857 d = strdup(s);
2858 if (!d)
2859 return -ENOMEM;
2860
2861 bind_mounts[h++] = (BindMount) {
2862 .source = TAKE_PTR(s),
2863 .destination = TAKE_PTR(d),
2864 .read_only = false,
2865 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
2866 .recursive = true,
2867 .ignore_enoent = false,
2868 };
2869 }
2870 }
2871
2872 assert(h == n);
2873
2874 *ret_bind_mounts = TAKE_PTR(bind_mounts);
2875 *ret_n_bind_mounts = n;
2876 *ret_empty_directories = TAKE_PTR(empty_directories);
2877
2878 return (int) n;
2879 }
2880
2881 /* ret_symlinks will contain a list of pairs src:dest that describes
2882 * the symlinks to create later on. For example, the symlinks needed
2883 * to safely give private directories to DynamicUser=1 users. */
2884 static int compile_symlinks(
2885 const ExecContext *context,
2886 const ExecParameters *params,
2887 bool setup_os_release_symlink,
2888 char ***ret_symlinks) {
2889
2890 _cleanup_strv_free_ char **symlinks = NULL;
2891 int r;
2892
2893 assert(context);
2894 assert(params);
2895 assert(ret_symlinks);
2896
2897 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
2898 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2899 _cleanup_free_ char *private_path = NULL, *path = NULL;
2900
2901 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2902 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
2903
2904 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2905 dst_abs = path_join(params->prefix[dt], *symlink);
2906 if (!src_abs || !dst_abs)
2907 return -ENOMEM;
2908
2909 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2910 if (r < 0)
2911 return r;
2912 }
2913
2914 if (!exec_directory_is_private(context, dt) ||
2915 exec_context_with_rootfs(context) ||
2916 context->directories[dt].items[i].only_create)
2917 continue;
2918
2919 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
2920 if (!private_path)
2921 return -ENOMEM;
2922
2923 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2924 if (!path)
2925 return -ENOMEM;
2926
2927 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2928 if (r < 0)
2929 return r;
2930 }
2931 }
2932
2933 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2934 * and readers will never get a half-written version. Note that, while the paths specified here are
2935 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2936 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2937 if (setup_os_release_symlink) {
2938 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2939 if (r < 0)
2940 return r;
2941
2942 r = strv_extend(&symlinks, "/run/host/os-release");
2943 if (r < 0)
2944 return r;
2945 }
2946
2947 *ret_symlinks = TAKE_PTR(symlinks);
2948
2949 return 0;
2950 }
2951
2952 static bool insist_on_sandboxing(
2953 const ExecContext *context,
2954 const char *root_dir,
2955 const char *root_image,
2956 const BindMount *bind_mounts,
2957 size_t n_bind_mounts) {
2958
2959 assert(context);
2960 assert(n_bind_mounts == 0 || bind_mounts);
2961
2962 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
2963 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
2964 * rearrange stuff in a way we cannot ignore gracefully. */
2965
2966 if (context->n_temporary_filesystems > 0)
2967 return true;
2968
2969 if (root_dir || root_image)
2970 return true;
2971
2972 if (context->n_mount_images > 0)
2973 return true;
2974
2975 if (context->dynamic_user)
2976 return true;
2977
2978 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2979 return true;
2980
2981 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2982 * essential. */
2983 for (size_t i = 0; i < n_bind_mounts; i++)
2984 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2985 return true;
2986
2987 if (context->log_namespace)
2988 return true;
2989
2990 return false;
2991 }
2992
2993 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2994 _cleanup_close_ int fd = -EBADF;
2995 int r;
2996
2997 if (!runtime || !runtime->ephemeral_copy)
2998 return 0;
2999
3000 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3001 if (r < 0)
3002 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3003
3004 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3005
3006 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3007 if (fd >= 0)
3008 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3009 return 0;
3010
3011 if (fd != -EAGAIN)
3012 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3013
3014 log_debug("Making ephemeral snapshot of %s to %s",
3015 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3016
3017 if (context->root_image)
3018 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3019 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3020 else
3021 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3022 AT_FDCWD, runtime->ephemeral_copy,
3023 BTRFS_SNAPSHOT_FALLBACK_COPY |
3024 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3025 BTRFS_SNAPSHOT_RECURSIVE |
3026 BTRFS_SNAPSHOT_LOCK_BSD);
3027 if (fd < 0)
3028 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3029 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3030
3031 if (context->root_image) {
3032 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3033 * which tends to not perform well in combination with lots of random writes.
3034 *
3035 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3036 * copy, but we at least want to make the intention clear.
3037 */
3038 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3039 if (r < 0)
3040 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3041 }
3042
3043 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3044 if (r < 0)
3045 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3046
3047 return 1;
3048 }
3049
3050 static int verity_settings_prepare(
3051 VeritySettings *verity,
3052 const char *root_image,
3053 const void *root_hash,
3054 size_t root_hash_size,
3055 const char *root_hash_path,
3056 const void *root_hash_sig,
3057 size_t root_hash_sig_size,
3058 const char *root_hash_sig_path,
3059 const char *verity_data_path) {
3060
3061 int r;
3062
3063 assert(verity);
3064
3065 if (root_hash) {
3066 void *d;
3067
3068 d = memdup(root_hash, root_hash_size);
3069 if (!d)
3070 return -ENOMEM;
3071
3072 free_and_replace(verity->root_hash, d);
3073 verity->root_hash_size = root_hash_size;
3074 verity->designator = PARTITION_ROOT;
3075 }
3076
3077 if (root_hash_sig) {
3078 void *d;
3079
3080 d = memdup(root_hash_sig, root_hash_sig_size);
3081 if (!d)
3082 return -ENOMEM;
3083
3084 free_and_replace(verity->root_hash_sig, d);
3085 verity->root_hash_sig_size = root_hash_sig_size;
3086 verity->designator = PARTITION_ROOT;
3087 }
3088
3089 if (verity_data_path) {
3090 r = free_and_strdup(&verity->data_path, verity_data_path);
3091 if (r < 0)
3092 return r;
3093 }
3094
3095 r = verity_settings_load(
3096 verity,
3097 root_image,
3098 root_hash_path,
3099 root_hash_sig_path);
3100 if (r < 0)
3101 return log_debug_errno(r, "Failed to load root hash: %m");
3102
3103 return 0;
3104 }
3105
3106 static int apply_mount_namespace(
3107 const Unit *u,
3108 ExecCommandFlags command_flags,
3109 const ExecContext *context,
3110 const ExecParameters *params,
3111 ExecRuntime *runtime,
3112 const char *memory_pressure_path,
3113 char **error_path) {
3114
3115 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3116 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3117 **read_write_paths_cleanup = NULL;
3118 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3119 *extension_dir = NULL, *host_os_release_stage = NULL;
3120 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3121 char **read_write_paths;
3122 NamespaceInfo ns_info;
3123 bool needs_sandboxing, setup_os_release_symlink;
3124 BindMount *bind_mounts = NULL;
3125 size_t n_bind_mounts = 0;
3126 int r;
3127
3128 assert(context);
3129
3130 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3131
3132 if (params->flags & EXEC_APPLY_CHROOT) {
3133 r = setup_ephemeral(context, runtime);
3134 if (r < 0)
3135 return r;
3136
3137 if (context->root_image)
3138 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3139 else
3140 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3141 }
3142
3143 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3144 if (r < 0)
3145 return r;
3146
3147 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3148 * service will need to write to it in order to start the notifications. */
3149 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3150 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3151 if (!read_write_paths_cleanup)
3152 return -ENOMEM;
3153
3154 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3155 if (r < 0)
3156 return r;
3157
3158 read_write_paths = read_write_paths_cleanup;
3159 } else
3160 read_write_paths = context->read_write_paths;
3161
3162 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3163 if (needs_sandboxing) {
3164 /* The runtime struct only contains the parent of the private /tmp,
3165 * which is non-accessible to world users. Inside of it there's a /tmp
3166 * that is sticky, and that's the one we want to use here.
3167 * This does not apply when we are using /run/systemd/empty as fallback. */
3168
3169 if (context->private_tmp && runtime && runtime->shared) {
3170 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3171 tmp_dir = runtime->shared->tmp_dir;
3172 else if (runtime->shared->tmp_dir)
3173 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
3174
3175 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3176 var_tmp_dir = runtime->shared->var_tmp_dir;
3177 else if (runtime->shared->var_tmp_dir)
3178 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
3179 }
3180
3181 ns_info = (NamespaceInfo) {
3182 .ignore_protect_paths = false,
3183 .private_dev = context->private_devices,
3184 .protect_control_groups = context->protect_control_groups,
3185 .protect_kernel_tunables = context->protect_kernel_tunables,
3186 .protect_kernel_modules = context->protect_kernel_modules,
3187 .protect_kernel_logs = context->protect_kernel_logs,
3188 .protect_hostname = context->protect_hostname,
3189 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3190 .protect_home = context->protect_home,
3191 .protect_system = context->protect_system,
3192 .protect_proc = context->protect_proc,
3193 .proc_subset = context->proc_subset,
3194 .private_network = exec_needs_network_namespace(context),
3195 .private_ipc = exec_needs_ipc_namespace(context),
3196 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3197 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3198 };
3199 } else if (!context->dynamic_user && root_dir)
3200 /*
3201 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3202 * sandbox info, otherwise enforce it, don't ignore protected paths and
3203 * fail if we are unable to apply the sandbox inside the mount namespace.
3204 */
3205 ns_info = (NamespaceInfo) {
3206 .ignore_protect_paths = true,
3207 };
3208 else
3209 ns_info = (NamespaceInfo) {};
3210
3211 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3212 setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
3213 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3214 if (r < 0)
3215 return r;
3216
3217 if (context->mount_propagation_flag == MS_SHARED)
3218 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3219
3220 if (exec_context_has_credentials(context) &&
3221 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3222 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3223 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3224 if (!creds_path)
3225 return -ENOMEM;
3226 }
3227
3228 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
3229 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3230 if (!propagate_dir)
3231 return -ENOMEM;
3232
3233 incoming_dir = strdup("/run/systemd/incoming");
3234 if (!incoming_dir)
3235 return -ENOMEM;
3236
3237 extension_dir = strdup("/run/systemd/unit-extensions");
3238 if (!extension_dir)
3239 return -ENOMEM;
3240
3241 /* If running under a different root filesystem, propagate the host's os-release. We make a
3242 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
3243 if (setup_os_release_symlink) {
3244 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3245 if (!host_os_release_stage)
3246 return -ENOMEM;
3247 }
3248 } else {
3249 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3250
3251 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3252 return -ENOMEM;
3253
3254 if (setup_os_release_symlink) {
3255 if (asprintf(&host_os_release_stage,
3256 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3257 geteuid()) < 0)
3258 return -ENOMEM;
3259 }
3260 }
3261
3262 if (root_image) {
3263 r = verity_settings_prepare(
3264 &verity,
3265 root_image,
3266 context->root_hash, context->root_hash_size, context->root_hash_path,
3267 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3268 context->root_verity);
3269 if (r < 0)
3270 return r;
3271 }
3272
3273 r = setup_namespace(
3274 root_dir,
3275 root_image,
3276 context->root_image_options,
3277 context->root_image_policy ?: &image_policy_service,
3278 &ns_info,
3279 read_write_paths,
3280 needs_sandboxing ? context->read_only_paths : NULL,
3281 needs_sandboxing ? context->inaccessible_paths : NULL,
3282 needs_sandboxing ? context->exec_paths : NULL,
3283 needs_sandboxing ? context->no_exec_paths : NULL,
3284 empty_directories,
3285 symlinks,
3286 bind_mounts,
3287 n_bind_mounts,
3288 context->temporary_filesystems,
3289 context->n_temporary_filesystems,
3290 context->mount_images,
3291 context->n_mount_images,
3292 context->mount_image_policy ?: &image_policy_service,
3293 tmp_dir,
3294 var_tmp_dir,
3295 creds_path,
3296 context->log_namespace,
3297 context->mount_propagation_flag,
3298 &verity,
3299 context->extension_images,
3300 context->n_extension_images,
3301 context->extension_image_policy ?: &image_policy_sysext,
3302 context->extension_directories,
3303 propagate_dir,
3304 incoming_dir,
3305 extension_dir,
3306 root_dir || root_image ? params->notify_socket : NULL,
3307 host_os_release_stage,
3308 error_path);
3309
3310 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3311 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3312 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3313 * completely different execution environment. */
3314 if (r == -ENOANO) {
3315 if (insist_on_sandboxing(
3316 context,
3317 root_dir, root_image,
3318 bind_mounts,
3319 n_bind_mounts))
3320 return log_unit_debug_errno(u,
3321 SYNTHETIC_ERRNO(EOPNOTSUPP),
3322 "Failed to set up namespace, and refusing to continue since "
3323 "the selected namespacing options alter mount environment non-trivially.\n"
3324 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3325 n_bind_mounts,
3326 context->n_temporary_filesystems,
3327 yes_no(root_dir),
3328 yes_no(root_image),
3329 yes_no(context->dynamic_user));
3330
3331 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3332 return 0;
3333 }
3334
3335 return r;
3336 }
3337
3338 static int apply_working_directory(
3339 const ExecContext *context,
3340 const ExecParameters *params,
3341 ExecRuntime *runtime,
3342 const char *home,
3343 int *exit_status) {
3344
3345 const char *d, *wd;
3346
3347 assert(context);
3348 assert(exit_status);
3349
3350 if (context->working_directory_home) {
3351
3352 if (!home) {
3353 *exit_status = EXIT_CHDIR;
3354 return -ENXIO;
3355 }
3356
3357 wd = home;
3358
3359 } else
3360 wd = empty_to_root(context->working_directory);
3361
3362 if (params->flags & EXEC_APPLY_CHROOT)
3363 d = wd;
3364 else
3365 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
3366
3367 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3368 *exit_status = EXIT_CHDIR;
3369 return -errno;
3370 }
3371
3372 return 0;
3373 }
3374
3375 static int apply_root_directory(
3376 const ExecContext *context,
3377 const ExecParameters *params,
3378 ExecRuntime *runtime,
3379 const bool needs_mount_ns,
3380 int *exit_status) {
3381
3382 assert(context);
3383 assert(exit_status);
3384
3385 if (params->flags & EXEC_APPLY_CHROOT)
3386 if (!needs_mount_ns && context->root_directory)
3387 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
3388 *exit_status = EXIT_CHROOT;
3389 return -errno;
3390 }
3391
3392 return 0;
3393 }
3394
3395 static int setup_keyring(
3396 const Unit *u,
3397 const ExecContext *context,
3398 const ExecParameters *p,
3399 uid_t uid, gid_t gid) {
3400
3401 key_serial_t keyring;
3402 int r = 0;
3403 uid_t saved_uid;
3404 gid_t saved_gid;
3405
3406 assert(u);
3407 assert(context);
3408 assert(p);
3409
3410 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3411 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3412 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3413 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3414 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3415 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3416
3417 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3418 return 0;
3419
3420 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3421 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3422 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3423 * & group is just as nasty as acquiring a reference to the user keyring. */
3424
3425 saved_uid = getuid();
3426 saved_gid = getgid();
3427
3428 if (gid_is_valid(gid) && gid != saved_gid) {
3429 if (setregid(gid, -1) < 0)
3430 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3431 }
3432
3433 if (uid_is_valid(uid) && uid != saved_uid) {
3434 if (setreuid(uid, -1) < 0) {
3435 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3436 goto out;
3437 }
3438 }
3439
3440 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3441 if (keyring == -1) {
3442 if (errno == ENOSYS)
3443 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3444 else if (ERRNO_IS_PRIVILEGE(errno))
3445 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3446 else if (errno == EDQUOT)
3447 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3448 else
3449 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3450
3451 goto out;
3452 }
3453
3454 /* When requested link the user keyring into the session keyring. */
3455 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3456
3457 if (keyctl(KEYCTL_LINK,
3458 KEY_SPEC_USER_KEYRING,
3459 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3460 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3461 goto out;
3462 }
3463 }
3464
3465 /* Restore uid/gid back */
3466 if (uid_is_valid(uid) && uid != saved_uid) {
3467 if (setreuid(saved_uid, -1) < 0) {
3468 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3469 goto out;
3470 }
3471 }
3472
3473 if (gid_is_valid(gid) && gid != saved_gid) {
3474 if (setregid(saved_gid, -1) < 0)
3475 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3476 }
3477
3478 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3479 if (!sd_id128_is_null(u->invocation_id)) {
3480 key_serial_t key;
3481
3482 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3483 if (key == -1)
3484 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3485 else {
3486 if (keyctl(KEYCTL_SETPERM, key,
3487 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3488 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3489 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3490 }
3491 }
3492
3493 out:
3494 /* Revert back uid & gid for the last time, and exit */
3495 /* no extra logging, as only the first already reported error matters */
3496 if (getuid() != saved_uid)
3497 (void) setreuid(saved_uid, -1);
3498
3499 if (getgid() != saved_gid)
3500 (void) setregid(saved_gid, -1);
3501
3502 return r;
3503 }
3504
/* Appends the valid (i.e. non-negative) fds of the socket pair to 'array', advancing the element
 * counter *n for each one added. The caller must make sure the array has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                *n += 1;
        }
}
3515
3516 static int close_remaining_fds(
3517 const ExecParameters *params,
3518 const ExecRuntime *runtime,
3519 int user_lookup_fd,
3520 int socket_fd,
3521 const int *fds, size_t n_fds) {
3522
3523 size_t n_dont_close = 0;
3524 int dont_close[n_fds + 14];
3525
3526 assert(params);
3527
3528 if (params->stdin_fd >= 0)
3529 dont_close[n_dont_close++] = params->stdin_fd;
3530 if (params->stdout_fd >= 0)
3531 dont_close[n_dont_close++] = params->stdout_fd;
3532 if (params->stderr_fd >= 0)
3533 dont_close[n_dont_close++] = params->stderr_fd;
3534
3535 if (socket_fd >= 0)
3536 dont_close[n_dont_close++] = socket_fd;
3537 if (n_fds > 0) {
3538 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3539 n_dont_close += n_fds;
3540 }
3541
3542 if (runtime)
3543 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3544
3545 if (runtime && runtime->shared) {
3546 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3547 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
3548 }
3549
3550 if (runtime && runtime->dynamic_creds) {
3551 if (runtime->dynamic_creds->user)
3552 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3553 if (runtime->dynamic_creds->group)
3554 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
3555 }
3556
3557 if (user_lookup_fd >= 0)
3558 dont_close[n_dont_close++] = user_lookup_fd;
3559
3560 return close_all_fds(dont_close, n_dont_close);
3561 }
3562
3563 static int send_user_lookup(
3564 Unit *unit,
3565 int user_lookup_fd,
3566 uid_t uid,
3567 gid_t gid) {
3568
3569 assert(unit);
3570
3571 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3572 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3573 * specified. */
3574
3575 if (user_lookup_fd < 0)
3576 return 0;
3577
3578 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3579 return 0;
3580
3581 if (writev(user_lookup_fd,
3582 (struct iovec[]) {
3583 IOVEC_MAKE(&uid, sizeof(uid)),
3584 IOVEC_MAKE(&gid, sizeof(gid)),
3585 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
3586 return -errno;
3587
3588 return 0;
3589 }
3590
3591 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3592 int r;
3593
3594 assert(c);
3595 assert(home);
3596 assert(buf);
3597
3598 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3599
3600 if (*home)
3601 return 0;
3602
3603 if (!c->working_directory_home)
3604 return 0;
3605
3606 r = get_home_dir(buf);
3607 if (r < 0)
3608 return r;
3609
3610 *home = *buf;
3611 return 1;
3612 }
3613
3614 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3615 _cleanup_strv_free_ char ** list = NULL;
3616 int r;
3617
3618 assert(c);
3619 assert(p);
3620 assert(ret);
3621
3622 assert(c->dynamic_user);
3623
3624 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3625 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3626 * directories. */
3627
3628 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3629 if (t == EXEC_DIRECTORY_CONFIGURATION)
3630 continue;
3631
3632 if (!p->prefix[t])
3633 continue;
3634
3635 for (size_t i = 0; i < c->directories[t].n_items; i++) {
3636 char *e;
3637
3638 if (exec_directory_is_private(c, t))
3639 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
3640 else
3641 e = path_join(p->prefix[t], c->directories[t].items[i].path);
3642 if (!e)
3643 return -ENOMEM;
3644
3645 r = strv_consume(&list, e);
3646 if (r < 0)
3647 return r;
3648 }
3649 }
3650
3651 *ret = TAKE_PTR(list);
3652
3653 return 0;
3654 }
3655
3656 static int exec_parameters_get_cgroup_path(
3657 const ExecParameters *params,
3658 const CGroupContext *c,
3659 char **ret) {
3660
3661 const char *subgroup = NULL;
3662 char *p;
3663
3664 assert(params);
3665 assert(ret);
3666
3667 if (!params->cgroup_path)
3668 return -EINVAL;
3669
3670 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3671 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3672 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3673 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3674 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3675 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3676 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3677 * flag, which is only passed for the former statements, not for the latter. */
3678
3679 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3680 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3681 subgroup = ".control";
3682 else
3683 subgroup = c->delegate_subgroup;
3684 }
3685
3686 if (subgroup)
3687 p = path_join(params->cgroup_path, subgroup);
3688 else
3689 p = strdup(params->cgroup_path);
3690 if (!p)
3691 return -ENOMEM;
3692
3693 *ret = p;
3694 return !!subgroup;
3695 }
3696
3697 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3698 _cleanup_(cpu_set_reset) CPUSet s = {};
3699 int r;
3700
3701 assert(c);
3702 assert(ret);
3703
3704 if (!c->numa_policy.nodes.set) {
3705 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3706 return 0;
3707 }
3708
3709 r = numa_to_cpu_set(&c->numa_policy, &s);
3710 if (r < 0)
3711 return r;
3712
3713 cpu_set_reset(ret);
3714
3715 return cpu_set_add_all(ret, &s);
3716 }
3717
3718 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3719 assert(c);
3720
3721 return c->cpu_affinity_from_numa;
3722 }
3723
3724 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3725 int r;
3726
3727 assert(fds);
3728 assert(n_fds);
3729 assert(*n_fds < fds_size);
3730 assert(ret_fd);
3731
3732 if (fd < 0) {
3733 *ret_fd = -EBADF;
3734 return 0;
3735 }
3736
3737 if (fd < 3 + (int) *n_fds) {
3738 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3739 * the fds we pass to the process (or which are closed only during execve). */
3740
3741 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3742 if (r < 0)
3743 return -errno;
3744
3745 close_and_replace(fd, r);
3746 }
3747
3748 *ret_fd = fds[*n_fds] = fd;
3749 (*n_fds) ++;
3750 return 1;
3751 }
3752
3753 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3754 union sockaddr_union addr = {
3755 .un.sun_family = AF_UNIX,
3756 };
3757 socklen_t sa_len;
3758 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3759 int r;
3760
3761 assert(u);
3762 assert(of);
3763 assert(ofd >= 0);
3764
3765 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3766 if (r < 0)
3767 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3768
3769 sa_len = r;
3770
3771 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3772 _cleanup_close_ int fd = -EBADF;
3773
3774 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3775 if (fd < 0)
3776 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3777
3778 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3779 if (r == -EPROTOTYPE)
3780 continue;
3781 if (r < 0)
3782 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3783
3784 return TAKE_FD(fd);
3785 }
3786
3787 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3788 }
3789
3790 static int get_open_file_fd(Unit *u, const OpenFile *of) {
3791 struct stat st;
3792 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3793
3794 assert(u);
3795 assert(of);
3796
3797 ofd = open(of->path, O_PATH | O_CLOEXEC);
3798 if (ofd < 0)
3799 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3800
3801 if (fstat(ofd, &st) < 0)
3802 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
3803
3804 if (S_ISSOCK(st.st_mode)) {
3805 fd = connect_unix_harder(u, of, ofd);
3806 if (fd < 0)
3807 return fd;
3808
3809 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
3810 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3811 of->path);
3812
3813 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3814 } else {
3815 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3816 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3817 flags |= O_APPEND;
3818 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3819 flags |= O_TRUNC;
3820
3821 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3822 if (fd < 0)
3823 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3824
3825 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3826 }
3827
3828 return TAKE_FD(fd);
3829 }
3830
3831 static int collect_open_file_fds(
3832 Unit *u,
3833 OpenFile* open_files,
3834 int **fds,
3835 char ***fdnames,
3836 size_t *n_fds) {
3837 int r;
3838
3839 assert(u);
3840 assert(fds);
3841 assert(fdnames);
3842 assert(n_fds);
3843
3844 LIST_FOREACH(open_files, of, open_files) {
3845 _cleanup_close_ int fd = -EBADF;
3846
3847 fd = get_open_file_fd(u, of);
3848 if (fd < 0) {
3849 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3850 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3851 continue;
3852 }
3853
3854 return fd;
3855 }
3856
3857 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3858 return -ENOMEM;
3859
3860 r = strv_extend(fdnames, of->fdname);
3861 if (r < 0)
3862 return r;
3863
3864 (*fds)[*n_fds] = TAKE_FD(fd);
3865
3866 (*n_fds)++;
3867 }
3868
3869 return 0;
3870 }
3871
3872 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3873 assert(unit);
3874 assert(msg);
3875 assert(executable);
3876
3877 if (!DEBUG_LOGGING)
3878 return;
3879
3880 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3881
3882 log_unit_struct(unit, LOG_DEBUG,
3883 "EXECUTABLE=%s", executable,
3884 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3885 LOG_UNIT_INVOCATION_ID(unit));
3886 }
3887
3888 static bool exec_context_need_unprivileged_private_users(
3889 const ExecContext *context,
3890 const ExecParameters *params) {
3891
3892 assert(context);
3893 assert(params);
3894
3895 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3896 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3897 * (system manager) then we have privileges and don't need this. */
3898 if (params->runtime_scope != RUNTIME_SCOPE_USER)
3899 return false;
3900
3901 return context->private_users ||
3902 context->private_tmp ||
3903 context->private_devices ||
3904 context->private_network ||
3905 context->network_namespace_path ||
3906 context->private_ipc ||
3907 context->ipc_namespace_path ||
3908 context->private_mounts > 0 ||
3909 context->mount_apivfs ||
3910 context->n_bind_mounts > 0 ||
3911 context->n_temporary_filesystems > 0 ||
3912 context->root_directory ||
3913 !strv_isempty(context->extension_directories) ||
3914 context->protect_system != PROTECT_SYSTEM_NO ||
3915 context->protect_home != PROTECT_HOME_NO ||
3916 context->protect_kernel_tunables ||
3917 context->protect_kernel_modules ||
3918 context->protect_kernel_logs ||
3919 context->protect_control_groups ||
3920 context->protect_clock ||
3921 context->protect_hostname ||
3922 !strv_isempty(context->read_write_paths) ||
3923 !strv_isempty(context->read_only_paths) ||
3924 !strv_isempty(context->inaccessible_paths) ||
3925 !strv_isempty(context->exec_paths) ||
3926 !strv_isempty(context->no_exec_paths);
3927 }
3928
3929 static int exec_child(
3930 Unit *unit,
3931 const ExecCommand *command,
3932 const ExecContext *context,
3933 const ExecParameters *params,
3934 ExecRuntime *runtime,
3935 const CGroupContext *cgroup_context,
3936 int socket_fd,
3937 const int named_iofds[static 3],
3938 int *params_fds,
3939 size_t n_socket_fds,
3940 size_t n_storage_fds,
3941 char **files_env,
3942 int user_lookup_fd,
3943 int *exit_status) {
3944
3945 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
3946 int r, ngids = 0, exec_fd;
3947 _cleanup_free_ gid_t *supplementary_gids = NULL;
3948 const char *username = NULL, *groupname = NULL;
3949 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
3950 const char *home = NULL, *shell = NULL;
3951 char **final_argv = NULL;
3952 dev_t journal_stream_dev = 0;
3953 ino_t journal_stream_ino = 0;
3954 bool userns_set_up = false;
3955 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3956 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3957 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3958 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
3959 #if HAVE_SELINUX
3960 _cleanup_free_ char *mac_selinux_context_net = NULL;
3961 bool use_selinux = false;
3962 #endif
3963 #if ENABLE_SMACK
3964 bool use_smack = false;
3965 #endif
3966 #if HAVE_APPARMOR
3967 bool use_apparmor = false;
3968 #endif
3969 uid_t saved_uid = getuid();
3970 gid_t saved_gid = getgid();
3971 uid_t uid = UID_INVALID;
3972 gid_t gid = GID_INVALID;
3973 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3974 n_keep_fds; /* total number of fds not to close */
3975 int secure_bits;
3976 _cleanup_free_ gid_t *gids_after_pam = NULL;
3977 int ngids_after_pam = 0;
3978 _cleanup_free_ int *fds = NULL;
3979 _cleanup_strv_free_ char **fdnames = NULL;
3980
3981 assert(unit);
3982 assert(command);
3983 assert(context);
3984 assert(params);
3985 assert(exit_status);
3986
3987 /* Explicitly test for CVE-2021-4034 inspired invocations */
3988 assert(command->path);
3989 assert(!strv_isempty(command->argv));
3990
3991 rename_process_from_path(command->path);
3992
3993 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3994 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3995 * both of which will be demoted to SIG_DFL. */
3996 (void) default_signals(SIGNALS_CRASH_HANDLER,
3997 SIGNALS_IGNORE);
3998
3999 if (context->ignore_sigpipe)
4000 (void) ignore_signals(SIGPIPE);
4001
4002 r = reset_signal_mask();
4003 if (r < 0) {
4004 *exit_status = EXIT_SIGNAL_MASK;
4005 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4006 }
4007
4008 if (params->idle_pipe)
4009 do_idle_pipe_dance(params->idle_pipe);
4010
4011 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4012 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4013 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4014 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4015
4016 log_forget_fds();
4017 log_set_open_when_needed(true);
4018 log_settle_target();
4019
4020 /* In case anything used libc syslog(), close this here, too */
4021 closelog();
4022
4023 fds = newdup(int, params_fds, n_fds);
4024 if (!fds) {
4025 *exit_status = EXIT_MEMORY;
4026 return log_oom();
4027 }
4028
4029 fdnames = strv_copy((char**) params->fd_names);
4030 if (!fdnames) {
4031 *exit_status = EXIT_MEMORY;
4032 return log_oom();
4033 }
4034
4035 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4036 if (r < 0) {
4037 *exit_status = EXIT_FDS;
4038 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4039 }
4040
4041 int keep_fds[n_fds + 3];
4042 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4043 n_keep_fds = n_fds;
4044
4045 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4046 if (r < 0) {
4047 *exit_status = EXIT_FDS;
4048 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4049 }
4050
4051 #if HAVE_LIBBPF
4052 if (unit->manager->restrict_fs) {
4053 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4054 if (bpf_map_fd < 0) {
4055 *exit_status = EXIT_FDS;
4056 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4057 }
4058
4059 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4060 if (r < 0) {
4061 *exit_status = EXIT_FDS;
4062 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4063 }
4064 }
4065 #endif
4066
4067 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4068 if (r < 0) {
4069 *exit_status = EXIT_FDS;
4070 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4071 }
4072
4073 if (!context->same_pgrp &&
4074 setsid() < 0) {
4075 *exit_status = EXIT_SETSID;
4076 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4077 }
4078
4079 exec_context_tty_reset(context, params);
4080
4081 if (unit_shall_confirm_spawn(unit)) {
4082 _cleanup_free_ char *cmdline = NULL;
4083
4084 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4085 if (!cmdline) {
4086 *exit_status = EXIT_MEMORY;
4087 return log_oom();
4088 }
4089
4090 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4091 if (r != CONFIRM_EXECUTE) {
4092 if (r == CONFIRM_PRETEND_SUCCESS) {
4093 *exit_status = EXIT_SUCCESS;
4094 return 0;
4095 }
4096
4097 *exit_status = EXIT_CONFIRM;
4098 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4099 "Execution cancelled by the user");
4100 }
4101 }
4102
4103 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4104 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4105 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4106 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4107 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4108 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4109 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4110 *exit_status = EXIT_MEMORY;
4111 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4112 }
4113
4114 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4115 _cleanup_strv_free_ char **suggested_paths = NULL;
4116
4117 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4118 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4119 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4120 *exit_status = EXIT_USER;
4121 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4122 }
4123
4124 r = compile_suggested_paths(context, params, &suggested_paths);
4125 if (r < 0) {
4126 *exit_status = EXIT_MEMORY;
4127 return log_oom();
4128 }
4129
4130 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4131 if (r < 0) {
4132 *exit_status = EXIT_USER;
4133 if (r == -EILSEQ)
4134 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4135 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4136 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4137 }
4138
4139 if (!uid_is_valid(uid)) {
4140 *exit_status = EXIT_USER;
4141 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4142 }
4143
4144 if (!gid_is_valid(gid)) {
4145 *exit_status = EXIT_USER;
4146 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4147 }
4148
4149 if (runtime->dynamic_creds->user)
4150 username = runtime->dynamic_creds->user->name;
4151
4152 } else {
4153 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4154 if (r < 0) {
4155 *exit_status = EXIT_USER;
4156 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4157 }
4158
4159 r = get_fixed_group(context, &groupname, &gid);
4160 if (r < 0) {
4161 *exit_status = EXIT_GROUP;
4162 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4163 }
4164 }
4165
4166 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4167 r = get_supplementary_groups(context, username, groupname, gid,
4168 &supplementary_gids, &ngids);
4169 if (r < 0) {
4170 *exit_status = EXIT_GROUP;
4171 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4172 }
4173
4174 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4175 if (r < 0) {
4176 *exit_status = EXIT_USER;
4177 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4178 }
4179
4180 user_lookup_fd = safe_close(user_lookup_fd);
4181
4182 r = acquire_home(context, uid, &home, &home_buffer);
4183 if (r < 0) {
4184 *exit_status = EXIT_CHDIR;
4185 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4186 }
4187
4188 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
4189 if (socket_fd >= 0)
4190 (void) fd_nonblock(socket_fd, false);
4191
4192 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4193 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4194 if (params->cgroup_path) {
4195 _cleanup_free_ char *p = NULL;
4196
4197 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4198 if (r < 0) {
4199 *exit_status = EXIT_CGROUP;
4200 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4201 }
4202
4203 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4204 if (r == -EUCLEAN) {
4205 *exit_status = EXIT_CGROUP;
4206 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4207 "because the cgroup or one of its parents or "
4208 "siblings is in the threaded mode: %m", p);
4209 }
4210 if (r < 0) {
4211 *exit_status = EXIT_CGROUP;
4212 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4213 }
4214 }
4215
4216 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4217 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4218 if (r < 0) {
4219 *exit_status = EXIT_NETWORK;
4220 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4221 }
4222 }
4223
4224 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4225 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4226 if (r < 0) {
4227 *exit_status = EXIT_NAMESPACE;
4228 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4229 }
4230 }
4231
4232 r = setup_input(context, params, socket_fd, named_iofds);
4233 if (r < 0) {
4234 *exit_status = EXIT_STDIN;
4235 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4236 }
4237
4238 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4239 if (r < 0) {
4240 *exit_status = EXIT_STDOUT;
4241 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4242 }
4243
4244 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4245 if (r < 0) {
4246 *exit_status = EXIT_STDERR;
4247 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4248 }
4249
4250 if (context->oom_score_adjust_set) {
4251 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4252 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
4253 r = set_oom_score_adjust(context->oom_score_adjust);
4254 if (ERRNO_IS_NEG_PRIVILEGE(r))
4255 log_unit_debug_errno(unit, r,
4256 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4257 else if (r < 0) {
4258 *exit_status = EXIT_OOM_ADJUST;
4259 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4260 }
4261 }
4262
4263 if (context->coredump_filter_set) {
4264 r = set_coredump_filter(context->coredump_filter);
4265 if (ERRNO_IS_NEG_PRIVILEGE(r))
4266 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4267 else if (r < 0) {
4268 *exit_status = EXIT_LIMITS;
4269 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4270 }
4271 }
4272
4273 if (context->nice_set) {
4274 r = setpriority_closest(context->nice);
4275 if (r < 0) {
4276 *exit_status = EXIT_NICE;
4277 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4278 }
4279 }
4280
4281 if (context->cpu_sched_set) {
4282 struct sched_param param = {
4283 .sched_priority = context->cpu_sched_priority,
4284 };
4285
4286 r = sched_setscheduler(0,
4287 context->cpu_sched_policy |
4288 (context->cpu_sched_reset_on_fork ?
4289 SCHED_RESET_ON_FORK : 0),
4290 &param);
4291 if (r < 0) {
4292 *exit_status = EXIT_SETSCHEDULER;
4293 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4294 }
4295 }
4296
4297 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4298 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4299 const CPUSet *cpu_set;
4300
4301 if (context->cpu_affinity_from_numa) {
4302 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4303 if (r < 0) {
4304 *exit_status = EXIT_CPUAFFINITY;
4305 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4306 }
4307
4308 cpu_set = &converted_cpu_set;
4309 } else
4310 cpu_set = &context->cpu_set;
4311
4312 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4313 *exit_status = EXIT_CPUAFFINITY;
4314 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4315 }
4316 }
4317
4318 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4319 r = apply_numa_policy(&context->numa_policy);
4320 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4321 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4322 else if (r < 0) {
4323 *exit_status = EXIT_NUMA_POLICY;
4324 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4325 }
4326 }
4327
4328 if (context->ioprio_set)
4329 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4330 *exit_status = EXIT_IOPRIO;
4331 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4332 }
4333
4334 if (context->timer_slack_nsec != NSEC_INFINITY)
4335 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4336 *exit_status = EXIT_TIMERSLACK;
4337 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4338 }
4339
4340 if (context->personality != PERSONALITY_INVALID) {
4341 r = safe_personality(context->personality);
4342 if (r < 0) {
4343 *exit_status = EXIT_PERSONALITY;
4344 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4345 }
4346 }
4347
4348 if (context->utmp_id) {
4349 const char *line = context->tty_path ?
4350 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4351 NULL;
4352 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4353 line,
4354 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4355 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4356 USER_PROCESS,
4357 username);
4358 }
4359
4360 if (uid_is_valid(uid)) {
4361 r = chown_terminal(STDIN_FILENO, uid);
4362 if (r < 0) {
4363 *exit_status = EXIT_STDIN;
4364 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4365 }
4366 }
4367
4368 if (params->cgroup_path) {
4369 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4370 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4371 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4372 * touch a single hierarchy too. */
4373
4374 if (params->flags & EXEC_CGROUP_DELEGATE) {
4375 _cleanup_free_ char *p = NULL;
4376
4377 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4378 if (r < 0) {
4379 *exit_status = EXIT_CGROUP;
4380 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4381 }
4382
4383 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4384 if (r < 0) {
4385 *exit_status = EXIT_CGROUP;
4386 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4387 }
4388 if (r > 0) {
4389 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
4390 if (r < 0) {
4391 *exit_status = EXIT_CGROUP;
4392 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4393 }
4394 }
4395 }
4396
4397 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4398 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4399 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4400 if (r < 0) {
4401 *exit_status = EXIT_MEMORY;
4402 return log_oom();
4403 }
4404
4405 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4406 if (r < 0) {
4407 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4408 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4409 memory_pressure_path = mfree(memory_pressure_path);
4410 }
4411 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4412 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4413 if (!memory_pressure_path) {
4414 *exit_status = EXIT_MEMORY;
4415 return log_oom();
4416 }
4417 }
4418 }
4419 }
4420
4421 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4422
4423 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4424 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4425 if (r < 0)
4426 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4427 }
4428
4429 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4430 r = setup_credentials(context, params, unit->id, uid, gid);
4431 if (r < 0) {
4432 *exit_status = EXIT_CREDENTIALS;
4433 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4434 }
4435 }
4436
4437 r = build_environment(
4438 unit,
4439 context,
4440 params,
4441 cgroup_context,
4442 n_fds,
4443 fdnames,
4444 home,
4445 username,
4446 shell,
4447 journal_stream_dev,
4448 journal_stream_ino,
4449 memory_pressure_path,
4450 &our_env);
4451 if (r < 0) {
4452 *exit_status = EXIT_MEMORY;
4453 return log_oom();
4454 }
4455
4456 r = build_pass_environment(context, &pass_env);
4457 if (r < 0) {
4458 *exit_status = EXIT_MEMORY;
4459 return log_oom();
4460 }
4461
4462 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4463 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4464 * not specify PATH but the unit has ExecSearchPath. */
4465 if (!strv_isempty(context->exec_search_path)) {
4466 _cleanup_free_ char *joined = NULL;
4467
4468 joined = strv_join(context->exec_search_path, ":");
4469 if (!joined) {
4470 *exit_status = EXIT_MEMORY;
4471 return log_oom();
4472 }
4473
4474 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4475 if (r < 0) {
4476 *exit_status = EXIT_MEMORY;
4477 return log_oom();
4478 }
4479 }
4480
4481 accum_env = strv_env_merge(params->environment,
4482 our_env,
4483 joined_exec_search_path,
4484 pass_env,
4485 context->environment,
4486 files_env);
4487 if (!accum_env) {
4488 *exit_status = EXIT_MEMORY;
4489 return log_oom();
4490 }
4491 accum_env = strv_env_clean(accum_env);
4492
4493 (void) umask(context->umask);
4494
4495 r = setup_keyring(unit, context, params, uid, gid);
4496 if (r < 0) {
4497 *exit_status = EXIT_KEYRING;
4498 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4499 }
4500
4501 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4502 * from it. */
4503 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4504
4505 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4506 * for it, and the kernel doesn't actually support ambient caps. */
4507 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4508
4509 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4510 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4511 * desired. */
4512 if (needs_ambient_hack)
4513 needs_setuid = false;
4514 else
4515 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4516
4517 uint64_t capability_ambient_set = context->capability_ambient_set;
4518
4519 if (needs_sandboxing) {
4520 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4521 * /sys being present. The actual MAC context application will happen later, as late as
4522 * possible, to avoid impacting our own code paths. */
4523
4524 #if HAVE_SELINUX
4525 use_selinux = mac_selinux_use();
4526 #endif
4527 #if ENABLE_SMACK
4528 use_smack = mac_smack_use();
4529 #endif
4530 #if HAVE_APPARMOR
4531 use_apparmor = mac_apparmor_use();
4532 #endif
4533 }
4534
4535 if (needs_sandboxing) {
4536 int which_failed;
4537
4538 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4539 * is set here. (See below.) */
4540
4541 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4542 if (r < 0) {
4543 *exit_status = EXIT_LIMITS;
4544 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4545 }
4546 }
4547
4548 if (needs_setuid && context->pam_name && username) {
4549 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4550 * wins here. (See above.) */
4551
4552 /* All fds passed in the fds array will be closed in the pam child process. */
4553 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4554 if (r < 0) {
4555 *exit_status = EXIT_PAM;
4556 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4557 }
4558
4559 if (ambient_capabilities_supported()) {
4560 uint64_t ambient_after_pam;
4561
4562 /* PAM modules might have set some ambient caps. Query them here and merge them into
4563 * the caps we want to set in the end, so that we don't end up unsetting them. */
4564 r = capability_get_ambient(&ambient_after_pam);
4565 if (r < 0) {
4566 *exit_status = EXIT_CAPABILITIES;
4567 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4568 }
4569
4570 capability_ambient_set |= ambient_after_pam;
4571 }
4572
4573 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4574 if (ngids_after_pam < 0) {
4575 *exit_status = EXIT_MEMORY;
4576 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4577 }
4578 }
4579
4580 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
4581 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4582 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4583 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4584
4585 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4586 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4587 * the actual requested operations fail (or silently continue). */
4588 if (r < 0 && context->private_users) {
4589 *exit_status = EXIT_USER;
4590 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4591 }
4592 if (r < 0)
4593 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4594 else
4595 userns_set_up = true;
4596 }
4597
4598 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4599
4600 /* Try to enable network namespacing if network namespacing is available and we have
4601 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4602 * new network namespace. And if we don't have that, then we could only create a network
4603 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4604 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
4605 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
4606 if (ERRNO_IS_NEG_PRIVILEGE(r))
4607 log_unit_notice_errno(unit, r,
4608 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4609 else if (r < 0) {
4610 *exit_status = EXIT_NETWORK;
4611 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4612 }
4613 } else if (context->network_namespace_path) {
4614 *exit_status = EXIT_NETWORK;
4615 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4616 "NetworkNamespacePath= is not supported, refusing.");
4617 } else
4618 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
4619 }
4620
4621 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4622
4623 if (ns_type_supported(NAMESPACE_IPC)) {
4624 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
4625 if (r == -EPERM)
4626 log_unit_warning_errno(unit, r,
4627 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4628 else if (r < 0) {
4629 *exit_status = EXIT_NAMESPACE;
4630 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4631 }
4632 } else if (context->ipc_namespace_path) {
4633 *exit_status = EXIT_NAMESPACE;
4634 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4635 "IPCNamespacePath= is not supported, refusing.");
4636 } else
4637 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4638 }
4639
4640 if (needs_mount_namespace) {
4641 _cleanup_free_ char *error_path = NULL;
4642
4643 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
4644 if (r < 0) {
4645 *exit_status = EXIT_NAMESPACE;
4646 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4647 error_path ? ": " : "", strempty(error_path));
4648 }
4649 }
4650
4651 if (needs_sandboxing) {
4652 r = apply_protect_hostname(unit, context, exit_status);
4653 if (r < 0)
4654 return r;
4655 }
4656
4657 if (context->memory_ksm >= 0)
4658 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4659 if (ERRNO_IS_NOT_SUPPORTED(errno))
4660 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4661 else {
4662 *exit_status = EXIT_KSM;
4663 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4664 }
4665 }
4666
4667 /* Drop groups as early as possible.
4668 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4669 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4670 if (needs_setuid) {
4671 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4672 int ngids_to_enforce = 0;
4673
4674 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4675 ngids,
4676 gids_after_pam,
4677 ngids_after_pam,
4678 &gids_to_enforce);
4679 if (ngids_to_enforce < 0) {
4680 *exit_status = EXIT_MEMORY;
4681 return log_unit_error_errno(unit,
4682 ngids_to_enforce,
4683 "Failed to merge group lists. Group membership might be incorrect: %m");
4684 }
4685
4686 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4687 if (r < 0) {
4688 *exit_status = EXIT_GROUP;
4689 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4690 }
4691 }
4692
4693 /* If the user namespace was not set up above, try to do it now.
4694 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4695 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4696 * case of mount namespaces being less privileged when the mount point list is copied from a
4697 * different user namespace). */
4698
4699 if (needs_sandboxing && context->private_users && !userns_set_up) {
4700 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4701 if (r < 0) {
4702 *exit_status = EXIT_USER;
4703 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4704 }
4705 }
4706
4707 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4708 * shall execute. */
4709
4710 _cleanup_free_ char *executable = NULL;
4711 _cleanup_close_ int executable_fd = -EBADF;
4712 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4713 if (r < 0) {
4714 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4715 log_unit_struct_errno(unit, LOG_INFO, r,
4716 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4717 LOG_UNIT_INVOCATION_ID(unit),
4718 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4719 command->path),
4720 "EXECUTABLE=%s", command->path);
4721 *exit_status = EXIT_SUCCESS;
4722 return 0;
4723 }
4724
4725 *exit_status = EXIT_EXEC;
4726 return log_unit_struct_errno(unit, LOG_INFO, r,
4727 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4728 LOG_UNIT_INVOCATION_ID(unit),
4729 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4730 command->path),
4731 "EXECUTABLE=%s", command->path);
4732 }
4733
4734 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4735 if (r < 0) {
4736 *exit_status = EXIT_FDS;
4737 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4738 }
4739
4740 #if HAVE_SELINUX
4741 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4742 int fd = -EBADF;
4743
4744 if (socket_fd >= 0)
4745 fd = socket_fd;
4746 else if (params->n_socket_fds == 1)
4747 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4748 * use context from that fd to compute the label. */
4749 fd = params->fds[0];
4750
4751 if (fd >= 0) {
4752 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4753 if (r < 0) {
4754 if (!context->selinux_context_ignore) {
4755 *exit_status = EXIT_SELINUX_CONTEXT;
4756 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4757 }
4758 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4759 }
4760 }
4761 }
4762 #endif
4763
4764 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4765 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4766 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4767 * execve(). */
4768
4769 r = close_all_fds(keep_fds, n_keep_fds);
4770 if (r >= 0)
4771 r = shift_fds(fds, n_fds);
4772 if (r >= 0)
4773 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4774 if (r < 0) {
4775 *exit_status = EXIT_FDS;
4776 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4777 }
4778
4779 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4780 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4781 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4782 * came this far. */
4783
4784 secure_bits = context->secure_bits;
4785
4786 if (needs_sandboxing) {
4787 uint64_t bset;
4788
4789 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4790 * (Note this is placed after the general resource limit initialization, see above, in order
4791 * to take precedence.) */
4792 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4793 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4794 *exit_status = EXIT_LIMITS;
4795 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
4796 }
4797 }
4798
4799 #if ENABLE_SMACK
4800 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4801 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4802 if (use_smack) {
4803 r = setup_smack(unit->manager, context, executable_fd);
4804 if (r < 0 && !context->smack_process_label_ignore) {
4805 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4806 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4807 }
4808 }
4809 #endif
4810
4811 bset = context->capability_bounding_set;
4812 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4813 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4814 * instead of us doing that */
4815 if (needs_ambient_hack)
4816 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4817 (UINT64_C(1) << CAP_SETUID) |
4818 (UINT64_C(1) << CAP_SETGID);
4819
4820 if (!cap_test_all(bset)) {
4821 r = capability_bounding_set_drop(bset, /* right_now= */ false);
4822 if (r < 0) {
4823 *exit_status = EXIT_CAPABILITIES;
4824 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
4825 }
4826 }
4827
4828 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4829 * keep-caps set.
4830 *
4831 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4832 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4833 * the ambient capabilities can be raised as they are present in the permitted and
4834 * inheritable set. However it is possible that someone wants to set ambient capabilities
4835 * without changing the user, so we also set the ambient capabilities here.
4836 *
4837 * The requested ambient capabilities are raised in the inheritable set if the second
4838 * argument is true. */
4839 if (!needs_ambient_hack) {
4840 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
4841 if (r < 0) {
4842 *exit_status = EXIT_CAPABILITIES;
4843 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
4844 }
4845 }
4846 }
4847
4848 /* chroot to root directory first, before we lose the ability to chroot */
4849 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
4850 if (r < 0)
4851 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4852
4853 if (needs_setuid) {
4854 if (uid_is_valid(uid)) {
4855 r = enforce_user(context, uid, capability_ambient_set);
4856 if (r < 0) {
4857 *exit_status = EXIT_USER;
4858 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
4859 }
4860
4861 if (!needs_ambient_hack && capability_ambient_set != 0) {
4862
4863 /* Raise the ambient capabilities after user change. */
4864 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
4865 if (r < 0) {
4866 *exit_status = EXIT_CAPABILITIES;
4867 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
4868 }
4869 }
4870 }
4871 }
4872
4873 /* Apply working directory here, because the working directory might be on NFS and only the user running
4874 * this service might have the correct privilege to change to the working directory */
4875 r = apply_working_directory(context, params, runtime, home, exit_status);
4876 if (r < 0)
4877 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4878
4879 if (needs_sandboxing) {
4880 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
4881 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4882 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4883 * are restricted. */
4884
4885 #if HAVE_SELINUX
4886 if (use_selinux) {
4887 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4888
4889 if (exec_context) {
4890 r = setexeccon(exec_context);
4891 if (r < 0) {
4892 if (!context->selinux_context_ignore) {
4893 *exit_status = EXIT_SELINUX_CONTEXT;
4894 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4895 }
4896 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
4897 }
4898 }
4899 }
4900 #endif
4901
4902 #if HAVE_APPARMOR
4903 if (use_apparmor && context->apparmor_profile) {
4904 r = aa_change_onexec(context->apparmor_profile);
4905 if (r < 0 && !context->apparmor_profile_ignore) {
4906 *exit_status = EXIT_APPARMOR_PROFILE;
4907 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
4908 }
4909 }
4910 #endif
4911
4912 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4913 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4914 * requires CAP_SETPCAP. */
4915 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
4916 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
4917 * effective set here.
4918 *
4919 * The effective set is overwritten during execve() with the following values:
4920 *
4921 * - ambient set (for non-root processes)
4922 *
4923 * - (inheritable | bounding) set (for root processes)
4924 *
4925 * Hence there is no security impact to raise it in the effective set before execve
4926 */
4927 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
4928 if (r < 0) {
4929 *exit_status = EXIT_CAPABILITIES;
4930 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4931 }
4932 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
4933 *exit_status = EXIT_SECUREBITS;
4934 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
4935 }
4936 }
4937
4938 if (context_has_no_new_privileges(context))
4939 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
4940 *exit_status = EXIT_NO_NEW_PRIVILEGES;
4941 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
4942 }
4943
4944 #if HAVE_SECCOMP
4945 r = apply_address_families(unit, context);
4946 if (r < 0) {
4947 *exit_status = EXIT_ADDRESS_FAMILIES;
4948 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4949 }
4950
4951 r = apply_memory_deny_write_execute(unit, context);
4952 if (r < 0) {
4953 *exit_status = EXIT_SECCOMP;
4954 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
4955 }
4956
4957 r = apply_restrict_realtime(unit, context);
4958 if (r < 0) {
4959 *exit_status = EXIT_SECCOMP;
4960 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
4961 }
4962
4963 r = apply_restrict_suid_sgid(unit, context);
4964 if (r < 0) {
4965 *exit_status = EXIT_SECCOMP;
4966 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4967 }
4968
4969 r = apply_restrict_namespaces(unit, context);
4970 if (r < 0) {
4971 *exit_status = EXIT_SECCOMP;
4972 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
4973 }
4974
4975 r = apply_protect_sysctl(unit, context);
4976 if (r < 0) {
4977 *exit_status = EXIT_SECCOMP;
4978 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
4979 }
4980
4981 r = apply_protect_kernel_modules(unit, context);
4982 if (r < 0) {
4983 *exit_status = EXIT_SECCOMP;
4984 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
4985 }
4986
4987 r = apply_protect_kernel_logs(unit, context);
4988 if (r < 0) {
4989 *exit_status = EXIT_SECCOMP;
4990 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4991 }
4992
4993 r = apply_protect_clock(unit, context);
4994 if (r < 0) {
4995 *exit_status = EXIT_SECCOMP;
4996 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4997 }
4998
4999 r = apply_private_devices(unit, context);
5000 if (r < 0) {
5001 *exit_status = EXIT_SECCOMP;
5002 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5003 }
5004
5005 r = apply_syscall_archs(unit, context);
5006 if (r < 0) {
5007 *exit_status = EXIT_SECCOMP;
5008 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5009 }
5010
5011 r = apply_lock_personality(unit, context);
5012 if (r < 0) {
5013 *exit_status = EXIT_SECCOMP;
5014 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5015 }
5016
5017 r = apply_syscall_log(unit, context);
5018 if (r < 0) {
5019 *exit_status = EXIT_SECCOMP;
5020 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5021 }
5022
5023 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5024 * by the filter as little as possible. */
5025 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5026 if (r < 0) {
5027 *exit_status = EXIT_SECCOMP;
5028 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5029 }
5030 #endif
5031
5032 #if HAVE_LIBBPF
5033 r = apply_restrict_filesystems(unit, context);
5034 if (r < 0) {
5035 *exit_status = EXIT_BPF;
5036 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5037 }
5038 #endif
5039
5040 }
5041
5042 if (!strv_isempty(context->unset_environment)) {
5043 char **ee = NULL;
5044
5045 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5046 if (!ee) {
5047 *exit_status = EXIT_MEMORY;
5048 return log_oom();
5049 }
5050
5051 strv_free_and_replace(accum_env, ee);
5052 }
5053
5054 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5055 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5056
5057 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5058 if (r < 0) {
5059 *exit_status = EXIT_MEMORY;
5060 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5061 }
5062 final_argv = replaced_argv;
5063
5064 if (!strv_isempty(unset_variables)) {
5065 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5066 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5067 }
5068
5069 if (!strv_isempty(bad_variables)) {
5070 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5071 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5072 }
5073 } else
5074 final_argv = command->argv;
5075
5076 log_command_line(unit, "Executing", executable, final_argv);
5077
5078 if (exec_fd >= 0) {
5079 uint8_t hot = 1;
5080
5081 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5082 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5083
5084 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5085 *exit_status = EXIT_EXEC;
5086 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5087 }
5088 }
5089
5090 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5091
5092 if (exec_fd >= 0) {
5093 uint8_t hot = 0;
5094
5095 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5096 * that POLLHUP on it no longer means execve() succeeded. */
5097
5098 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5099 *exit_status = EXIT_EXEC;
5100 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5101 }
5102 }
5103
5104 *exit_status = EXIT_EXEC;
5105 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5106 }
5107
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below already
 * calls them. */
static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5110
5111 int exec_spawn(Unit *unit,
5112 ExecCommand *command,
5113 const ExecContext *context,
5114 const ExecParameters *params,
5115 ExecRuntime *runtime,
5116 const CGroupContext *cgroup_context,
5117 pid_t *ret) {
5118
5119 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5120 _cleanup_free_ char *subcgroup_path = NULL;
5121 _cleanup_strv_free_ char **files_env = NULL;
5122 size_t n_storage_fds = 0, n_socket_fds = 0;
5123 pid_t pid;
5124
5125 assert(unit);
5126 assert(command);
5127 assert(context);
5128 assert(ret);
5129 assert(params);
5130 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5131
5132 LOG_CONTEXT_PUSH_UNIT(unit);
5133
5134 if (context->std_input == EXEC_INPUT_SOCKET ||
5135 context->std_output == EXEC_OUTPUT_SOCKET ||
5136 context->std_error == EXEC_OUTPUT_SOCKET) {
5137
5138 if (params->n_socket_fds > 1)
5139 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5140
5141 if (params->n_socket_fds == 0)
5142 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5143
5144 socket_fd = params->fds[0];
5145 } else {
5146 socket_fd = -EBADF;
5147 fds = params->fds;
5148 n_socket_fds = params->n_socket_fds;
5149 n_storage_fds = params->n_storage_fds;
5150 }
5151
5152 r = exec_context_named_iofds(context, params, named_iofds);
5153 if (r < 0)
5154 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5155
5156 r = exec_context_load_environment(unit, context, &files_env);
5157 if (r < 0)
5158 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5159
5160 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5161 and, until the next SELinux policy changes, we save further reloads in future children. */
5162 mac_selinux_maybe_reload();
5163
5164 /* We won't know the real executable path until we create the mount namespace in the child, but we
5165 want to log from the parent, so we use the possibly inaccurate path here. */
5166 log_command_line(unit, "About to execute", command->path, command->argv);
5167
5168 if (params->cgroup_path) {
5169 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
5170 if (r < 0)
5171 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5172 if (r > 0) {
5173 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5174 * realized by the unit logic) */
5175
5176 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5177 if (r < 0)
5178 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
5179 }
5180 }
5181
5182 pid = fork();
5183 if (pid < 0)
5184 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5185
5186 if (pid == 0) {
5187 int exit_status;
5188
5189 r = exec_child(unit,
5190 command,
5191 context,
5192 params,
5193 runtime,
5194 cgroup_context,
5195 socket_fd,
5196 named_iofds,
5197 fds,
5198 n_socket_fds,
5199 n_storage_fds,
5200 files_env,
5201 unit->manager->user_lookup_fds[1],
5202 &exit_status);
5203
5204 if (r < 0) {
5205 const char *status = ASSERT_PTR(
5206 exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
5207
5208 log_unit_struct_errno(unit, LOG_ERR, r,
5209 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5210 LOG_UNIT_INVOCATION_ID(unit),
5211 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5212 status, command->path),
5213 "EXECUTABLE=%s", command->path);
5214 } else
5215 assert(exit_status == EXIT_SUCCESS);
5216
5217 _exit(exit_status);
5218 }
5219
5220 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5221
5222 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5223 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5224 * process will be killed too). */
5225 if (subcgroup_path)
5226 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5227
5228 exec_status_start(&command->exec_status, pid);
5229
5230 *ret = pid;
5231 return 0;
5232 }
5233
5234 void exec_context_init(ExecContext *c) {
5235 assert(c);
5236
5237 c->umask = 0022;
5238 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5239 c->cpu_sched_policy = SCHED_OTHER;
5240 c->syslog_priority = LOG_DAEMON|LOG_INFO;
5241 c->syslog_level_prefix = true;
5242 c->ignore_sigpipe = true;
5243 c->timer_slack_nsec = NSEC_INFINITY;
5244 c->personality = PERSONALITY_INVALID;
5245 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5246 c->directories[t].mode = 0755;
5247 c->timeout_clean_usec = USEC_INFINITY;
5248 c->capability_bounding_set = CAP_MASK_UNSET;
5249 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5250 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5251 c->log_level_max = -1;
5252 #if HAVE_SECCOMP
5253 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5254 #endif
5255 c->tty_rows = UINT_MAX;
5256 c->tty_cols = UINT_MAX;
5257 numa_policy_reset(&c->numa_policy);
5258 c->private_mounts = -1;
5259 c->memory_ksm = -1;
5260 }
5261
5262 void exec_context_done(ExecContext *c) {
5263 assert(c);
5264
5265 c->environment = strv_free(c->environment);
5266 c->environment_files = strv_free(c->environment_files);
5267 c->pass_environment = strv_free(c->pass_environment);
5268 c->unset_environment = strv_free(c->unset_environment);
5269
5270 rlimit_free_all(c->rlimit);
5271
5272 for (size_t l = 0; l < 3; l++) {
5273 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5274 c->stdio_file[l] = mfree(c->stdio_file[l]);
5275 }
5276
5277 c->working_directory = mfree(c->working_directory);
5278 c->root_directory = mfree(c->root_directory);
5279 c->root_image = mfree(c->root_image);
5280 c->root_image_options = mount_options_free_all(c->root_image_options);
5281 c->root_hash = mfree(c->root_hash);
5282 c->root_hash_size = 0;
5283 c->root_hash_path = mfree(c->root_hash_path);
5284 c->root_hash_sig = mfree(c->root_hash_sig);
5285 c->root_hash_sig_size = 0;
5286 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5287 c->root_verity = mfree(c->root_verity);
5288 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5289 c->extension_directories = strv_free(c->extension_directories);
5290 c->tty_path = mfree(c->tty_path);
5291 c->syslog_identifier = mfree(c->syslog_identifier);
5292 c->user = mfree(c->user);
5293 c->group = mfree(c->group);
5294
5295 c->supplementary_groups = strv_free(c->supplementary_groups);
5296
5297 c->pam_name = mfree(c->pam_name);
5298
5299 c->read_only_paths = strv_free(c->read_only_paths);
5300 c->read_write_paths = strv_free(c->read_write_paths);
5301 c->inaccessible_paths = strv_free(c->inaccessible_paths);
5302 c->exec_paths = strv_free(c->exec_paths);
5303 c->no_exec_paths = strv_free(c->no_exec_paths);
5304 c->exec_search_path = strv_free(c->exec_search_path);
5305
5306 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5307 c->bind_mounts = NULL;
5308 c->n_bind_mounts = 0;
5309 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5310 c->temporary_filesystems = NULL;
5311 c->n_temporary_filesystems = 0;
5312 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5313
5314 cpu_set_reset(&c->cpu_set);
5315 numa_policy_reset(&c->numa_policy);
5316
5317 c->utmp_id = mfree(c->utmp_id);
5318 c->selinux_context = mfree(c->selinux_context);
5319 c->apparmor_profile = mfree(c->apparmor_profile);
5320 c->smack_process_label = mfree(c->smack_process_label);
5321
5322 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
5323
5324 c->syscall_filter = hashmap_free(c->syscall_filter);
5325 c->syscall_archs = set_free(c->syscall_archs);
5326 c->address_families = set_free(c->address_families);
5327
5328 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5329 exec_directory_done(&c->directories[t]);
5330
5331 c->log_level_max = -1;
5332
5333 exec_context_free_log_extra_fields(c);
5334 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5335 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
5336
5337 c->log_ratelimit_interval_usec = 0;
5338 c->log_ratelimit_burst = 0;
5339
5340 c->stdin_data = mfree(c->stdin_data);
5341 c->stdin_data_size = 0;
5342
5343 c->network_namespace_path = mfree(c->network_namespace_path);
5344 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5345
5346 c->log_namespace = mfree(c->log_namespace);
5347
5348 c->load_credentials = hashmap_free(c->load_credentials);
5349 c->set_credentials = hashmap_free(c->set_credentials);
5350 c->import_credentials = set_free_free(c->import_credentials);
5351
5352 c->root_image_policy = image_policy_free(c->root_image_policy);
5353 c->mount_image_policy = image_policy_free(c->mount_image_policy);
5354 c->extension_image_policy = image_policy_free(c->extension_image_policy);
5355 }
5356
5357 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5358 assert(c);
5359
5360 if (!runtime_prefix)
5361 return 0;
5362
5363 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5364 _cleanup_free_ char *p = NULL;
5365
5366 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5367 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5368 else
5369 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5370 if (!p)
5371 return -ENOMEM;
5372
5373 /* We execute this synchronously, since we need to be sure this is gone when we start the
5374 * service next. */
5375 (void) rm_rf(p, REMOVE_ROOT);
5376
5377 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5378 _cleanup_free_ char *symlink_abs = NULL;
5379
5380 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5381 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5382 else
5383 symlink_abs = path_join(runtime_prefix, *symlink);
5384 if (!symlink_abs)
5385 return -ENOMEM;
5386
5387 (void) unlink(symlink_abs);
5388 }
5389 }
5390
5391 return 0;
5392 }
5393
5394 int exec_context_destroy_mount_ns_dir(Unit *u) {
5395 _cleanup_free_ char *p = NULL;
5396
5397 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5398 return 0;
5399
5400 p = path_join("/run/systemd/propagate/", u->id);
5401 if (!p)
5402 return -ENOMEM;
5403
5404 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5405 if (rmdir(p) < 0 && errno != ENOENT)
5406 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5407
5408 return 0;
5409 }
5410
5411 static void exec_command_done(ExecCommand *c) {
5412 assert(c);
5413
5414 c->path = mfree(c->path);
5415 c->argv = strv_free(c->argv);
5416 }
5417
5418 void exec_command_done_array(ExecCommand *c, size_t n) {
5419 for (size_t i = 0; i < n; i++)
5420 exec_command_done(c+i);
5421 }
5422
5423 ExecCommand* exec_command_free_list(ExecCommand *c) {
5424 ExecCommand *i;
5425
5426 while ((i = LIST_POP(command, c))) {
5427 exec_command_done(i);
5428 free(i);
5429 }
5430
5431 return NULL;
5432 }
5433
5434 void exec_command_free_array(ExecCommand **c, size_t n) {
5435 for (size_t i = 0; i < n; i++)
5436 c[i] = exec_command_free_list(c[i]);
5437 }
5438
5439 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5440 for (size_t i = 0; i < n; i++)
5441 exec_status_reset(&c[i].exec_status);
5442 }
5443
5444 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5445 for (size_t i = 0; i < n; i++)
5446 LIST_FOREACH(command, z, c[i])
5447 exec_status_reset(&z->exec_status);
5448 }
5449
/* Context handed as userdata to invalid_env(): names the unit to log against and the environment file the
 * rejected assignment was read from. */
typedef struct InvalidEnvInfo {
        const Unit *unit; /* unit the log message is attributed to */
        const char *path; /* path of the environment file, printed in the log message */
} InvalidEnvInfo;
5454
5455 static void invalid_env(const char *p, void *userdata) {
5456 InvalidEnvInfo *info = userdata;
5457
5458 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5459 }
5460
5461 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5462 assert(c);
5463
5464 switch (fd_index) {
5465
5466 case STDIN_FILENO:
5467 if (c->std_input != EXEC_INPUT_NAMED_FD)
5468 return NULL;
5469
5470 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5471
5472 case STDOUT_FILENO:
5473 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5474 return NULL;
5475
5476 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5477
5478 case STDERR_FILENO:
5479 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5480 return NULL;
5481
5482 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5483
5484 default:
5485 return NULL;
5486 }
5487 }
5488
5489 static int exec_context_named_iofds(
5490 const ExecContext *c,
5491 const ExecParameters *p,
5492 int named_iofds[static 3]) {
5493
5494 size_t targets;
5495 const char* stdio_fdname[3];
5496 size_t n_fds;
5497
5498 assert(c);
5499 assert(p);
5500 assert(named_iofds);
5501
5502 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5503 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5504 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5505
5506 for (size_t i = 0; i < 3; i++)
5507 stdio_fdname[i] = exec_context_fdname(c, i);
5508
5509 n_fds = p->n_storage_fds + p->n_socket_fds;
5510
5511 for (size_t i = 0; i < n_fds && targets > 0; i++)
5512 if (named_iofds[STDIN_FILENO] < 0 &&
5513 c->std_input == EXEC_INPUT_NAMED_FD &&
5514 stdio_fdname[STDIN_FILENO] &&
5515 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5516
5517 named_iofds[STDIN_FILENO] = p->fds[i];
5518 targets--;
5519
5520 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5521 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5522 stdio_fdname[STDOUT_FILENO] &&
5523 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5524
5525 named_iofds[STDOUT_FILENO] = p->fds[i];
5526 targets--;
5527
5528 } else if (named_iofds[STDERR_FILENO] < 0 &&
5529 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5530 stdio_fdname[STDERR_FILENO] &&
5531 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5532
5533 named_iofds[STDERR_FILENO] = p->fds[i];
5534 targets--;
5535 }
5536
5537 return targets == 0 ? 0 : -ENOENT;
5538 }
5539
5540 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5541 _cleanup_strv_free_ char **v = NULL;
5542 int r;
5543
5544 assert(c);
5545 assert(ret);
5546
5547 STRV_FOREACH(i, c->environment_files) {
5548 _cleanup_globfree_ glob_t pglob = {};
5549 bool ignore = false;
5550 char *fn = *i;
5551
5552 if (fn[0] == '-') {
5553 ignore = true;
5554 fn++;
5555 }
5556
5557 if (!path_is_absolute(fn)) {
5558 if (ignore)
5559 continue;
5560 return -EINVAL;
5561 }
5562
5563 /* Filename supports globbing, take all matching files */
5564 r = safe_glob(fn, 0, &pglob);
5565 if (r < 0) {
5566 if (ignore)
5567 continue;
5568 return r;
5569 }
5570
5571 /* When we don't match anything, -ENOENT should be returned */
5572 assert(pglob.gl_pathc > 0);
5573
5574 for (size_t n = 0; n < pglob.gl_pathc; n++) {
5575 _cleanup_strv_free_ char **p = NULL;
5576
5577 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5578 if (r < 0) {
5579 if (ignore)
5580 continue;
5581 return r;
5582 }
5583
5584 /* Log invalid environment variables with filename */
5585 if (p) {
5586 InvalidEnvInfo info = {
5587 .unit = unit,
5588 .path = pglob.gl_pathv[n]
5589 };
5590
5591 p = strv_env_clean_with_callback(p, invalid_env, &info);
5592 }
5593
5594 if (!v)
5595 v = TAKE_PTR(p);
5596 else {
5597 char **m = strv_env_merge(v, p);
5598 if (!m)
5599 return -ENOMEM;
5600
5601 strv_free_and_replace(v, m);
5602 }
5603 }
5604 }
5605
5606 *ret = TAKE_PTR(v);
5607
5608 return 0;
5609 }
5610
5611 static bool tty_may_match_dev_console(const char *tty) {
5612 _cleanup_free_ char *resolved = NULL;
5613
5614 if (!tty)
5615 return true;
5616
5617 tty = skip_dev_prefix(tty);
5618
5619 /* trivial identity? */
5620 if (streq(tty, "console"))
5621 return true;
5622
5623 if (resolve_dev_console(&resolved) < 0)
5624 return true; /* if we could not resolve, assume it may */
5625
5626 /* "tty0" means the active VC, so it may be the same sometimes */
5627 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5628 }
5629
5630 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5631 assert(ec);
5632
5633 return ec->tty_reset ||
5634 ec->tty_vhangup ||
5635 ec->tty_vt_disallocate ||
5636 is_terminal_input(ec->std_input) ||
5637 is_terminal_output(ec->std_output) ||
5638 is_terminal_output(ec->std_error);
5639 }
5640
5641 bool exec_context_may_touch_console(const ExecContext *ec) {
5642
5643 return exec_context_may_touch_tty(ec) &&
5644 tty_may_match_dev_console(exec_context_tty_path(ec));
5645 }
5646
5647 static void strv_fprintf(FILE *f, char **l) {
5648 assert(f);
5649
5650 STRV_FOREACH(g, l)
5651 fprintf(f, " %s", *g);
5652 }
5653
/* Writes one line of the form "<prefix><name>: item1 item2 ..." to 'f'. Nothing at all is written if the
 * string list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5665
5666 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5667 int r;
5668
5669 assert(c);
5670 assert(f);
5671
5672 prefix = strempty(prefix);
5673
5674 fprintf(f,
5675 "%sUMask: %04o\n"
5676 "%sWorkingDirectory: %s\n"
5677 "%sRootDirectory: %s\n"
5678 "%sRootEphemeral: %s\n"
5679 "%sNonBlocking: %s\n"
5680 "%sPrivateTmp: %s\n"
5681 "%sPrivateDevices: %s\n"
5682 "%sProtectKernelTunables: %s\n"
5683 "%sProtectKernelModules: %s\n"
5684 "%sProtectKernelLogs: %s\n"
5685 "%sProtectClock: %s\n"
5686 "%sProtectControlGroups: %s\n"
5687 "%sPrivateNetwork: %s\n"
5688 "%sPrivateUsers: %s\n"
5689 "%sProtectHome: %s\n"
5690 "%sProtectSystem: %s\n"
5691 "%sMountAPIVFS: %s\n"
5692 "%sIgnoreSIGPIPE: %s\n"
5693 "%sMemoryDenyWriteExecute: %s\n"
5694 "%sRestrictRealtime: %s\n"
5695 "%sRestrictSUIDSGID: %s\n"
5696 "%sKeyringMode: %s\n"
5697 "%sProtectHostname: %s\n"
5698 "%sProtectProc: %s\n"
5699 "%sProcSubset: %s\n",
5700 prefix, c->umask,
5701 prefix, empty_to_root(c->working_directory),
5702 prefix, empty_to_root(c->root_directory),
5703 prefix, yes_no(c->root_ephemeral),
5704 prefix, yes_no(c->non_blocking),
5705 prefix, yes_no(c->private_tmp),
5706 prefix, yes_no(c->private_devices),
5707 prefix, yes_no(c->protect_kernel_tunables),
5708 prefix, yes_no(c->protect_kernel_modules),
5709 prefix, yes_no(c->protect_kernel_logs),
5710 prefix, yes_no(c->protect_clock),
5711 prefix, yes_no(c->protect_control_groups),
5712 prefix, yes_no(c->private_network),
5713 prefix, yes_no(c->private_users),
5714 prefix, protect_home_to_string(c->protect_home),
5715 prefix, protect_system_to_string(c->protect_system),
5716 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5717 prefix, yes_no(c->ignore_sigpipe),
5718 prefix, yes_no(c->memory_deny_write_execute),
5719 prefix, yes_no(c->restrict_realtime),
5720 prefix, yes_no(c->restrict_suid_sgid),
5721 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5722 prefix, yes_no(c->protect_hostname),
5723 prefix, protect_proc_to_string(c->protect_proc),
5724 prefix, proc_subset_to_string(c->proc_subset));
5725
5726 if (c->root_image)
5727 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5728
5729 if (c->root_image_options) {
5730 fprintf(f, "%sRootImageOptions:", prefix);
5731 LIST_FOREACH(mount_options, o, c->root_image_options)
5732 if (!isempty(o->options))
5733 fprintf(f, " %s:%s",
5734 partition_designator_to_string(o->partition_designator),
5735 o->options);
5736 fprintf(f, "\n");
5737 }
5738
5739 if (c->root_hash) {
5740 _cleanup_free_ char *encoded = NULL;
5741 encoded = hexmem(c->root_hash, c->root_hash_size);
5742 if (encoded)
5743 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5744 }
5745
5746 if (c->root_hash_path)
5747 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5748
5749 if (c->root_hash_sig) {
5750 _cleanup_free_ char *encoded = NULL;
5751 ssize_t len;
5752 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5753 if (len)
5754 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5755 }
5756
5757 if (c->root_hash_sig_path)
5758 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5759
5760 if (c->root_verity)
5761 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5762
5763 STRV_FOREACH(e, c->environment)
5764 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5765
5766 STRV_FOREACH(e, c->environment_files)
5767 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5768
5769 STRV_FOREACH(e, c->pass_environment)
5770 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5771
5772 STRV_FOREACH(e, c->unset_environment)
5773 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5774
5775 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5776
5777 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5778 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5779
5780 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5781 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5782
5783 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5784 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5785 }
5786 }
5787
5788 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
5789
5790 if (c->nice_set)
5791 fprintf(f, "%sNice: %i\n", prefix, c->nice);
5792
5793 if (c->oom_score_adjust_set)
5794 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
5795
5796 if (c->coredump_filter_set)
5797 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
5798
5799 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
5800 if (c->rlimit[i]) {
5801 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
5802 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
5803 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
5804 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5805 }
5806
5807 if (c->ioprio_set) {
5808 _cleanup_free_ char *class_str = NULL;
5809
5810 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
5811 if (r >= 0)
5812 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5813
5814 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
5815 }
5816
5817 if (c->cpu_sched_set) {
5818 _cleanup_free_ char *policy_str = NULL;
5819
5820 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5821 if (r >= 0)
5822 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5823
5824 fprintf(f,
5825 "%sCPUSchedulingPriority: %i\n"
5826 "%sCPUSchedulingResetOnFork: %s\n",
5827 prefix, c->cpu_sched_priority,
5828 prefix, yes_no(c->cpu_sched_reset_on_fork));
5829 }
5830
5831 if (c->cpu_set.set) {
5832 _cleanup_free_ char *affinity = NULL;
5833
5834 affinity = cpu_set_to_range_string(&c->cpu_set);
5835 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
5836 }
5837
5838 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5839 _cleanup_free_ char *nodes = NULL;
5840
5841 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5842 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5843 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5844 }
5845
5846 if (c->timer_slack_nsec != NSEC_INFINITY)
5847 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
5848
5849 fprintf(f,
5850 "%sStandardInput: %s\n"
5851 "%sStandardOutput: %s\n"
5852 "%sStandardError: %s\n",
5853 prefix, exec_input_to_string(c->std_input),
5854 prefix, exec_output_to_string(c->std_output),
5855 prefix, exec_output_to_string(c->std_error));
5856
5857 if (c->std_input == EXEC_INPUT_NAMED_FD)
5858 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5859 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5860 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5861 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5862 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5863
5864 if (c->std_input == EXEC_INPUT_FILE)
5865 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5866 if (c->std_output == EXEC_OUTPUT_FILE)
5867 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5868 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5869 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5870 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5871 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
5872 if (c->std_error == EXEC_OUTPUT_FILE)
5873 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5874 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5875 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5876 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5877 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
5878
5879 if (c->tty_path)
5880 fprintf(f,
5881 "%sTTYPath: %s\n"
5882 "%sTTYReset: %s\n"
5883 "%sTTYVHangup: %s\n"
5884 "%sTTYVTDisallocate: %s\n"
5885 "%sTTYRows: %u\n"
5886 "%sTTYColumns: %u\n",
5887 prefix, c->tty_path,
5888 prefix, yes_no(c->tty_reset),
5889 prefix, yes_no(c->tty_vhangup),
5890 prefix, yes_no(c->tty_vt_disallocate),
5891 prefix, c->tty_rows,
5892 prefix, c->tty_cols);
5893
5894 if (IN_SET(c->std_output,
5895 EXEC_OUTPUT_KMSG,
5896 EXEC_OUTPUT_JOURNAL,
5897 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5898 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5899 IN_SET(c->std_error,
5900 EXEC_OUTPUT_KMSG,
5901 EXEC_OUTPUT_JOURNAL,
5902 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5903 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
5904
5905 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
5906
5907 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5908 if (r >= 0)
5909 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
5910
5911 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5912 if (r >= 0)
5913 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
5914 }
5915
5916 if (c->log_level_max >= 0) {
5917 _cleanup_free_ char *t = NULL;
5918
5919 (void) log_level_to_string_alloc(c->log_level_max, &t);
5920
5921 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5922 }
5923
5924 if (c->log_ratelimit_interval_usec > 0)
5925 fprintf(f,
5926 "%sLogRateLimitIntervalSec: %s\n",
5927 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
5928
5929 if (c->log_ratelimit_burst > 0)
5930 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
5931
5932 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5933 fprintf(f, "%sLogFilterPatterns:", prefix);
5934
5935 char *pattern;
5936 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5937 fprintf(f, " %s", pattern);
5938 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5939 fprintf(f, " ~%s", pattern);
5940 fputc('\n', f);
5941 }
5942
5943 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5944 fprintf(f, "%sLogExtraFields: ", prefix);
5945 fwrite(c->log_extra_fields[j].iov_base,
5946 1, c->log_extra_fields[j].iov_len,
5947 f);
5948 fputc('\n', f);
5949 }
5950
5951 if (c->log_namespace)
5952 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5953
5954 if (c->secure_bits) {
5955 _cleanup_free_ char *str = NULL;
5956
5957 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5958 if (r >= 0)
5959 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5960 }
5961
5962 if (c->capability_bounding_set != CAP_MASK_UNSET) {
5963 _cleanup_free_ char *str = NULL;
5964
5965 r = capability_set_to_string(c->capability_bounding_set, &str);
5966 if (r >= 0)
5967 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
5968 }
5969
5970 if (c->capability_ambient_set != 0) {
5971 _cleanup_free_ char *str = NULL;
5972
5973 r = capability_set_to_string(c->capability_ambient_set, &str);
5974 if (r >= 0)
5975 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
5976 }
5977
5978 if (c->user)
5979 fprintf(f, "%sUser: %s\n", prefix, c->user);
5980 if (c->group)
5981 fprintf(f, "%sGroup: %s\n", prefix, c->group);
5982
5983 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5984
5985 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
5986
5987 if (c->pam_name)
5988 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5989
5990 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5991 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5992 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5993 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5994 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
5995 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
5996
5997 for (size_t i = 0; i < c->n_bind_mounts; i++)
5998 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5999 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6000 c->bind_mounts[i].ignore_enoent ? "-": "",
6001 c->bind_mounts[i].source,
6002 c->bind_mounts[i].destination,
6003 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6004
6005 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6006 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6007
6008 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6009 t->path,
6010 isempty(t->options) ? "" : ":",
6011 strempty(t->options));
6012 }
6013
6014 if (c->utmp_id)
6015 fprintf(f,
6016 "%sUtmpIdentifier: %s\n",
6017 prefix, c->utmp_id);
6018
6019 if (c->selinux_context)
6020 fprintf(f,
6021 "%sSELinuxContext: %s%s\n",
6022 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6023
6024 if (c->apparmor_profile)
6025 fprintf(f,
6026 "%sAppArmorProfile: %s%s\n",
6027 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6028
6029 if (c->smack_process_label)
6030 fprintf(f,
6031 "%sSmackProcessLabel: %s%s\n",
6032 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6033
6034 if (c->personality != PERSONALITY_INVALID)
6035 fprintf(f,
6036 "%sPersonality: %s\n",
6037 prefix, strna(personality_to_string(c->personality)));
6038
6039 fprintf(f,
6040 "%sLockPersonality: %s\n",
6041 prefix, yes_no(c->lock_personality));
6042
6043 if (c->syscall_filter) {
6044 fprintf(f,
6045 "%sSystemCallFilter: ",
6046 prefix);
6047
6048 if (!c->syscall_allow_list)
6049 fputc('~', f);
6050
6051 #if HAVE_SECCOMP
6052 void *id, *val;
6053 bool first = true;
6054 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6055 _cleanup_free_ char *name = NULL;
6056 const char *errno_name = NULL;
6057 int num = PTR_TO_INT(val);
6058
6059 if (first)
6060 first = false;
6061 else
6062 fputc(' ', f);
6063
6064 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6065 fputs(strna(name), f);
6066
6067 if (num >= 0) {
6068 errno_name = seccomp_errno_or_action_to_string(num);
6069 if (errno_name)
6070 fprintf(f, ":%s", errno_name);
6071 else
6072 fprintf(f, ":%d", num);
6073 }
6074 }
6075 #endif
6076
6077 fputc('\n', f);
6078 }
6079
6080 if (c->syscall_archs) {
6081 fprintf(f,
6082 "%sSystemCallArchitectures:",
6083 prefix);
6084
6085 #if HAVE_SECCOMP
6086 void *id;
6087 SET_FOREACH(id, c->syscall_archs)
6088 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6089 #endif
6090 fputc('\n', f);
6091 }
6092
6093 if (exec_context_restrict_namespaces_set(c)) {
6094 _cleanup_free_ char *s = NULL;
6095
6096 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6097 if (r >= 0)
6098 fprintf(f, "%sRestrictNamespaces: %s\n",
6099 prefix, strna(s));
6100 }
6101
6102 #if HAVE_LIBBPF
6103 if (exec_context_restrict_filesystems_set(c)) {
6104 char *fs;
6105 SET_FOREACH(fs, c->restrict_filesystems)
6106 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6107 }
6108 #endif
6109
6110 if (c->network_namespace_path)
6111 fprintf(f,
6112 "%sNetworkNamespacePath: %s\n",
6113 prefix, c->network_namespace_path);
6114
6115 if (c->syscall_errno > 0) {
6116 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6117
6118 #if HAVE_SECCOMP
6119 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6120 if (errno_name)
6121 fputs(errno_name, f);
6122 else
6123 fprintf(f, "%d", c->syscall_errno);
6124 #endif
6125 fputc('\n', f);
6126 }
6127
6128 for (size_t i = 0; i < c->n_mount_images; i++) {
6129 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6130 c->mount_images[i].ignore_enoent ? "-": "",
6131 c->mount_images[i].source,
6132 c->mount_images[i].destination);
6133 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6134 fprintf(f, ":%s:%s",
6135 partition_designator_to_string(o->partition_designator),
6136 strempty(o->options));
6137 fprintf(f, "\n");
6138 }
6139
6140 for (size_t i = 0; i < c->n_extension_images; i++) {
6141 fprintf(f, "%sExtensionImages: %s%s", prefix,
6142 c->extension_images[i].ignore_enoent ? "-": "",
6143 c->extension_images[i].source);
6144 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6145 fprintf(f, ":%s:%s",
6146 partition_designator_to_string(o->partition_designator),
6147 strempty(o->options));
6148 fprintf(f, "\n");
6149 }
6150
6151 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6152 }
6153
6154 bool exec_context_maintains_privileges(const ExecContext *c) {
6155 assert(c);
6156
6157 /* Returns true if the process forked off would run under
6158 * an unchanged UID or as root. */
6159
6160 if (!c->user)
6161 return true;
6162
6163 if (streq(c->user, "root") || streq(c->user, "0"))
6164 return true;
6165
6166 return false;
6167 }
6168
6169 int exec_context_get_effective_ioprio(const ExecContext *c) {
6170 int p;
6171
6172 assert(c);
6173
6174 if (c->ioprio_set)
6175 return c->ioprio;
6176
6177 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6178 if (p < 0)
6179 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6180
6181 return ioprio_normalize(p);
6182 }
6183
6184 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6185 assert(c);
6186
6187 /* Explicit setting wins */
6188 if (c->mount_apivfs_set)
6189 return c->mount_apivfs;
6190
6191 /* Default to "yes" if root directory or image are specified */
6192 if (exec_context_with_rootfs(c))
6193 return true;
6194
6195 return false;
6196 }
6197
6198 void exec_context_free_log_extra_fields(ExecContext *c) {
6199 assert(c);
6200
6201 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6202 free(c->log_extra_fields[l].iov_base);
6203 c->log_extra_fields = mfree(c->log_extra_fields);
6204 c->n_log_extra_fields = 0;
6205 }
6206
6207 void exec_context_revert_tty(ExecContext *c) {
6208 _cleanup_close_ int fd = -EBADF;
6209 const char *path;
6210 struct stat st;
6211 int r;
6212
6213 assert(c);
6214
6215 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6216 exec_context_tty_reset(c, NULL);
6217
6218 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6219 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6220 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6221 if (!exec_context_may_touch_tty(c))
6222 return;
6223
6224 path = exec_context_tty_path(c);
6225 if (!path)
6226 return;
6227
6228 fd = open(path, O_PATH|O_CLOEXEC);
6229 if (fd < 0)
6230 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6231 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6232 path);
6233
6234 if (fstat(fd, &st) < 0)
6235 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6236
6237 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6238 * if things are a character device, since a proper check either means we'd have to open the TTY and
6239 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6240 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6241 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6242 if (!S_ISCHR(st.st_mode))
6243 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6244
6245 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6246 if (r < 0)
6247 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6248 }
6249
6250 int exec_context_get_clean_directories(
6251 ExecContext *c,
6252 char **prefix,
6253 ExecCleanMask mask,
6254 char ***ret) {
6255
6256 _cleanup_strv_free_ char **l = NULL;
6257 int r;
6258
6259 assert(c);
6260 assert(prefix);
6261 assert(ret);
6262
6263 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6264 if (!FLAGS_SET(mask, 1U << t))
6265 continue;
6266
6267 if (!prefix[t])
6268 continue;
6269
6270 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6271 char *j;
6272
6273 j = path_join(prefix[t], c->directories[t].items[i].path);
6274 if (!j)
6275 return -ENOMEM;
6276
6277 r = strv_consume(&l, j);
6278 if (r < 0)
6279 return r;
6280
6281 /* Also remove private directories unconditionally. */
6282 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6283 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6284 if (!j)
6285 return -ENOMEM;
6286
6287 r = strv_consume(&l, j);
6288 if (r < 0)
6289 return r;
6290 }
6291
6292 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6293 j = path_join(prefix[t], *symlink);
6294 if (!j)
6295 return -ENOMEM;
6296
6297 r = strv_consume(&l, j);
6298 if (r < 0)
6299 return r;
6300 }
6301 }
6302 }
6303
6304 *ret = TAKE_PTR(l);
6305 return 0;
6306 }
6307
6308 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6309 ExecCleanMask mask = 0;
6310
6311 assert(c);
6312 assert(ret);
6313
6314 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6315 if (c->directories[t].n_items > 0)
6316 mask |= 1U << t;
6317
6318 *ret = mask;
6319 return 0;
6320 }
6321
6322 void exec_status_start(ExecStatus *s, pid_t pid) {
6323 assert(s);
6324
6325 *s = (ExecStatus) {
6326 .pid = pid,
6327 };
6328
6329 dual_timestamp_get(&s->start_timestamp);
6330 }
6331
6332 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6333 assert(s);
6334
6335 if (s->pid != pid)
6336 *s = (ExecStatus) {
6337 .pid = pid,
6338 };
6339
6340 dual_timestamp_get(&s->exit_timestamp);
6341
6342 s->code = code;
6343 s->status = status;
6344
6345 if (context && context->utmp_id)
6346 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6347 }
6348
6349 void exec_status_reset(ExecStatus *s) {
6350 assert(s);
6351
6352 *s = (ExecStatus) {};
6353 }
6354
6355 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6356 assert(s);
6357 assert(f);
6358
6359 if (s->pid <= 0)
6360 return;
6361
6362 prefix = strempty(prefix);
6363
6364 fprintf(f,
6365 "%sPID: "PID_FMT"\n",
6366 prefix, s->pid);
6367
6368 if (dual_timestamp_is_set(&s->start_timestamp))
6369 fprintf(f,
6370 "%sStart Timestamp: %s\n",
6371 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6372
6373 if (dual_timestamp_is_set(&s->exit_timestamp))
6374 fprintf(f,
6375 "%sExit Timestamp: %s\n"
6376 "%sExit Code: %s\n"
6377 "%sExit Status: %i\n",
6378 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6379 prefix, sigchld_code_to_string(s->code),
6380 prefix, s->status);
6381 }
6382
6383 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6384 _cleanup_free_ char *cmd = NULL;
6385 const char *prefix2;
6386
6387 assert(c);
6388 assert(f);
6389
6390 prefix = strempty(prefix);
6391 prefix2 = strjoina(prefix, "\t");
6392
6393 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6394
6395 fprintf(f,
6396 "%sCommand Line: %s\n",
6397 prefix, strnull(cmd));
6398
6399 exec_status_dump(&c->exec_status, f, prefix2);
6400 }
6401
6402 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6403 assert(f);
6404
6405 prefix = strempty(prefix);
6406
6407 LIST_FOREACH(command, i, c)
6408 exec_command_dump(i, f, prefix);
6409 }
6410
6411 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6412 ExecCommand *end;
6413
6414 assert(l);
6415 assert(e);
6416
6417 if (*l) {
6418 /* It's kind of important, that we keep the order here */
6419 end = LIST_FIND_TAIL(command, *l);
6420 LIST_INSERT_AFTER(command, *l, end, e);
6421 } else
6422 *l = e;
6423 }
6424
6425 int exec_command_set(ExecCommand *c, const char *path, ...) {
6426 va_list ap;
6427 char **l, *p;
6428
6429 assert(c);
6430 assert(path);
6431
6432 va_start(ap, path);
6433 l = strv_new_ap(path, ap);
6434 va_end(ap);
6435
6436 if (!l)
6437 return -ENOMEM;
6438
6439 p = strdup(path);
6440 if (!p) {
6441 strv_free(l);
6442 return -ENOMEM;
6443 }
6444
6445 free_and_replace(c->path, p);
6446
6447 return strv_free_and_replace(c->argv, l);
6448 }
6449
6450 int exec_command_append(ExecCommand *c, const char *path, ...) {
6451 _cleanup_strv_free_ char **l = NULL;
6452 va_list ap;
6453 int r;
6454
6455 assert(c);
6456 assert(path);
6457
6458 va_start(ap, path);
6459 l = strv_new_ap(path, ap);
6460 va_end(ap);
6461
6462 if (!l)
6463 return -ENOMEM;
6464
6465 r = strv_extend_strv(&c->argv, l, false);
6466 if (r < 0)
6467 return r;
6468
6469 return 0;
6470 }
6471
6472 static char *destroy_tree(char *path) {
6473 if (!path)
6474 return NULL;
6475
6476 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6477 log_debug("Spawning process to nuke '%s'", path);
6478
6479 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6480 }
6481
6482 return mfree(path);
6483 }
6484
6485 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
6486 if (!rt)
6487 return NULL;
6488
6489 if (rt->manager)
6490 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
6491
6492 rt->id = mfree(rt->id);
6493 rt->tmp_dir = mfree(rt->tmp_dir);
6494 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6495 safe_close_pair(rt->netns_storage_socket);
6496 safe_close_pair(rt->ipcns_storage_socket);
6497 return mfree(rt);
6498 }
6499
6500 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6501 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6502
6503 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
6504 if (!rt)
6505 return NULL;
6506
6507 assert(rt->n_ref > 0);
6508 rt->n_ref--;
6509
6510 if (rt->n_ref > 0)
6511 return NULL;
6512
6513 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6514 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
6515
6516 return exec_shared_runtime_free(rt);
6517 }
6518
6519 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
6520 _cleanup_free_ char *id_copy = NULL;
6521 ExecSharedRuntime *n;
6522
6523 assert(ret);
6524
6525 id_copy = strdup(id);
6526 if (!id_copy)
6527 return -ENOMEM;
6528
6529 n = new(ExecSharedRuntime, 1);
6530 if (!n)
6531 return -ENOMEM;
6532
6533 *n = (ExecSharedRuntime) {
6534 .id = TAKE_PTR(id_copy),
6535 .netns_storage_socket = PIPE_EBADF,
6536 .ipcns_storage_socket = PIPE_EBADF,
6537 };
6538
6539 *ret = n;
6540 return 0;
6541 }
6542
6543 static int exec_shared_runtime_add(
6544 Manager *m,
6545 const char *id,
6546 char **tmp_dir,
6547 char **var_tmp_dir,
6548 int netns_storage_socket[2],
6549 int ipcns_storage_socket[2],
6550 ExecSharedRuntime **ret) {
6551
6552 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
6553 int r;
6554
6555 assert(m);
6556 assert(id);
6557
6558 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6559
6560 r = exec_shared_runtime_allocate(&rt, id);
6561 if (r < 0)
6562 return r;
6563
6564 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
6565 if (r < 0)
6566 return r;
6567
6568 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6569 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6570 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6571
6572 if (netns_storage_socket) {
6573 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6574 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6575 }
6576
6577 if (ipcns_storage_socket) {
6578 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6579 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6580 }
6581
6582 rt->manager = m;
6583
6584 if (ret)
6585 *ret = rt;
6586 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
6587 TAKE_PTR(rt);
6588 return 0;
6589 }
6590
6591 static int exec_shared_runtime_make(
6592 Manager *m,
6593 const ExecContext *c,
6594 const char *id,
6595 ExecSharedRuntime **ret) {
6596
6597 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6598 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6599 int r;
6600
6601 assert(m);
6602 assert(c);
6603 assert(id);
6604
6605 /* It is not necessary to create ExecSharedRuntime object. */
6606 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
6607 *ret = NULL;
6608 return 0;
6609 }
6610
6611 if (c->private_tmp &&
6612 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6613 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6614 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6615 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6616 if (r < 0)
6617 return r;
6618 }
6619
6620 if (exec_needs_network_namespace(c)) {
6621 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6622 return -errno;
6623 }
6624
6625 if (exec_needs_ipc_namespace(c)) {
6626 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6627 return -errno;
6628 }
6629
6630 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6631 if (r < 0)
6632 return r;
6633
6634 return 1;
6635 }
6636
6637 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6638 ExecSharedRuntime *rt;
6639 int r;
6640
6641 assert(m);
6642 assert(id);
6643 assert(ret);
6644
6645 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
6646 if (rt)
6647 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
6648 goto ref;
6649
6650 if (!create) {
6651 *ret = NULL;
6652 return 0;
6653 }
6654
6655 /* If not found, then create a new object. */
6656 r = exec_shared_runtime_make(m, c, id, &rt);
6657 if (r < 0)
6658 return r;
6659 if (r == 0) {
6660 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
6661 *ret = NULL;
6662 return 0;
6663 }
6664
6665 ref:
6666 /* increment reference counter. */
6667 rt->n_ref++;
6668 *ret = rt;
6669 return 1;
6670 }
6671
6672 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6673 ExecSharedRuntime *rt;
6674
6675 assert(m);
6676 assert(f);
6677 assert(fds);
6678
6679 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6680 fprintf(f, "exec-runtime=%s", rt->id);
6681
6682 if (rt->tmp_dir)
6683 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6684
6685 if (rt->var_tmp_dir)
6686 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6687
6688 if (rt->netns_storage_socket[0] >= 0) {
6689 int copy;
6690
6691 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6692 if (copy < 0)
6693 return copy;
6694
6695 fprintf(f, " netns-socket-0=%i", copy);
6696 }
6697
6698 if (rt->netns_storage_socket[1] >= 0) {
6699 int copy;
6700
6701 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6702 if (copy < 0)
6703 return copy;
6704
6705 fprintf(f, " netns-socket-1=%i", copy);
6706 }
6707
6708 if (rt->ipcns_storage_socket[0] >= 0) {
6709 int copy;
6710
6711 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6712 if (copy < 0)
6713 return copy;
6714
6715 fprintf(f, " ipcns-socket-0=%i", copy);
6716 }
6717
6718 if (rt->ipcns_storage_socket[1] >= 0) {
6719 int copy;
6720
6721 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6722 if (copy < 0)
6723 return copy;
6724
6725 fprintf(f, " ipcns-socket-1=%i", copy);
6726 }
6727
6728 fputc('\n', f);
6729 }
6730
6731 return 0;
6732 }
6733
6734 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6735 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6736 ExecSharedRuntime *rt;
6737 int r;
6738
6739 /* This is for the migration from old (v237 or earlier) deserialization text.
6740 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6741 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
6742 * so or not from the serialized text, then we always creates a new object owned by this. */
6743
6744 assert(u);
6745 assert(key);
6746 assert(value);
6747
6748 /* Manager manages ExecSharedRuntime objects by the unit id.
6749 * So, we omit the serialized text when the unit does not have id (yet?)... */
6750 if (isempty(u->id)) {
6751 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6752 return 0;
6753 }
6754
6755 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6756 return log_oom();
6757
6758 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6759 if (!rt) {
6760 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
6761 return log_oom();
6762
6763 rt = rt_create;
6764 }
6765
6766 if (streq(key, "tmp-dir")) {
6767 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6768 return -ENOMEM;
6769
6770 } else if (streq(key, "var-tmp-dir")) {
6771 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6772 return -ENOMEM;
6773
6774 } else if (streq(key, "netns-socket-0")) {
6775 int fd;
6776
6777 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6778 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6779 return 0;
6780 }
6781
6782 safe_close(rt->netns_storage_socket[0]);
6783 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6784
6785 } else if (streq(key, "netns-socket-1")) {
6786 int fd;
6787
6788 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
6789 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
6790 return 0;
6791 }
6792
6793 safe_close(rt->netns_storage_socket[1]);
6794 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
6795
6796 } else
6797 return 0;
6798
6799 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
6800 if (rt_create) {
6801 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
6802 if (r < 0) {
6803 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
6804 return 0;
6805 }
6806
6807 rt_create->manager = u->manager;
6808
6809 /* Avoid cleanup */
6810 TAKE_PTR(rt_create);
6811 }
6812
6813 return 1;
6814 }
6815
6816 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6817 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6818 char *id = NULL;
6819 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
6820 const char *p, *v = ASSERT_PTR(value);
6821 size_t n;
6822
6823 assert(m);
6824 assert(fds);
6825
6826 n = strcspn(v, " ");
6827 id = strndupa_safe(v, n);
6828 if (v[n] != ' ')
6829 goto finalize;
6830 p = v + n + 1;
6831
6832 v = startswith(p, "tmp-dir=");
6833 if (v) {
6834 n = strcspn(v, " ");
6835 tmp_dir = strndup(v, n);
6836 if (!tmp_dir)
6837 return log_oom();
6838 if (v[n] != ' ')
6839 goto finalize;
6840 p = v + n + 1;
6841 }
6842
6843 v = startswith(p, "var-tmp-dir=");
6844 if (v) {
6845 n = strcspn(v, " ");
6846 var_tmp_dir = strndup(v, n);
6847 if (!var_tmp_dir)
6848 return log_oom();
6849 if (v[n] != ' ')
6850 goto finalize;
6851 p = v + n + 1;
6852 }
6853
6854 v = startswith(p, "netns-socket-0=");
6855 if (v) {
6856 char *buf;
6857
6858 n = strcspn(v, " ");
6859 buf = strndupa_safe(v, n);
6860
6861 netns_fdpair[0] = parse_fd(buf);
6862 if (netns_fdpair[0] < 0)
6863 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
6864 if (!fdset_contains(fds, netns_fdpair[0]))
6865 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6866 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6867 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
6868 if (v[n] != ' ')
6869 goto finalize;
6870 p = v + n + 1;
6871 }
6872
6873 v = startswith(p, "netns-socket-1=");
6874 if (v) {
6875 char *buf;
6876
6877 n = strcspn(v, " ");
6878 buf = strndupa_safe(v, n);
6879
6880 netns_fdpair[1] = parse_fd(buf);
6881 if (netns_fdpair[1] < 0)
6882 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
6883 if (!fdset_contains(fds, netns_fdpair[1]))
6884 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6885 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6886 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6887 if (v[n] != ' ')
6888 goto finalize;
6889 p = v + n + 1;
6890 }
6891
6892 v = startswith(p, "ipcns-socket-0=");
6893 if (v) {
6894 char *buf;
6895
6896 n = strcspn(v, " ");
6897 buf = strndupa_safe(v, n);
6898
6899 ipcns_fdpair[0] = parse_fd(buf);
6900 if (ipcns_fdpair[0] < 0)
6901 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6902 if (!fdset_contains(fds, ipcns_fdpair[0]))
6903 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6904 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6905 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6906 if (v[n] != ' ')
6907 goto finalize;
6908 p = v + n + 1;
6909 }
6910
6911 v = startswith(p, "ipcns-socket-1=");
6912 if (v) {
6913 char *buf;
6914
6915 n = strcspn(v, " ");
6916 buf = strndupa_safe(v, n);
6917
6918 ipcns_fdpair[1] = parse_fd(buf);
6919 if (ipcns_fdpair[1] < 0)
6920 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6921 if (!fdset_contains(fds, ipcns_fdpair[1]))
6922 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6923 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6924 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
6925 }
6926
6927 finalize:
6928 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
6929 if (r < 0)
6930 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6931 return 0;
6932 }
6933
6934 void exec_shared_runtime_vacuum(Manager *m) {
6935 ExecSharedRuntime *rt;
6936
6937 assert(m);
6938
6939 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
6940
6941 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
6942 if (rt->n_ref > 0)
6943 continue;
6944
6945 (void) exec_shared_runtime_free(rt);
6946 }
6947 }
6948
6949 int exec_runtime_make(
6950 const Unit *unit,
6951 const ExecContext *context,
6952 ExecSharedRuntime *shared,
6953 DynamicCreds *creds,
6954 ExecRuntime **ret) {
6955 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6956 _cleanup_free_ char *ephemeral = NULL;
6957 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6958 int r;
6959
6960 assert(unit);
6961 assert(context);
6962 assert(ret);
6963
6964 if (!shared && !creds && !exec_needs_ephemeral(context)) {
6965 *ret = NULL;
6966 return 0;
6967 }
6968
6969 if (exec_needs_ephemeral(context)) {
6970 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
6971 if (r < 0)
6972 return r;
6973
6974 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
6975 if (r < 0)
6976 return r;
6977
6978 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
6979 return -errno;
6980 }
6981
6982 rt = new(ExecRuntime, 1);
6983 if (!rt)
6984 return -ENOMEM;
6985
6986 *rt = (ExecRuntime) {
6987 .shared = shared,
6988 .dynamic_creds = creds,
6989 .ephemeral_copy = TAKE_PTR(ephemeral),
6990 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
6991 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
6992 };
6993
6994 *ret = TAKE_PTR(rt);
6995 return 1;
6996 }
6997
6998 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
6999 if (!rt)
7000 return NULL;
7001
7002 exec_shared_runtime_unref(rt->shared);
7003 dynamic_creds_unref(rt->dynamic_creds);
7004
7005 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7006
7007 safe_close_pair(rt->ephemeral_storage_socket);
7008 return mfree(rt);
7009 }
7010
7011 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7012 if (!rt)
7013 return NULL;
7014
7015 rt->shared = exec_shared_runtime_destroy(rt->shared);
7016 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7017 return exec_runtime_free(rt);
7018 }
7019
7020 void exec_params_clear(ExecParameters *p) {
7021 if (!p)
7022 return;
7023
7024 p->environment = strv_free(p->environment);
7025 p->fd_names = strv_free(p->fd_names);
7026 p->fds = mfree(p->fds);
7027 p->exec_fd = safe_close(p->exec_fd);
7028 }
7029
7030 void exec_directory_done(ExecDirectory *d) {
7031 if (!d)
7032 return;
7033
7034 for (size_t i = 0; i < d->n_items; i++) {
7035 free(d->items[i].path);
7036 strv_free(d->items[i].symlinks);
7037 }
7038
7039 d->items = mfree(d->items);
7040 d->n_items = 0;
7041 d->mode = 0755;
7042 }
7043
7044 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7045 assert(d);
7046 assert(path);
7047
7048 for (size_t i = 0; i < d->n_items; i++)
7049 if (path_equal(d->items[i].path, path))
7050 return &d->items[i];
7051
7052 return NULL;
7053 }
7054
7055 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7056 _cleanup_strv_free_ char **s = NULL;
7057 _cleanup_free_ char *p = NULL;
7058 ExecDirectoryItem *existing;
7059 int r;
7060
7061 assert(d);
7062 assert(path);
7063
7064 existing = exec_directory_find(d, path);
7065 if (existing) {
7066 r = strv_extend(&existing->symlinks, symlink);
7067 if (r < 0)
7068 return r;
7069
7070 return 0; /* existing item is updated */
7071 }
7072
7073 p = strdup(path);
7074 if (!p)
7075 return -ENOMEM;
7076
7077 if (symlink) {
7078 s = strv_new(symlink);
7079 if (!s)
7080 return -ENOMEM;
7081 }
7082
7083 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7084 return -ENOMEM;
7085
7086 d->items[d->n_items++] = (ExecDirectoryItem) {
7087 .path = TAKE_PTR(p),
7088 .symlinks = TAKE_PTR(s),
7089 };
7090
7091 return 1; /* new item is added */
7092 }
7093
7094 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7095 assert(a);
7096 assert(b);
7097
7098 return path_compare(a->path, b->path);
7099 }
7100
7101 void exec_directory_sort(ExecDirectory *d) {
7102 assert(d);
7103
7104 /* Sort the exec directories to make always parent directories processed at first in
7105 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7106 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7107 * list. See also comments in setup_exec_directory() and issue #24783. */
7108
7109 if (d->n_items <= 1)
7110 return;
7111
7112 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7113
7114 for (size_t i = 1; i < d->n_items; i++)
7115 for (size_t j = 0; j < i; j++)
7116 if (path_startswith(d->items[i].path, d->items[j].path)) {
7117 d->items[i].only_create = true;
7118 break;
7119 }
7120 }
7121
7122 ExecCleanMask exec_clean_mask_from_string(const char *s) {
7123 ExecDirectoryType t;
7124
7125 assert(s);
7126
7127 if (streq(s, "all"))
7128 return EXEC_CLEAN_ALL;
7129 if (streq(s, "fdstore"))
7130 return EXEC_CLEAN_FDSTORE;
7131
7132 t = exec_resource_type_from_string(s);
7133 if (t < 0)
7134 return (ExecCleanMask) t;
7135
7136 return 1U << t;
7137 }
7138
7139 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7140 [EXEC_INPUT_NULL] = "null",
7141 [EXEC_INPUT_TTY] = "tty",
7142 [EXEC_INPUT_TTY_FORCE] = "tty-force",
7143 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7144 [EXEC_INPUT_SOCKET] = "socket",
7145 [EXEC_INPUT_NAMED_FD] = "fd",
7146 [EXEC_INPUT_DATA] = "data",
7147 [EXEC_INPUT_FILE] = "file",
7148 };
7149
7150 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7151
7152 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7153 [EXEC_OUTPUT_INHERIT] = "inherit",
7154 [EXEC_OUTPUT_NULL] = "null",
7155 [EXEC_OUTPUT_TTY] = "tty",
7156 [EXEC_OUTPUT_KMSG] = "kmsg",
7157 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7158 [EXEC_OUTPUT_JOURNAL] = "journal",
7159 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7160 [EXEC_OUTPUT_SOCKET] = "socket",
7161 [EXEC_OUTPUT_NAMED_FD] = "fd",
7162 [EXEC_OUTPUT_FILE] = "file",
7163 [EXEC_OUTPUT_FILE_APPEND] = "append",
7164 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7165 };
7166
7167 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7168
7169 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7170 [EXEC_UTMP_INIT] = "init",
7171 [EXEC_UTMP_LOGIN] = "login",
7172 [EXEC_UTMP_USER] = "user",
7173 };
7174
7175 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7176
7177 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7178 [EXEC_PRESERVE_NO] = "no",
7179 [EXEC_PRESERVE_YES] = "yes",
7180 [EXEC_PRESERVE_RESTART] = "restart",
7181 };
7182
7183 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7184
7185 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7186 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7187 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7188 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7189 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7190 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7191 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7192 };
7193
7194 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7195
7196 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7197 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7198 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7199 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7200 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7201 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7202 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7203 };
7204
7205 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7206
7207 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7208 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7209 * directories, specifically .timer units with their timestamp touch file. */
7210 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7211 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7212 [EXEC_DIRECTORY_STATE] = "state",
7213 [EXEC_DIRECTORY_CACHE] = "cache",
7214 [EXEC_DIRECTORY_LOGS] = "logs",
7215 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7216 };
7217
7218 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7219
7220 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7221 * the service payload in. */
7222 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7223 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7224 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7225 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7226 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7227 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7228 };
7229
7230 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7231
7232 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7233 [EXEC_KEYRING_INHERIT] = "inherit",
7234 [EXEC_KEYRING_PRIVATE] = "private",
7235 [EXEC_KEYRING_SHARED] = "shared",
7236 };
7237
7238 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);