]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: add OpenFile setting
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #if HAVE_PAM
19 #include <security/pam_appl.h>
20 #endif
21
22 #if HAVE_SELINUX
23 #include <selinux/selinux.h>
24 #endif
25
26 #if HAVE_SECCOMP
27 #include <seccomp.h>
28 #endif
29
30 #if HAVE_APPARMOR
31 #include <sys/apparmor.h>
32 #endif
33
34 #include "sd-messages.h"
35
36 #include "acl-util.h"
37 #include "af-list.h"
38 #include "alloc-util.h"
39 #if HAVE_APPARMOR
40 #include "apparmor-util.h"
41 #endif
42 #include "argv-util.h"
43 #include "async.h"
44 #include "barrier.h"
45 #include "bpf-lsm.h"
46 #include "cap-list.h"
47 #include "capability-util.h"
48 #include "cgroup-setup.h"
49 #include "chase-symlinks.h"
50 #include "chown-recursive.h"
51 #include "constants.h"
52 #include "cpu-set-util.h"
53 #include "creds-util.h"
54 #include "data-fd-util.h"
55 #include "env-file.h"
56 #include "env-util.h"
57 #include "errno-list.h"
58 #include "escape.h"
59 #include "execute.h"
60 #include "exit-status.h"
61 #include "fd-util.h"
62 #include "fileio.h"
63 #include "format-util.h"
64 #include "glob-util.h"
65 #include "hexdecoct.h"
66 #include "io-util.h"
67 #include "ioprio-util.h"
68 #include "label.h"
69 #include "log.h"
70 #include "macro.h"
71 #include "manager.h"
72 #include "manager-dump.h"
73 #include "memory-util.h"
74 #include "missing_fs.h"
75 #include "missing_ioprio.h"
76 #include "mkdir-label.h"
77 #include "mount-util.h"
78 #include "mountpoint-util.h"
79 #include "namespace.h"
80 #include "parse-util.h"
81 #include "path-util.h"
82 #include "process-util.h"
83 #include "random-util.h"
84 #include "recurse-dir.h"
85 #include "rlimit-util.h"
86 #include "rm-rf.h"
87 #if HAVE_SECCOMP
88 #include "seccomp-util.h"
89 #endif
90 #include "securebits-util.h"
91 #include "selinux-util.h"
92 #include "signal-util.h"
93 #include "smack-util.h"
94 #include "socket-util.h"
95 #include "sort-util.h"
96 #include "special.h"
97 #include "stat-util.h"
98 #include "string-table.h"
99 #include "string-util.h"
100 #include "strv.h"
101 #include "syslog-util.h"
102 #include "terminal-util.h"
103 #include "tmpfile-util.h"
104 #include "umask-util.h"
105 #include "unit-serialize.h"
106 #include "user-util.h"
107 #include "utmp-wtmp.h"
108
109 #define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
110 #define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
111
112 #define SNDBUF_SIZE (8*1024*1024)
113
/* Relocates the passed file descriptors so that fds[i] ends up as fd number i+3.
 *
 * The array is modified in place: every entry that is moved is replaced by its new fd number and the
 * old fd is closed. Returns 0 on success, or a negative errno if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        const int want = idx + 3;
                        int dup_fd;

                        /* Nothing to do if this fd already sits in its slot */
                        if (fds[idx] == want)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= want, which may be larger than want */
                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied: remember the first such index, so
                         * that another pass starts from there */
                        if (retry_at < 0 && dup_fd != want)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
153
/* Adjusts the file descriptor flags of the fds that are about to be passed on.
 *
 * The first n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock'; all n_fds
 * entries get FD_CLOEXEC cleared. Returns 0 on success or the first negative error reported by
 * fd_nonblock()/fd_cloexec(). */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        assert(fds || n_fds == 0);

        for (size_t k = 0; k < n_fds; k++) {
                int r;

                /* O_NONBLOCK is only touched for the leading (socket activation) fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always turned off, as these fds are meant to reach the child */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
189
190 static const char *exec_context_tty_path(const ExecContext *context) {
191 assert(context);
192
193 if (context->stdio_as_fds)
194 return NULL;
195
196 if (context->tty_path)
197 return context->tty_path;
198
199 return "/dev/console";
200 }
201
202 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
203 const char *path;
204
205 assert(context);
206
207 path = exec_context_tty_path(context);
208
209 if (context->tty_vhangup) {
210 if (p && p->stdin_fd >= 0)
211 (void) terminal_vhangup_fd(p->stdin_fd);
212 else if (path)
213 (void) terminal_vhangup(path);
214 }
215
216 if (context->tty_reset) {
217 if (p && p->stdin_fd >= 0)
218 (void) reset_terminal_fd(p->stdin_fd, true);
219 else if (path)
220 (void) reset_terminal(path);
221 }
222
223 if (p && p->stdin_fd >= 0)
224 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
225
226 if (context->tty_vt_disallocate && path)
227 (void) vt_disallocate(path);
228 }
229
230 static bool is_terminal_input(ExecInput i) {
231 return IN_SET(i,
232 EXEC_INPUT_TTY,
233 EXEC_INPUT_TTY_FORCE,
234 EXEC_INPUT_TTY_FAIL);
235 }
236
237 static bool is_terminal_output(ExecOutput o) {
238 return IN_SET(o,
239 EXEC_OUTPUT_TTY,
240 EXEC_OUTPUT_KMSG_AND_CONSOLE,
241 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
242 }
243
244 static bool is_kmsg_output(ExecOutput o) {
245 return IN_SET(o,
246 EXEC_OUTPUT_KMSG,
247 EXEC_OUTPUT_KMSG_AND_CONSOLE);
248 }
249
250 static bool exec_context_needs_term(const ExecContext *c) {
251 assert(c);
252
253 /* Return true if the execution context suggests we should set $TERM to something useful. */
254
255 if (is_terminal_input(c->std_input))
256 return true;
257
258 if (is_terminal_output(c->std_output))
259 return true;
260
261 if (is_terminal_output(c->std_error))
262 return true;
263
264 return !!c->tty_path;
265 }
266
267 static int open_null_as(int flags, int nfd) {
268 int fd;
269
270 assert(nfd >= 0);
271
272 fd = open("/dev/null", flags|O_NOCTTY);
273 if (fd < 0)
274 return -errno;
275
276 return move_fd(fd, nfd, false);
277 }
278
279 static int connect_journal_socket(
280 int fd,
281 const char *log_namespace,
282 uid_t uid,
283 gid_t gid) {
284
285 uid_t olduid = UID_INVALID;
286 gid_t oldgid = GID_INVALID;
287 const char *j;
288 int r;
289
290 j = log_namespace ?
291 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
292 "/run/systemd/journal/stdout";
293
294 if (gid_is_valid(gid)) {
295 oldgid = getgid();
296
297 if (setegid(gid) < 0)
298 return -errno;
299 }
300
301 if (uid_is_valid(uid)) {
302 olduid = getuid();
303
304 if (seteuid(uid) < 0) {
305 r = -errno;
306 goto restore_gid;
307 }
308 }
309
310 r = connect_unix_path(fd, AT_FDCWD, j);
311
312 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
313 an LSM interferes. */
314
315 if (uid_is_valid(uid))
316 (void) seteuid(olduid);
317
318 restore_gid:
319 if (gid_is_valid(gid))
320 (void) setegid(oldgid);
321
322 return r;
323 }
324
325 static int connect_logger_as(
326 const Unit *unit,
327 const ExecContext *context,
328 const ExecParameters *params,
329 ExecOutput output,
330 const char *ident,
331 int nfd,
332 uid_t uid,
333 gid_t gid) {
334
335 _cleanup_close_ int fd = -EBADF;
336 int r;
337
338 assert(context);
339 assert(params);
340 assert(output < _EXEC_OUTPUT_MAX);
341 assert(ident);
342 assert(nfd >= 0);
343
344 fd = socket(AF_UNIX, SOCK_STREAM, 0);
345 if (fd < 0)
346 return -errno;
347
348 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
349 if (r < 0)
350 return r;
351
352 if (shutdown(fd, SHUT_RD) < 0)
353 return -errno;
354
355 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
356
357 if (dprintf(fd,
358 "%s\n"
359 "%s\n"
360 "%i\n"
361 "%i\n"
362 "%i\n"
363 "%i\n"
364 "%i\n",
365 context->syslog_identifier ?: ident,
366 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
367 context->syslog_priority,
368 !!context->syslog_level_prefix,
369 false,
370 is_kmsg_output(output),
371 is_terminal_output(output)) < 0)
372 return -errno;
373
374 return move_fd(TAKE_FD(fd), nfd, false);
375 }
376
377 static int open_terminal_as(const char *path, int flags, int nfd) {
378 int fd;
379
380 assert(path);
381 assert(nfd >= 0);
382
383 fd = open_terminal(path, flags | O_NOCTTY);
384 if (fd < 0)
385 return fd;
386
387 return move_fd(fd, nfd, false);
388 }
389
390 static int acquire_path(const char *path, int flags, mode_t mode) {
391 _cleanup_close_ int fd = -EBADF;
392 int r;
393
394 assert(path);
395
396 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
397 flags |= O_CREAT;
398
399 fd = open(path, flags|O_NOCTTY, mode);
400 if (fd >= 0)
401 return TAKE_FD(fd);
402
403 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
404 return -errno;
405
406 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
407
408 fd = socket(AF_UNIX, SOCK_STREAM, 0);
409 if (fd < 0)
410 return -errno;
411
412 r = connect_unix_path(fd, AT_FDCWD, path);
413 if (IN_SET(r, -ENOTSOCK, -EINVAL))
414 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
415 * wasn't an AF_UNIX socket after all */
416 return -ENXIO;
417 if (r < 0)
418 return r;
419
420 if ((flags & O_ACCMODE) == O_RDONLY)
421 r = shutdown(fd, SHUT_WR);
422 else if ((flags & O_ACCMODE) == O_WRONLY)
423 r = shutdown(fd, SHUT_RD);
424 else
425 r = 0;
426 if (r < 0)
427 return -errno;
428
429 return TAKE_FD(fd);
430 }
431
432 static int fixup_input(
433 const ExecContext *context,
434 int socket_fd,
435 bool apply_tty_stdin) {
436
437 ExecInput std_input;
438
439 assert(context);
440
441 std_input = context->std_input;
442
443 if (is_terminal_input(std_input) && !apply_tty_stdin)
444 return EXEC_INPUT_NULL;
445
446 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
447 return EXEC_INPUT_NULL;
448
449 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
450 return EXEC_INPUT_NULL;
451
452 return std_input;
453 }
454
455 static int fixup_output(ExecOutput output, int socket_fd) {
456
457 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
458 return EXEC_OUTPUT_INHERIT;
459
460 return output;
461 }
462
463 static int setup_input(
464 const ExecContext *context,
465 const ExecParameters *params,
466 int socket_fd,
467 const int named_iofds[static 3]) {
468
469 ExecInput i;
470 int r;
471
472 assert(context);
473 assert(params);
474 assert(named_iofds);
475
476 if (params->stdin_fd >= 0) {
477 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
478 return -errno;
479
480 /* Try to make this the controlling tty, if it is a tty, and reset it */
481 if (isatty(STDIN_FILENO)) {
482 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
483 (void) reset_terminal_fd(STDIN_FILENO, true);
484 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
485 }
486
487 return STDIN_FILENO;
488 }
489
490 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
491
492 switch (i) {
493
494 case EXEC_INPUT_NULL:
495 return open_null_as(O_RDONLY, STDIN_FILENO);
496
497 case EXEC_INPUT_TTY:
498 case EXEC_INPUT_TTY_FORCE:
499 case EXEC_INPUT_TTY_FAIL: {
500 int fd;
501
502 fd = acquire_terminal(exec_context_tty_path(context),
503 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
504 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
505 ACQUIRE_TERMINAL_WAIT,
506 USEC_INFINITY);
507 if (fd < 0)
508 return fd;
509
510 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
511 if (r < 0)
512 return r;
513
514 return move_fd(fd, STDIN_FILENO, false);
515 }
516
517 case EXEC_INPUT_SOCKET:
518 assert(socket_fd >= 0);
519
520 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
521
522 case EXEC_INPUT_NAMED_FD:
523 assert(named_iofds[STDIN_FILENO] >= 0);
524
525 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
526 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
527
528 case EXEC_INPUT_DATA: {
529 int fd;
530
531 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
532 if (fd < 0)
533 return fd;
534
535 return move_fd(fd, STDIN_FILENO, false);
536 }
537
538 case EXEC_INPUT_FILE: {
539 bool rw;
540 int fd;
541
542 assert(context->stdio_file[STDIN_FILENO]);
543
544 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
545 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
546
547 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
548 if (fd < 0)
549 return fd;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
554 default:
555 assert_not_reached();
556 }
557 }
558
559 static bool can_inherit_stderr_from_stdout(
560 const ExecContext *context,
561 ExecOutput o,
562 ExecOutput e) {
563
564 assert(context);
565
566 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
567 * stderr fd */
568
569 if (e == EXEC_OUTPUT_INHERIT)
570 return true;
571 if (e != o)
572 return false;
573
574 if (e == EXEC_OUTPUT_NAMED_FD)
575 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
576
577 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
578 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
579
580 return true;
581 }
582
583 static int setup_output(
584 const Unit *unit,
585 const ExecContext *context,
586 const ExecParameters *params,
587 int fileno,
588 int socket_fd,
589 const int named_iofds[static 3],
590 const char *ident,
591 uid_t uid,
592 gid_t gid,
593 dev_t *journal_stream_dev,
594 ino_t *journal_stream_ino) {
595
596 ExecOutput o;
597 ExecInput i;
598 int r;
599
600 assert(unit);
601 assert(context);
602 assert(params);
603 assert(ident);
604 assert(journal_stream_dev);
605 assert(journal_stream_ino);
606
607 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
608
609 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
610 return -errno;
611
612 return STDOUT_FILENO;
613 }
614
615 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
616 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
617 return -errno;
618
619 return STDERR_FILENO;
620 }
621
622 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
623 o = fixup_output(context->std_output, socket_fd);
624
625 if (fileno == STDERR_FILENO) {
626 ExecOutput e;
627 e = fixup_output(context->std_error, socket_fd);
628
629 /* This expects the input and output are already set up */
630
631 /* Don't change the stderr file descriptor if we inherit all
632 * the way and are not on a tty */
633 if (e == EXEC_OUTPUT_INHERIT &&
634 o == EXEC_OUTPUT_INHERIT &&
635 i == EXEC_INPUT_NULL &&
636 !is_terminal_input(context->std_input) &&
637 getppid() != 1)
638 return fileno;
639
640 /* Duplicate from stdout if possible */
641 if (can_inherit_stderr_from_stdout(context, o, e))
642 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
643
644 o = e;
645
646 } else if (o == EXEC_OUTPUT_INHERIT) {
647 /* If input got downgraded, inherit the original value */
648 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
649 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
650
651 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
652 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
653 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
654
655 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
656 if (getppid() != 1)
657 return fileno;
658
659 /* We need to open /dev/null here anew, to get the right access mode. */
660 return open_null_as(O_WRONLY, fileno);
661 }
662
663 switch (o) {
664
665 case EXEC_OUTPUT_NULL:
666 return open_null_as(O_WRONLY, fileno);
667
668 case EXEC_OUTPUT_TTY:
669 if (is_terminal_input(i))
670 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
671
672 /* We don't reset the terminal if this is just about output */
673 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
674
675 case EXEC_OUTPUT_KMSG:
676 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
677 case EXEC_OUTPUT_JOURNAL:
678 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
679 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
680 if (r < 0) {
681 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
682 fileno == STDOUT_FILENO ? "stdout" : "stderr");
683 r = open_null_as(O_WRONLY, fileno);
684 } else {
685 struct stat st;
686
687 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
688 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
689 * services to detect whether they are connected to the journal or not.
690 *
691 * If both stdout and stderr are connected to a stream then let's make sure to store the data
692 * about STDERR as that's usually the best way to do logging. */
693
694 if (fstat(fileno, &st) >= 0 &&
695 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
696 *journal_stream_dev = st.st_dev;
697 *journal_stream_ino = st.st_ino;
698 }
699 }
700 return r;
701
702 case EXEC_OUTPUT_SOCKET:
703 assert(socket_fd >= 0);
704
705 return RET_NERRNO(dup2(socket_fd, fileno));
706
707 case EXEC_OUTPUT_NAMED_FD:
708 assert(named_iofds[fileno] >= 0);
709
710 (void) fd_nonblock(named_iofds[fileno], false);
711 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
712
713 case EXEC_OUTPUT_FILE:
714 case EXEC_OUTPUT_FILE_APPEND:
715 case EXEC_OUTPUT_FILE_TRUNCATE: {
716 bool rw;
717 int fd, flags;
718
719 assert(context->stdio_file[fileno]);
720
721 rw = context->std_input == EXEC_INPUT_FILE &&
722 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
723
724 if (rw)
725 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
726
727 flags = O_WRONLY;
728 if (o == EXEC_OUTPUT_FILE_APPEND)
729 flags |= O_APPEND;
730 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
731 flags |= O_TRUNC;
732
733 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
734 if (fd < 0)
735 return fd;
736
737 return move_fd(fd, fileno, 0);
738 }
739
740 default:
741 assert_not_reached();
742 }
743 }
744
745 static int chown_terminal(int fd, uid_t uid) {
746 int r;
747
748 assert(fd >= 0);
749
750 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
751 if (isatty(fd) < 1) {
752 if (IN_SET(errno, EINVAL, ENOTTY))
753 return 0; /* not a tty */
754
755 return -errno;
756 }
757
758 /* This might fail. What matters are the results. */
759 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
760 if (r < 0)
761 return r;
762
763 return 1;
764 }
765
766 static int setup_confirm_stdio(
767 const ExecContext *context,
768 const char *vc,
769 int *ret_saved_stdin,
770 int *ret_saved_stdout) {
771
772 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
773 int r;
774
775 assert(ret_saved_stdin);
776 assert(ret_saved_stdout);
777
778 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
779 if (saved_stdin < 0)
780 return -errno;
781
782 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
783 if (saved_stdout < 0)
784 return -errno;
785
786 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
787 if (fd < 0)
788 return fd;
789
790 r = chown_terminal(fd, getuid());
791 if (r < 0)
792 return r;
793
794 r = reset_terminal_fd(fd, true);
795 if (r < 0)
796 return r;
797
798 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
799 if (r < 0)
800 return r;
801
802 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
803 TAKE_FD(fd);
804 if (r < 0)
805 return r;
806
807 *ret_saved_stdin = TAKE_FD(saved_stdin);
808 *ret_saved_stdout = TAKE_FD(saved_stdout);
809 return 0;
810 }
811
812 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
813 assert(err < 0);
814
815 if (err == -ETIMEDOUT)
816 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
817 else {
818 errno = -err;
819 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
820 }
821 }
822
823 static void write_confirm_error(int err, const char *vc, const Unit *u) {
824 _cleanup_close_ int fd = -EBADF;
825
826 assert(vc);
827
828 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
829 if (fd < 0)
830 return;
831
832 write_confirm_error_fd(err, fd, u);
833 }
834
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place
 * and closes the saved copies (resetting *saved_stdin/*saved_stdout to the value returned by
 * safe_close()). Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
856
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
862
863 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
864 int saved_stdout = -1, saved_stdin = -1, r;
865 _cleanup_free_ char *e = NULL;
866 char c;
867
868 /* For any internal errors, assume a positive response. */
869 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
870 if (r < 0) {
871 write_confirm_error(r, vc, u);
872 return CONFIRM_EXECUTE;
873 }
874
875 /* confirm_spawn might have been disabled while we were sleeping. */
876 if (manager_is_confirm_spawn_disabled(u->manager)) {
877 r = 1;
878 goto restore_stdio;
879 }
880
881 e = ellipsize(cmdline, 60, 100);
882 if (!e) {
883 log_oom();
884 r = CONFIRM_EXECUTE;
885 goto restore_stdio;
886 }
887
888 for (;;) {
889 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
890 if (r < 0) {
891 write_confirm_error_fd(r, STDOUT_FILENO, u);
892 r = CONFIRM_EXECUTE;
893 goto restore_stdio;
894 }
895
896 switch (c) {
897 case 'c':
898 printf("Resuming normal execution.\n");
899 manager_disable_confirm_spawn();
900 r = 1;
901 break;
902 case 'D':
903 unit_dump(u, stdout, " ");
904 continue; /* ask again */
905 case 'f':
906 printf("Failing execution.\n");
907 r = CONFIRM_PRETEND_FAILURE;
908 break;
909 case 'h':
910 printf(" c - continue, proceed without asking anymore\n"
911 " D - dump, show the state of the unit\n"
912 " f - fail, don't execute the command and pretend it failed\n"
913 " h - help\n"
914 " i - info, show a short summary of the unit\n"
915 " j - jobs, show jobs that are in progress\n"
916 " s - skip, don't execute the command and pretend it succeeded\n"
917 " y - yes, execute the command\n");
918 continue; /* ask again */
919 case 'i':
920 printf(" Description: %s\n"
921 " Unit: %s\n"
922 " Command: %s\n",
923 u->id, u->description, cmdline);
924 continue; /* ask again */
925 case 'j':
926 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
927 continue; /* ask again */
928 case 'n':
929 /* 'n' was removed in favor of 'f'. */
930 printf("Didn't understand 'n', did you mean 'f'?\n");
931 continue; /* ask again */
932 case 's':
933 printf("Skipping execution.\n");
934 r = CONFIRM_PRETEND_SUCCESS;
935 break;
936 case 'y':
937 r = CONFIRM_EXECUTE;
938 break;
939 default:
940 assert_not_reached();
941 }
942 break;
943 }
944
945 restore_stdio:
946 restore_confirm_stdio(&saved_stdin, &saved_stdout);
947 return r;
948 }
949
950 static int get_fixed_user(const ExecContext *c, const char **user,
951 uid_t *uid, gid_t *gid,
952 const char **home, const char **shell) {
953 int r;
954 const char *name;
955
956 assert(c);
957
958 if (!c->user)
959 return 0;
960
961 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
962 * (i.e. are "/" or "/bin/nologin"). */
963
964 name = c->user;
965 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
966 if (r < 0)
967 return r;
968
969 *user = name;
970 return 0;
971 }
972
973 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
974 int r;
975 const char *name;
976
977 assert(c);
978
979 if (!c->group)
980 return 0;
981
982 name = c->group;
983 r = get_group_creds(&name, gid, 0);
984 if (r < 0)
985 return r;
986
987 *group = name;
988 return 0;
989 }
990
991 static int get_supplementary_groups(const ExecContext *c, const char *user,
992 const char *group, gid_t gid,
993 gid_t **supplementary_gids, int *ngids) {
994 int r, k = 0;
995 int ngroups_max;
996 bool keep_groups = false;
997 gid_t *groups = NULL;
998 _cleanup_free_ gid_t *l_gids = NULL;
999
1000 assert(c);
1001
1002 /*
1003 * If user is given, then lookup GID and supplementary groups list.
1004 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1005 * here and as early as possible so we keep the list of supplementary
1006 * groups of the caller.
1007 */
1008 if (user && gid_is_valid(gid) && gid != 0) {
1009 /* First step, initialize groups from /etc/groups */
1010 if (initgroups(user, gid) < 0)
1011 return -errno;
1012
1013 keep_groups = true;
1014 }
1015
1016 if (strv_isempty(c->supplementary_groups))
1017 return 0;
1018
1019 /*
1020 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1021 * be positive, otherwise fail.
1022 */
1023 errno = 0;
1024 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1025 if (ngroups_max <= 0)
1026 return errno_or_else(EOPNOTSUPP);
1027
1028 l_gids = new(gid_t, ngroups_max);
1029 if (!l_gids)
1030 return -ENOMEM;
1031
1032 if (keep_groups) {
1033 /*
1034 * Lookup the list of groups that the user belongs to, we
1035 * avoid NSS lookups here too for gid=0.
1036 */
1037 k = ngroups_max;
1038 if (getgrouplist(user, gid, l_gids, &k) < 0)
1039 return -EINVAL;
1040 } else
1041 k = 0;
1042
1043 STRV_FOREACH(i, c->supplementary_groups) {
1044 const char *g;
1045
1046 if (k >= ngroups_max)
1047 return -E2BIG;
1048
1049 g = *i;
1050 r = get_group_creds(&g, l_gids+k, 0);
1051 if (r < 0)
1052 return r;
1053
1054 k++;
1055 }
1056
1057 /*
1058 * Sets ngids to zero to drop all supplementary groups, happens
1059 * when we are under root and SupplementaryGroups= is empty.
1060 */
1061 if (k == 0) {
1062 *ngids = 0;
1063 return 0;
1064 }
1065
1066 /* Otherwise get the final list of supplementary groups */
1067 groups = memdup(l_gids, sizeof(gid_t) * k);
1068 if (!groups)
1069 return -ENOMEM;
1070
1071 *supplementary_gids = groups;
1072 *ngids = k;
1073
1074 groups = NULL;
1075
1076 return 0;
1077 }
1078
/* Applies the supplementary groups (if ngids > 0) and then the real, effective and saved GID
 * (if 'gid' is valid). Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if there are any to set */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1097
/* Updates the securebits of the calling process: the bits in 'mask' are cleared and the bits in
 * 'bits' are set. Returns 1 if the securebits were changed, 0 if they already had the requested
 * value, or a negative errno if prctl() fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1111
1112 static int enforce_user(const ExecContext *context, uid_t uid) {
1113 assert(context);
1114 int r;
1115
1116 if (!uid_is_valid(uid))
1117 return 0;
1118
1119 /* Sets (but doesn't look up) the uid and make sure we keep the
1120 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1121 * required, so we also need keep-caps in this case.
1122 */
1123
1124 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
1125
1126 /* First step: If we need to keep capabilities but
1127 * drop privileges we need to make sure we keep our
1128 * caps, while we drop privileges. */
1129 if (uid != 0) {
1130 /* Add KEEP_CAPS to the securebits */
1131 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1132 if (r < 0)
1133 return r;
1134 }
1135 }
1136
1137 /* Second step: actually set the uids */
1138 if (setresuid(uid, uid, uid) < 0)
1139 return -errno;
1140
1141 /* At this point we should have all necessary capabilities but
1142 are otherwise a normal user. However, the caps might got
1143 corrupted due to the setresuid() so we need clean them up
1144 later. This is done outside of this call. */
1145
1146 return 0;
1147 }
1148
1149 #if HAVE_PAM
1150
1151 static int null_conv(
1152 int num_msg,
1153 const struct pam_message **msg,
1154 struct pam_response **resp,
1155 void *appdata_ptr) {
1156
1157 /* We don't support conversations */
1158
1159 return PAM_CONV_ERR;
1160 }
1161
1162 #endif
1163
1164 static int setup_pam(
1165 const char *name,
1166 const char *user,
1167 uid_t uid,
1168 gid_t gid,
1169 const char *tty,
1170 char ***env, /* updated on success */
1171 const int fds[], size_t n_fds) {
1172
1173 #if HAVE_PAM
1174
1175 static const struct pam_conv conv = {
1176 .conv = null_conv,
1177 .appdata_ptr = NULL
1178 };
1179
1180 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
1181 _cleanup_strv_free_ char **e = NULL;
1182 pam_handle_t *handle = NULL;
1183 sigset_t old_ss;
1184 int pam_code = PAM_SUCCESS, r;
1185 bool close_session = false;
1186 pid_t pam_pid = 0, parent_pid;
1187 int flags = 0;
1188
1189 assert(name);
1190 assert(user);
1191 assert(env);
1192
1193 /* We set up PAM in the parent process, then fork. The child
1194 * will then stay around until killed via PR_GET_PDEATHSIG or
1195 * systemd via the cgroup logic. It will then remove the PAM
1196 * session again. The parent process will exec() the actual
1197 * daemon. We do things this way to ensure that the main PID
1198 * of the daemon is the one we initially fork()ed. */
1199
1200 r = barrier_create(&barrier);
1201 if (r < 0)
1202 goto fail;
1203
1204 if (log_get_max_level() < LOG_DEBUG)
1205 flags |= PAM_SILENT;
1206
1207 pam_code = pam_start(name, user, &conv, &handle);
1208 if (pam_code != PAM_SUCCESS) {
1209 handle = NULL;
1210 goto fail;
1211 }
1212
1213 if (!tty) {
1214 _cleanup_free_ char *q = NULL;
1215
1216 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1217 * out if that's the case, and read the TTY off it. */
1218
1219 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1220 tty = strjoina("/dev/", q);
1221 }
1222
1223 if (tty) {
1224 pam_code = pam_set_item(handle, PAM_TTY, tty);
1225 if (pam_code != PAM_SUCCESS)
1226 goto fail;
1227 }
1228
1229 STRV_FOREACH(nv, *env) {
1230 pam_code = pam_putenv(handle, *nv);
1231 if (pam_code != PAM_SUCCESS)
1232 goto fail;
1233 }
1234
1235 pam_code = pam_acct_mgmt(handle, flags);
1236 if (pam_code != PAM_SUCCESS)
1237 goto fail;
1238
1239 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1240 if (pam_code != PAM_SUCCESS)
1241 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
1242
1243 pam_code = pam_open_session(handle, flags);
1244 if (pam_code != PAM_SUCCESS)
1245 goto fail;
1246
1247 close_session = true;
1248
1249 e = pam_getenvlist(handle);
1250 if (!e) {
1251 pam_code = PAM_BUF_ERR;
1252 goto fail;
1253 }
1254
1255 /* Block SIGTERM, so that we know that it won't get lost in the child */
1256
1257 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
1258
1259 parent_pid = getpid_cached();
1260
1261 r = safe_fork("(sd-pam)", 0, &pam_pid);
1262 if (r < 0)
1263 goto fail;
1264 if (r == 0) {
1265 int sig, ret = EXIT_PAM;
1266
1267 /* The child's job is to reset the PAM session on termination */
1268 barrier_set_role(&barrier, BARRIER_CHILD);
1269
1270 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1271 * those fds are open here that have been opened by PAM. */
1272 (void) close_many(fds, n_fds);
1273
1274 /* Drop privileges - we don't need any to pam_close_session and this will make
1275 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1276 * threads to fail to exit normally */
1277
1278 r = maybe_setgroups(0, NULL);
1279 if (r < 0)
1280 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
1281 if (setresgid(gid, gid, gid) < 0)
1282 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
1283 if (setresuid(uid, uid, uid) < 0)
1284 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
1285
1286 (void) ignore_signals(SIGPIPE);
1287
1288 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1289 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1290 * this way. We rely on the control groups kill logic to do the rest for us. */
1291 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1292 goto child_finish;
1293
1294 /* Tell the parent that our setup is done. This is especially important regarding dropping
1295 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
1296 *
1297 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
1298 (void) barrier_place(&barrier);
1299
1300 /* Check if our parent process might already have died? */
1301 if (getppid() == parent_pid) {
1302 sigset_t ss;
1303
1304 assert_se(sigemptyset(&ss) >= 0);
1305 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1306
1307 for (;;) {
1308 if (sigwait(&ss, &sig) < 0) {
1309 if (errno == EINTR)
1310 continue;
1311
1312 goto child_finish;
1313 }
1314
1315 assert(sig == SIGTERM);
1316 break;
1317 }
1318 }
1319
1320 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1321 if (pam_code != PAM_SUCCESS)
1322 goto child_finish;
1323
1324 /* If our parent died we'll end the session */
1325 if (getppid() != parent_pid) {
1326 pam_code = pam_close_session(handle, flags);
1327 if (pam_code != PAM_SUCCESS)
1328 goto child_finish;
1329 }
1330
1331 ret = 0;
1332
1333 child_finish:
1334 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1335 * know about this. See pam_end(3) */
1336 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
1337 _exit(ret);
1338 }
1339
1340 barrier_set_role(&barrier, BARRIER_PARENT);
1341
1342 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1343 * here. */
1344 handle = NULL;
1345
1346 /* Unblock SIGTERM again in the parent */
1347 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
1348
1349 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1350 * this fd around. */
1351 closelog();
1352
1353 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1354 * recover. However, warn loudly if it happens. */
1355 if (!barrier_place_and_sync(&barrier))
1356 log_error("PAM initialization failed");
1357
1358 return strv_free_and_replace(*env, e);
1359
1360 fail:
1361 if (pam_code != PAM_SUCCESS) {
1362 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
1363 r = -EPERM; /* PAM errors do not map to errno */
1364 } else
1365 log_error_errno(r, "PAM failed: %m");
1366
1367 if (handle) {
1368 if (close_session)
1369 pam_code = pam_close_session(handle, flags);
1370
1371 (void) pam_end(handle, pam_code | flags);
1372 }
1373
1374 closelog();
1375 return r;
1376 #else
1377 return 0;
1378 #endif
1379 }
1380
1381 static void rename_process_from_path(const char *path) {
1382 _cleanup_free_ char *buf = NULL;
1383 const char *p;
1384
1385 assert(path);
1386
1387 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1388 * /bin/ps */
1389
1390 if (path_extract_filename(path, &buf) < 0) {
1391 rename_process("(...)");
1392 return;
1393 }
1394
1395 size_t l = strlen(buf);
1396 if (l > 8) {
1397 /* The end of the process name is usually more interesting, since the first bit might just be
1398 * "systemd-" */
1399 p = buf + l - 8;
1400 l = 8;
1401 } else
1402 p = buf;
1403
1404 char process_name[11];
1405 process_name[0] = '(';
1406 memcpy(process_name+1, p, l);
1407 process_name[1+l] = ')';
1408 process_name[1+l+1] = 0;
1409
1410 rename_process(process_name);
1411 }
1412
1413 static bool context_has_address_families(const ExecContext *c) {
1414 assert(c);
1415
1416 return c->address_families_allow_list ||
1417 !set_isempty(c->address_families);
1418 }
1419
1420 static bool context_has_syscall_filters(const ExecContext *c) {
1421 assert(c);
1422
1423 return c->syscall_allow_list ||
1424 !hashmap_isempty(c->syscall_filter);
1425 }
1426
1427 static bool context_has_syscall_logs(const ExecContext *c) {
1428 assert(c);
1429
1430 return c->syscall_log_allow_list ||
1431 !hashmap_isempty(c->syscall_log);
1432 }
1433
1434 static bool context_has_no_new_privileges(const ExecContext *c) {
1435 assert(c);
1436
1437 if (c->no_new_privileges)
1438 return true;
1439
1440 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1441 return false;
1442
1443 /* We need NNP if we have any form of seccomp and are unprivileged */
1444 return c->lock_personality ||
1445 c->memory_deny_write_execute ||
1446 c->private_devices ||
1447 c->protect_clock ||
1448 c->protect_hostname ||
1449 c->protect_kernel_tunables ||
1450 c->protect_kernel_modules ||
1451 c->protect_kernel_logs ||
1452 context_has_address_families(c) ||
1453 exec_context_restrict_namespaces_set(c) ||
1454 c->restrict_realtime ||
1455 c->restrict_suid_sgid ||
1456 !set_isempty(c->syscall_archs) ||
1457 context_has_syscall_filters(c) ||
1458 context_has_syscall_logs(c);
1459 }
1460
1461 static bool exec_context_has_credentials(const ExecContext *context) {
1462
1463 assert(context);
1464
1465 return !hashmap_isempty(context->set_credentials) ||
1466 !hashmap_isempty(context->load_credentials);
1467 }
1468
1469 #if HAVE_SECCOMP
1470
1471 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1472
1473 if (is_seccomp_available())
1474 return false;
1475
1476 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1477 return true;
1478 }
1479
1480 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1481 uint32_t negative_action, default_action, action;
1482 int r;
1483
1484 assert(u);
1485 assert(c);
1486
1487 if (!context_has_syscall_filters(c))
1488 return 0;
1489
1490 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1491 return 0;
1492
1493 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1494
1495 if (c->syscall_allow_list) {
1496 default_action = negative_action;
1497 action = SCMP_ACT_ALLOW;
1498 } else {
1499 default_action = SCMP_ACT_ALLOW;
1500 action = negative_action;
1501 }
1502
1503 if (needs_ambient_hack) {
1504 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1505 if (r < 0)
1506 return r;
1507 }
1508
1509 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1510 }
1511
1512 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1513 #ifdef SCMP_ACT_LOG
1514 uint32_t default_action, action;
1515 #endif
1516
1517 assert(u);
1518 assert(c);
1519
1520 if (!context_has_syscall_logs(c))
1521 return 0;
1522
1523 #ifdef SCMP_ACT_LOG
1524 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1525 return 0;
1526
1527 if (c->syscall_log_allow_list) {
1528 /* Log nothing but the ones listed */
1529 default_action = SCMP_ACT_ALLOW;
1530 action = SCMP_ACT_LOG;
1531 } else {
1532 /* Log everything but the ones listed */
1533 default_action = SCMP_ACT_LOG;
1534 action = SCMP_ACT_ALLOW;
1535 }
1536
1537 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1538 #else
1539 /* old libseccomp */
1540 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1541 return 0;
1542 #endif
1543 }
1544
1545 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1546 assert(u);
1547 assert(c);
1548
1549 if (set_isempty(c->syscall_archs))
1550 return 0;
1551
1552 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1553 return 0;
1554
1555 return seccomp_restrict_archs(c->syscall_archs);
1556 }
1557
1558 static int apply_address_families(const Unit* u, const ExecContext *c) {
1559 assert(u);
1560 assert(c);
1561
1562 if (!context_has_address_families(c))
1563 return 0;
1564
1565 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1566 return 0;
1567
1568 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1569 }
1570
1571 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1572 assert(u);
1573 assert(c);
1574
1575 if (!c->memory_deny_write_execute)
1576 return 0;
1577
1578 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1579 return 0;
1580
1581 return seccomp_memory_deny_write_execute();
1582 }
1583
1584 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1585 assert(u);
1586 assert(c);
1587
1588 if (!c->restrict_realtime)
1589 return 0;
1590
1591 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1592 return 0;
1593
1594 return seccomp_restrict_realtime();
1595 }
1596
1597 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1598 assert(u);
1599 assert(c);
1600
1601 if (!c->restrict_suid_sgid)
1602 return 0;
1603
1604 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1605 return 0;
1606
1607 return seccomp_restrict_suid_sgid();
1608 }
1609
1610 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1611 assert(u);
1612 assert(c);
1613
1614 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1615 * let's protect even those systems where this is left on in the kernel. */
1616
1617 if (!c->protect_kernel_tunables)
1618 return 0;
1619
1620 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1621 return 0;
1622
1623 return seccomp_protect_sysctl();
1624 }
1625
1626 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1627 assert(u);
1628 assert(c);
1629
1630 /* Turn off module syscalls on ProtectKernelModules=yes */
1631
1632 if (!c->protect_kernel_modules)
1633 return 0;
1634
1635 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1636 return 0;
1637
1638 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1639 }
1640
1641 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1642 assert(u);
1643 assert(c);
1644
1645 if (!c->protect_kernel_logs)
1646 return 0;
1647
1648 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1649 return 0;
1650
1651 return seccomp_protect_syslog();
1652 }
1653
1654 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1655 assert(u);
1656 assert(c);
1657
1658 if (!c->protect_clock)
1659 return 0;
1660
1661 if (skip_seccomp_unavailable(u, "ProtectClock="))
1662 return 0;
1663
1664 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1665 }
1666
1667 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1668 assert(u);
1669 assert(c);
1670
1671 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1672
1673 if (!c->private_devices)
1674 return 0;
1675
1676 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1677 return 0;
1678
1679 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1680 }
1681
1682 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1683 assert(u);
1684 assert(c);
1685
1686 if (!exec_context_restrict_namespaces_set(c))
1687 return 0;
1688
1689 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1690 return 0;
1691
1692 return seccomp_restrict_namespaces(c->restrict_namespaces);
1693 }
1694
1695 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1696 unsigned long personality;
1697 int r;
1698
1699 assert(u);
1700 assert(c);
1701
1702 if (!c->lock_personality)
1703 return 0;
1704
1705 if (skip_seccomp_unavailable(u, "LockPersonality="))
1706 return 0;
1707
1708 personality = c->personality;
1709
1710 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1711 if (personality == PERSONALITY_INVALID) {
1712
1713 r = opinionated_personality(&personality);
1714 if (r < 0)
1715 return r;
1716 }
1717
1718 return seccomp_lock_personality(personality);
1719 }
1720
1721 #endif
1722
1723 #if HAVE_LIBBPF
1724 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1725 assert(u);
1726 assert(c);
1727
1728 if (!exec_context_restrict_filesystems_set(c))
1729 return 0;
1730
1731 if (!u->manager->restrict_fs) {
1732 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1733 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1734 return 0;
1735 }
1736
1737 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1738 }
1739 #endif
1740
1741 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1742 assert(u);
1743 assert(c);
1744
1745 if (!c->protect_hostname)
1746 return 0;
1747
1748 if (ns_type_supported(NAMESPACE_UTS)) {
1749 if (unshare(CLONE_NEWUTS) < 0) {
1750 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1751 *ret_exit_status = EXIT_NAMESPACE;
1752 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1753 }
1754
1755 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1756 }
1757 } else
1758 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1759
1760 #if HAVE_SECCOMP
1761 int r;
1762
1763 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1764 return 0;
1765
1766 r = seccomp_protect_hostname();
1767 if (r < 0) {
1768 *ret_exit_status = EXIT_SECCOMP;
1769 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1770 }
1771 #endif
1772
1773 return 0;
1774 }
1775
1776 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1777 assert(idle_pipe);
1778
1779 idle_pipe[1] = safe_close(idle_pipe[1]);
1780 idle_pipe[2] = safe_close(idle_pipe[2]);
1781
1782 if (idle_pipe[0] >= 0) {
1783 int r;
1784
1785 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1786
1787 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1788 ssize_t n;
1789
1790 /* Signal systemd that we are bored and want to continue. */
1791 n = write(idle_pipe[3], "x", 1);
1792 if (n > 0)
1793 /* Wait for systemd to react to the signal above. */
1794 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1795 }
1796
1797 idle_pipe[0] = safe_close(idle_pipe[0]);
1798
1799 }
1800
1801 idle_pipe[3] = safe_close(idle_pipe[3]);
1802 }
1803
1804 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1805
1806 static int build_environment(
1807 const Unit *u,
1808 const ExecContext *c,
1809 const ExecParameters *p,
1810 size_t n_fds,
1811 char **fdnames,
1812 const char *home,
1813 const char *username,
1814 const char *shell,
1815 dev_t journal_stream_dev,
1816 ino_t journal_stream_ino,
1817 char ***ret) {
1818
1819 _cleanup_strv_free_ char **our_env = NULL;
1820 size_t n_env = 0;
1821 char *x;
1822
1823 assert(u);
1824 assert(c);
1825 assert(p);
1826 assert(ret);
1827
1828 #define N_ENV_VARS 17
1829 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1830 if (!our_env)
1831 return -ENOMEM;
1832
1833 if (n_fds > 0) {
1834 _cleanup_free_ char *joined = NULL;
1835
1836 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1837 return -ENOMEM;
1838 our_env[n_env++] = x;
1839
1840 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1841 return -ENOMEM;
1842 our_env[n_env++] = x;
1843
1844 joined = strv_join(fdnames, ":");
1845 if (!joined)
1846 return -ENOMEM;
1847
1848 x = strjoin("LISTEN_FDNAMES=", joined);
1849 if (!x)
1850 return -ENOMEM;
1851 our_env[n_env++] = x;
1852 }
1853
1854 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1855 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1856 return -ENOMEM;
1857 our_env[n_env++] = x;
1858
1859 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1860 return -ENOMEM;
1861 our_env[n_env++] = x;
1862 }
1863
1864 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1865 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1866 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1867 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1868 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1869 if (!x)
1870 return -ENOMEM;
1871 our_env[n_env++] = x;
1872 }
1873
1874 if (home) {
1875 x = strjoin("HOME=", home);
1876 if (!x)
1877 return -ENOMEM;
1878
1879 path_simplify(x + 5);
1880 our_env[n_env++] = x;
1881 }
1882
1883 if (username) {
1884 x = strjoin("LOGNAME=", username);
1885 if (!x)
1886 return -ENOMEM;
1887 our_env[n_env++] = x;
1888
1889 x = strjoin("USER=", username);
1890 if (!x)
1891 return -ENOMEM;
1892 our_env[n_env++] = x;
1893 }
1894
1895 if (shell) {
1896 x = strjoin("SHELL=", shell);
1897 if (!x)
1898 return -ENOMEM;
1899
1900 path_simplify(x + 6);
1901 our_env[n_env++] = x;
1902 }
1903
1904 if (!sd_id128_is_null(u->invocation_id)) {
1905 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1906 return -ENOMEM;
1907
1908 our_env[n_env++] = x;
1909 }
1910
1911 if (exec_context_needs_term(c)) {
1912 const char *tty_path, *term = NULL;
1913
1914 tty_path = exec_context_tty_path(c);
1915
1916 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1917 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1918 * container manager passes to PID 1 ends up all the way in the console login shown. */
1919
1920 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1921 term = getenv("TERM");
1922
1923 if (!term)
1924 term = default_term_for_tty(tty_path);
1925
1926 x = strjoin("TERM=", term);
1927 if (!x)
1928 return -ENOMEM;
1929 our_env[n_env++] = x;
1930 }
1931
1932 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1933 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1934 return -ENOMEM;
1935
1936 our_env[n_env++] = x;
1937 }
1938
1939 if (c->log_namespace) {
1940 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1941 if (!x)
1942 return -ENOMEM;
1943
1944 our_env[n_env++] = x;
1945 }
1946
1947 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1948 _cleanup_free_ char *joined = NULL;
1949 const char *n;
1950
1951 if (!p->prefix[t])
1952 continue;
1953
1954 if (c->directories[t].n_items == 0)
1955 continue;
1956
1957 n = exec_directory_env_name_to_string(t);
1958 if (!n)
1959 continue;
1960
1961 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1962 _cleanup_free_ char *prefixed = NULL;
1963
1964 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1965 if (!prefixed)
1966 return -ENOMEM;
1967
1968 if (!strextend_with_separator(&joined, ":", prefixed))
1969 return -ENOMEM;
1970 }
1971
1972 x = strjoin(n, "=", joined);
1973 if (!x)
1974 return -ENOMEM;
1975
1976 our_env[n_env++] = x;
1977 }
1978
1979 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1980 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1981 if (!x)
1982 return -ENOMEM;
1983
1984 our_env[n_env++] = x;
1985 }
1986
1987 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1988 return -ENOMEM;
1989
1990 our_env[n_env++] = x;
1991
1992 our_env[n_env++] = NULL;
1993 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1994 #undef N_ENV_VARS
1995
1996 *ret = TAKE_PTR(our_env);
1997
1998 return 0;
1999 }
2000
2001 static int build_pass_environment(const ExecContext *c, char ***ret) {
2002 _cleanup_strv_free_ char **pass_env = NULL;
2003 size_t n_env = 0;
2004
2005 STRV_FOREACH(i, c->pass_environment) {
2006 _cleanup_free_ char *x = NULL;
2007 char *v;
2008
2009 v = getenv(*i);
2010 if (!v)
2011 continue;
2012 x = strjoin(*i, "=", v);
2013 if (!x)
2014 return -ENOMEM;
2015
2016 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2017 return -ENOMEM;
2018
2019 pass_env[n_env++] = TAKE_PTR(x);
2020 pass_env[n_env] = NULL;
2021 }
2022
2023 *ret = TAKE_PTR(pass_env);
2024
2025 return 0;
2026 }
2027
2028 bool exec_needs_mount_namespace(
2029 const ExecContext *context,
2030 const ExecParameters *params,
2031 const ExecRuntime *runtime) {
2032
2033 assert(context);
2034
2035 if (context->root_image)
2036 return true;
2037
2038 if (!strv_isempty(context->read_write_paths) ||
2039 !strv_isempty(context->read_only_paths) ||
2040 !strv_isempty(context->inaccessible_paths) ||
2041 !strv_isempty(context->exec_paths) ||
2042 !strv_isempty(context->no_exec_paths))
2043 return true;
2044
2045 if (context->n_bind_mounts > 0)
2046 return true;
2047
2048 if (context->n_temporary_filesystems > 0)
2049 return true;
2050
2051 if (context->n_mount_images > 0)
2052 return true;
2053
2054 if (context->n_extension_images > 0)
2055 return true;
2056
2057 if (!strv_isempty(context->extension_directories))
2058 return true;
2059
2060 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
2061 return true;
2062
2063 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2064 return true;
2065
2066 if (context->private_devices ||
2067 context->private_mounts ||
2068 context->protect_system != PROTECT_SYSTEM_NO ||
2069 context->protect_home != PROTECT_HOME_NO ||
2070 context->protect_kernel_tunables ||
2071 context->protect_kernel_modules ||
2072 context->protect_kernel_logs ||
2073 context->protect_control_groups ||
2074 context->protect_proc != PROTECT_PROC_DEFAULT ||
2075 context->proc_subset != PROC_SUBSET_ALL ||
2076 context->private_ipc ||
2077 context->ipc_namespace_path)
2078 return true;
2079
2080 if (context->root_directory) {
2081 if (exec_context_get_effective_mount_apivfs(context))
2082 return true;
2083
2084 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2085 if (params && !params->prefix[t])
2086 continue;
2087
2088 if (context->directories[t].n_items > 0)
2089 return true;
2090 }
2091 }
2092
2093 if (context->dynamic_user &&
2094 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2095 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2096 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2097 return true;
2098
2099 if (context->log_namespace)
2100 return true;
2101
2102 return false;
2103 }
2104
2105 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2106 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2107 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2108 _cleanup_close_ int unshare_ready_fd = -EBADF;
2109 _cleanup_(sigkill_waitp) pid_t pid = 0;
2110 uint64_t c = 1;
2111 ssize_t n;
2112 int r;
2113
2114 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2115 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2116 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2117 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2118 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2119 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2120 * continues execution normally.
2121 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2122 * does not need CAP_SETUID to write the single line mapping to itself. */
2123
2124 /* Can only set up multiple mappings with CAP_SETUID. */
2125 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2126 r = asprintf(&uid_map,
2127 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2128 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2129 ouid, ouid, uid, uid);
2130 else
2131 r = asprintf(&uid_map,
2132 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2133 ouid, ouid);
2134
2135 if (r < 0)
2136 return -ENOMEM;
2137
2138 /* Can only set up multiple mappings with CAP_SETGID. */
2139 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2140 r = asprintf(&gid_map,
2141 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2142 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2143 ogid, ogid, gid, gid);
2144 else
2145 r = asprintf(&gid_map,
2146 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2147 ogid, ogid);
2148
2149 if (r < 0)
2150 return -ENOMEM;
2151
2152 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2153 * namespace. */
2154 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2155 if (unshare_ready_fd < 0)
2156 return -errno;
2157
2158 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2159 * failed. */
2160 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2161 return -errno;
2162
2163 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2164 if (r < 0)
2165 return r;
2166 if (r == 0) {
2167 _cleanup_close_ int fd = -EBADF;
2168 const char *a;
2169 pid_t ppid;
2170
2171 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2172 * here, after the parent opened its own user namespace. */
2173
2174 ppid = getppid();
2175 errno_pipe[0] = safe_close(errno_pipe[0]);
2176
2177 /* Wait until the parent unshared the user namespace */
2178 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2179 r = -errno;
2180 goto child_fail;
2181 }
2182
2183 /* Disable the setgroups() system call in the child user namespace, for good. */
2184 a = procfs_file_alloca(ppid, "setgroups");
2185 fd = open(a, O_WRONLY|O_CLOEXEC);
2186 if (fd < 0) {
2187 if (errno != ENOENT) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191
2192 /* If the file is missing the kernel is too old, let's continue anyway. */
2193 } else {
2194 if (write(fd, "deny\n", 5) < 0) {
2195 r = -errno;
2196 goto child_fail;
2197 }
2198
2199 fd = safe_close(fd);
2200 }
2201
2202 /* First write the GID map */
2203 a = procfs_file_alloca(ppid, "gid_map");
2204 fd = open(a, O_WRONLY|O_CLOEXEC);
2205 if (fd < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2210 r = -errno;
2211 goto child_fail;
2212 }
2213 fd = safe_close(fd);
2214
2215 /* The write the UID map */
2216 a = procfs_file_alloca(ppid, "uid_map");
2217 fd = open(a, O_WRONLY|O_CLOEXEC);
2218 if (fd < 0) {
2219 r = -errno;
2220 goto child_fail;
2221 }
2222 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2223 r = -errno;
2224 goto child_fail;
2225 }
2226
2227 _exit(EXIT_SUCCESS);
2228
2229 child_fail:
2230 (void) write(errno_pipe[1], &r, sizeof(r));
2231 _exit(EXIT_FAILURE);
2232 }
2233
2234 errno_pipe[1] = safe_close(errno_pipe[1]);
2235
2236 if (unshare(CLONE_NEWUSER) < 0)
2237 return -errno;
2238
2239 /* Let the child know that the namespace is ready now */
2240 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2241 return -errno;
2242
2243 /* Try to read an error code from the child */
2244 n = read(errno_pipe[0], &r, sizeof(r));
2245 if (n < 0)
2246 return -errno;
2247 if (n == sizeof(r)) { /* an error code was sent to us */
2248 if (r < 0)
2249 return r;
2250 return -EIO;
2251 }
2252 if (n != 0) /* on success we should have read 0 bytes */
2253 return -EIO;
2254
2255 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2256 if (r < 0)
2257 return r;
2258 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2259 return -EIO;
2260
2261 return 0;
2262 }
2263
2264 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2265 if (!context->dynamic_user)
2266 return false;
2267
2268 if (type == EXEC_DIRECTORY_CONFIGURATION)
2269 return false;
2270
2271 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2272 return false;
2273
2274 return true;
2275 }
2276
2277 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2278 _cleanup_free_ char *src_abs = NULL;
2279 int r;
2280
2281 assert(source);
2282
2283 src_abs = path_join(root, source);
2284 if (!src_abs)
2285 return -ENOMEM;
2286
2287 STRV_FOREACH(dst, symlinks) {
2288 _cleanup_free_ char *dst_abs = NULL;
2289
2290 dst_abs = path_join(root, *dst);
2291 if (!dst_abs)
2292 return -ENOMEM;
2293
2294 r = mkdir_parents_label(dst_abs, 0755);
2295 if (r < 0)
2296 return r;
2297
2298 r = symlink_idempotent(src_abs, dst_abs, true);
2299 if (r < 0)
2300 return r;
2301 }
2302
2303 return 0;
2304 }
2305
/* Creates and prepares all directories configured for the given directory type below
 * params->prefix[type]. Does nothing (returns 0) if no prefix is set for the type.
 *
 * For each configured item:
 *   - if the type is "private" (see exec_directory_is_private()), the real directory is created below
 *     "<prefix>/private/" (which is created 0700, owned by 0:0), a pre-existing public directory is
 *     moved there, and, unless the item is marked only_create, a symlink from the public path to the
 *     private one is installed;
 *   - otherwise the directory is created at the public path; if the public path is a symlink that
 *     resolves to the matching "<prefix>/private/" location, the private directory is moved back first.
 *     An already existing configuration directory is left untouched apart from a warning on a mode
 *     mismatch.
 * Afterwards the access mode is applied and ownership of the tree is changed to uid/gid. When
 * EXEC_CHOWN_DIRECTORIES is set in params->flags, an invalid uid/gid is replaced by 0.
 *
 * If needs_mount_namespace is false the per-item extra symlinks are created here too, via
 * create_many_symlinks().
 *
 * Returns 0 on success. On failure returns a negative errno-style value and stores the exit status
 * matching the directory type in *exit_status. */
2306 static int setup_exec_directory(
2307 const ExecContext *context,
2308 const ExecParameters *params,
2309 uid_t uid,
2310 gid_t gid,
2311 ExecDirectoryType type,
2312 bool needs_mount_namespace,
2313 int *exit_status) {
2314
/* Maps each directory type to the exit status reported via *exit_status on failure. */
2315 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2316 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2317 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2318 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2319 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2320 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2321 };
2322 int r;
2323
2324 assert(context);
2325 assert(params);
2326 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2327 assert(exit_status);
2328
2329 if (!params->prefix[type])
2330 return 0;
2331
2332 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2333 if (!uid_is_valid(uid))
2334 uid = 0;
2335 if (!gid_is_valid(gid))
2336 gid = 0;
2337 }
2338
2339 for (size_t i = 0; i < context->directories[type].n_items; i++) {
/* p: public path of the directory; pp: path below "<prefix>/private/", only set in the private case */
2340 _cleanup_free_ char *p = NULL, *pp = NULL;
2341
2342 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2343 if (!p) {
2344 r = -ENOMEM;
2345 goto fail;
2346 }
2347
2348 r = mkdir_parents_label(p, 0755);
2349 if (r < 0)
2350 goto fail;
2351
2352 if (exec_directory_is_private(context, type)) {
2353 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2354 * case we want to avoid leaving a directory around fully accessible that is owned by
2355 * a dynamic user whose UID is later on reused. To lock this down we use the same
2356 * trick used by container managers to prohibit host users to get access to files of
2357 * the same UID in containers: we place everything inside a directory that has an
2358 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2359 * for unprivileged host code. We then use fs namespacing to make this directory
2360 * permeable for the service itself.
2361 *
2362 * Specifically: for a service which wants a special directory "foo/" we first create
2363 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2364 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2365 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2366 * unprivileged host users can't look into it. Inside of the namespace of the unit
2367 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2368 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2369 * for the service and making sure it only gets access to the dirs it needs but no
2370 * others. Tricky? Yes, absolutely, but it works!
2371 *
2372 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2373 * to be owned by the service itself.
2374 *
2375 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2376 * for sharing files or sockets with other services. */
2377
2378 pp = path_join(params->prefix[type], "private");
2379 if (!pp) {
2380 r = -ENOMEM;
2381 goto fail;
2382 }
2383
2384 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2385 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2386 if (r < 0)
2387 goto fail;
2388
2389 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2390 r = -ENOMEM;
2391 goto fail;
2392 }
2393
2394 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2395 r = mkdir_parents_label(pp, 0755);
2396 if (r < 0)
2397 goto fail;
2398
2399 if (is_dir(p, false) > 0 &&
2400 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2401
2402 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2403 * it over. Most likely the service has been upgraded from one that didn't use
2404 * DynamicUser=1, to one that does. */
2405
2406 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2407 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2408 exec_directory_type_to_string(type), p, pp);
2409
2410 if (rename(p, pp) < 0) {
2411 r = -errno;
2412 goto fail;
2413 }
2414 } else {
2415 /* Otherwise, create the actual directory for the service */
2416
2417 r = mkdir_label(pp, context->directories[type].mode);
2418 if (r < 0 && r != -EEXIST)
2419 goto fail;
2420 }
2421
2422 if (!context->directories[type].items[i].only_create) {
2423 /* And link it up from the original place.
2424 * Notes
2425 * 1) If a mount namespace is going to be used, then this symlink remains on
2426 * the host, and a new one for the child namespace will be created later.
2427 * 2) It is not necessary to create this symlink when one of its parent
2428 * directories is specified and already created. E.g.
2429 * StateDirectory=foo foo/bar
2430 * In that case, the inode points to pp and p for "foo/bar" are the same:
2431 * pp = "/var/lib/private/foo/bar"
2432 * p = "/var/lib/foo/bar"
2433 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2434 * we do not need to create the symlink, but we cannot create the symlink.
2435 * See issue #24783. */
2436 r = symlink_idempotent(pp, p, true);
2437 if (r < 0)
2438 goto fail;
2439 }
2440
2441 } else {
2442 _cleanup_free_ char *target = NULL;
2443
2444 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2445 readlink_and_make_absolute(p, &target) >= 0) {
2446 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2447
2448 /* This already exists and is a symlink? Interesting. Maybe it's one created
2449 * by DynamicUser=1 (see above)?
2450 *
2451 * We do this for all directory types except for ConfigurationDirectory=,
2452 * since they all support the private/ symlink logic at least in some
2453 * configurations, see above. */
2454
2455 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2456 if (r < 0)
2457 goto fail;
2458
2459 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2460 if (!q) {
2461 r = -ENOMEM;
2462 goto fail;
2463 }
2464
2465 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2466 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2467 if (r < 0)
2468 goto fail;
2469
2470 if (path_equal(q_resolved, target_resolved)) {
2471
2472 /* Hmm, apparently DynamicUser= was once turned on for this service,
2473 * but is no longer. Let's move the directory back up. */
2474
2475 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2476 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2477 exec_directory_type_to_string(type), q, p);
2478
2479 if (unlink(p) < 0) {
2480 r = -errno;
2481 goto fail;
2482 }
2483
2484 if (rename(q, p) < 0) {
2485 r = -errno;
2486 goto fail;
2487 }
2488 }
2489 }
2490
2491 r = mkdir_label(p, context->directories[type].mode);
2492 if (r < 0) {
2493 if (r != -EEXIST)
2494 goto fail;
2495
2496 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2497 struct stat st;
2498
2499 /* Don't change the owner/access mode of the configuration directory,
2500 * as in the common case it is not written to by a service, and shall
2501 * not be writable. */
2502
2503 if (stat(p, &st) < 0) {
2504 r = -errno;
2505 goto fail;
2506 }
2507
2508 /* Still complain if the access mode doesn't match */
2509 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2510 log_warning("%s \'%s\' already exists but the mode is different. "
2511 "(File system: %o %sMode: %o)",
2512 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2513 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2514
/* Skip the chmod/chown steps below for this pre-existing configuration directory. */
2515 continue;
2516 }
2517 }
2518 }
2519
2520 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2521 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2522 * current UID/GID ownership.) */
2523 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2524 if (r < 0)
2525 goto fail;
2526
2527 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2528 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2529 * assignments to exist. */
2530 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
2531 if (r < 0)
2532 goto fail;
2533 }
2534
2535 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2536 * they are set up later, to allow configuring empty var/run/etc. */
2537 if (!needs_mount_namespace)
2538 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2539 r = create_many_symlinks(params->prefix[type],
2540 context->directories[type].items[i].path,
2541 context->directories[type].items[i].symlinks);
2542 if (r < 0)
2543 goto fail;
2544 }
2545
2546 return 0;
2547
2548 fail:
/* Every error path funnels through here so that the caller gets a type-specific exit status. */
2549 *exit_status = exit_status_table[type];
2550 return r;
2551 }
2552
2553 static int write_credential(
2554 int dfd,
2555 const char *id,
2556 const void *data,
2557 size_t size,
2558 uid_t uid,
2559 bool ownership_ok) {
2560
2561 _cleanup_(unlink_and_freep) char *tmp = NULL;
2562 _cleanup_close_ int fd = -EBADF;
2563 int r;
2564
2565 r = tempfn_random_child("", "cred", &tmp);
2566 if (r < 0)
2567 return r;
2568
2569 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2570 if (fd < 0) {
2571 tmp = mfree(tmp);
2572 return -errno;
2573 }
2574
2575 r = loop_write(fd, data, size, /* do_poll = */ false);
2576 if (r < 0)
2577 return r;
2578
2579 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2580 return -errno;
2581
2582 if (uid_is_valid(uid) && uid != getuid()) {
2583 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2584 if (r < 0) {
2585 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2586 return r;
2587
2588 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2589 * to express: that the user gets read access and nothing
2590 * else. But if the backing fs can't support that (e.g. ramfs)
2591 * then we can use file ownership instead. But that's only safe if
2592 * we can then re-mount the whole thing read-only, so that the
2593 * user can no longer chmod() the file to gain write access. */
2594 return r;
2595
2596 if (fchown(fd, uid, GID_INVALID) < 0)
2597 return -errno;
2598 }
2599 }
2600
2601 if (renameat(dfd, tmp, dfd, id) < 0)
2602 return -errno;
2603
2604 tmp = mfree(tmp);
2605 return 0;
2606 }
2607
2608 static char **credential_search_path(
2609 const ExecParameters *params,
2610 bool encrypted) {
2611
2612 _cleanup_strv_free_ char **l = NULL;
2613
2614 assert(params);
2615
2616 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2617 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2618 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2619
2620 if (encrypted) {
2621 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2622 return NULL;
2623
2624 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2625 return NULL;
2626 }
2627
2628 if (params->received_credentials_directory)
2629 if (strv_extend(&l, params->received_credentials_directory) < 0)
2630 return NULL;
2631
2632 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2633 return NULL;
2634
2635 if (DEBUG_LOGGING) {
2636 _cleanup_free_ char *t = strv_join(l, ":");
2637
2638 log_debug("Credential search path is: %s", t);
2639 }
2640
2641 return TAKE_PTR(l);
2642 }
2643
2644 static int load_credential(
2645 const ExecContext *context,
2646 const ExecParameters *params,
2647 const char *id,
2648 const char *path,
2649 bool encrypted,
2650 const char *unit,
2651 int read_dfd,
2652 int write_dfd,
2653 uid_t uid,
2654 bool ownership_ok,
2655 uint64_t *left) {
2656
2657 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2658 _cleanup_strv_free_ char **search_path = NULL;
2659 _cleanup_(erase_and_freep) char *data = NULL;
2660 _cleanup_free_ char *bindname = NULL;
2661 const char *source = NULL;
2662 bool missing_ok = true;
2663 size_t size, add, maxsz;
2664 int r;
2665
2666 assert(context);
2667 assert(params);
2668 assert(id);
2669 assert(path);
2670 assert(unit);
2671 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2672 assert(write_dfd >= 0);
2673 assert(left);
2674
2675 if (read_dfd >= 0) {
2676 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2677 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2678 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2679 * open it. */
2680
2681 if (!filename_is_valid(path)) /* safety check */
2682 return -EINVAL;
2683
2684 missing_ok = true;
2685 source = path;
2686
2687 } else if (path_is_absolute(path)) {
2688 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2689 * sockets */
2690
2691 if (!path_is_valid(path)) /* safety check */
2692 return -EINVAL;
2693
2694 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2695
2696 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2697 * via the source socket address in case we read off an AF_UNIX socket. */
2698 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
2699 return -ENOMEM;
2700
2701 missing_ok = false;
2702 source = path;
2703
2704 } else if (credential_name_valid(path)) {
2705 /* If this is a relative path, take it as credential name relative to the credentials
2706 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2707 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2708
2709 search_path = credential_search_path(params, encrypted);
2710 if (!search_path)
2711 return -ENOMEM;
2712
2713 missing_ok = true;
2714 } else
2715 source = NULL;
2716
2717 if (encrypted)
2718 flags |= READ_FULL_FILE_UNBASE64;
2719
2720 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2721
2722 if (search_path) {
2723 STRV_FOREACH(d, search_path) {
2724 _cleanup_free_ char *j = NULL;
2725
2726 j = path_join(*d, path);
2727 if (!j)
2728 return -ENOMEM;
2729
2730 r = read_full_file_full(
2731 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2732 UINT64_MAX,
2733 maxsz,
2734 flags,
2735 NULL,
2736 &data, &size);
2737 if (r != -ENOENT)
2738 break;
2739 }
2740 } else if (source)
2741 r = read_full_file_full(
2742 read_dfd, source,
2743 UINT64_MAX,
2744 maxsz,
2745 flags,
2746 bindname,
2747 &data, &size);
2748 else
2749 r = -ENOENT;
2750
2751 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
2752 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2753 * will get clear errors if we don't pass such a missing credential on as they
2754 * themselves will get ENOENT when trying to read them, which should not be much
2755 * worse than when we handle the error here and make it fatal.
2756 *
2757 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2758 * we are fine, too. */
2759 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
2760 return 0;
2761 }
2762 if (r < 0)
2763 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
2764
2765 if (encrypted) {
2766 _cleanup_free_ void *plaintext = NULL;
2767 size_t plaintext_size = 0;
2768
2769 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
2770 if (r < 0)
2771 return r;
2772
2773 free_and_replace(data, plaintext);
2774 size = plaintext_size;
2775 }
2776
2777 add = strlen(id) + size;
2778 if (add > *left)
2779 return -E2BIG;
2780
2781 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
2782 if (r < 0)
2783 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2784
2785 *left -= add;
2786 return 0;
2787 }
2788
/* Arguments handed to load_cred_recurse_dir_cb() as userdata when a credential source directory is
 * walked with recurse_dir(). All fields are forwarded to load_credential() for each entry found. */
2789 struct load_cred_args {
2790 const ExecContext *context;
2791 const ExecParameters *params;
2792 bool encrypted; /* whether the credentials found are to be decrypted */
2793 const char *unit;
2794 int dfd; /* directory fd the credentials are written to; also used to detect duplicate IDs */
2795 uid_t uid;
2796 bool ownership_ok;
2797 uint64_t *left; /* remaining size budget, decreased as credentials are stored */
2798 };
2799
2800 static int load_cred_recurse_dir_cb(
2801 RecurseDirEvent event,
2802 const char *path,
2803 int dir_fd,
2804 int inode_fd,
2805 const struct dirent *de,
2806 const struct statx *sx,
2807 void *userdata) {
2808
2809 struct load_cred_args *args = ASSERT_PTR(userdata);
2810 _cleanup_free_ char *sub_id = NULL;
2811 int r;
2812
2813 if (event != RECURSE_DIR_ENTRY)
2814 return RECURSE_DIR_CONTINUE;
2815
2816 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2817 return RECURSE_DIR_CONTINUE;
2818
2819 sub_id = strreplace(path, "/", "_");
2820 if (!sub_id)
2821 return -ENOMEM;
2822
2823 if (!credential_name_valid(sub_id))
2824 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
2825
2826 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
2827 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2828 return RECURSE_DIR_CONTINUE;
2829 }
2830 if (errno != ENOENT)
2831 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
2832
2833 r = load_credential(
2834 args->context,
2835 args->params,
2836 sub_id,
2837 de->d_name,
2838 args->encrypted,
2839 args->unit,
2840 dir_fd,
2841 args->dfd,
2842 args->uid,
2843 args->ownership_ok,
2844 args->left);
2845 if (r < 0)
2846 return r;
2847
2848 return RECURSE_DIR_CONTINUE;
2849 }
2850
2851 static int acquire_credentials(
2852 const ExecContext *context,
2853 const ExecParameters *params,
2854 const char *unit,
2855 const char *p,
2856 uid_t uid,
2857 bool ownership_ok) {
2858
2859 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
2860 _cleanup_close_ int dfd = -EBADF;
2861 ExecLoadCredential *lc;
2862 ExecSetCredential *sc;
2863 int r;
2864
2865 assert(context);
2866 assert(p);
2867
2868 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2869 if (dfd < 0)
2870 return -errno;
2871
2872 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2873 HASHMAP_FOREACH(lc, context->load_credentials) {
2874 _cleanup_close_ int sub_fd = -EBADF;
2875
2876 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2877 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2878 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2879 * propagate a credential passed to us from further up. */
2880
2881 if (path_is_absolute(lc->path)) {
2882 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
2883 if (sub_fd < 0 && !IN_SET(errno,
2884 ENOTDIR, /* Not a directory */
2885 ENOENT)) /* Doesn't exist? */
2886 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
2887 }
2888
2889 if (sub_fd < 0)
2890 /* Regular file (incl. a credential passed in from higher up) */
2891 r = load_credential(
2892 context,
2893 params,
2894 lc->id,
2895 lc->path,
2896 lc->encrypted,
2897 unit,
2898 AT_FDCWD,
2899 dfd,
2900 uid,
2901 ownership_ok,
2902 &left);
2903 else
2904 /* Directory */
2905 r = recurse_dir(
2906 sub_fd,
2907 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
2908 /* statx_mask= */ 0,
2909 /* n_depth_max= */ UINT_MAX,
2910 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
2911 load_cred_recurse_dir_cb,
2912 &(struct load_cred_args) {
2913 .context = context,
2914 .params = params,
2915 .encrypted = lc->encrypted,
2916 .unit = unit,
2917 .dfd = dfd,
2918 .uid = uid,
2919 .ownership_ok = ownership_ok,
2920 .left = &left,
2921 });
2922 if (r < 0)
2923 return r;
2924 }
2925
2926 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2927 * them, so that they can act as a "default" if the same credential is specified multiple times. */
2928 HASHMAP_FOREACH(sc, context->set_credentials) {
2929 _cleanup_(erase_and_freep) void *plaintext = NULL;
2930 const char *data;
2931 size_t size, add;
2932
2933 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2934 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2935 * slow and involved, hence it's nice to be able to skip that if the credential already
2936 * exists anyway. */
2937 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2938 continue;
2939 if (errno != ENOENT)
2940 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2941
2942 if (sc->encrypted) {
2943 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
2944 if (r < 0)
2945 return r;
2946
2947 data = plaintext;
2948 } else {
2949 data = sc->data;
2950 size = sc->size;
2951 }
2952
2953 add = strlen(sc->id) + size;
2954 if (add > left)
2955 return -E2BIG;
2956
2957 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2958 if (r < 0)
2959 return r;
2960
2961 left -= add;
2962 }
2963
2964 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2965 return -errno;
2966
2967 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2968 * accessible */
2969
2970 if (uid_is_valid(uid) && uid != getuid()) {
2971 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
2972 if (r < 0) {
2973 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2974 return r;
2975
2976 if (!ownership_ok)
2977 return r;
2978
2979 if (fchown(dfd, uid, GID_INVALID) < 0)
2980 return -errno;
2981 }
2982 }
2983
2984 return 0;
2985 }
2986
/* Prepares the credential store for a unit in 'workspace' and makes it available at 'final'.
 *
 * The function first determines what is already mounted: an existing mount on 'workspace' is only kept
 * if 'reuse_workspace' is set; if 'final' is already a mount point it is bind mounted onto the workspace
 * so that changes can be made there. If nothing is mounted on the workspace yet, the following is tried
 * in order: a fresh "ramfs", a fresh "tmpfs", a bind mount of 'final'. If the bind mount fails for lack
 * of privileges and 'must_mount' is false, the plain directory 'final' is used as is.
 *
 * The credentials are then written via acquire_credentials(). If a mount is in use it is remounted
 * read-only and either unmounted from the workspace again (when 'final' was a mount point already) or
 * moved to 'final'. In the plain directory case the parent directory of 'final' is made accessible
 * (0755) instead.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
2987 static int setup_credentials_internal(
2988 const ExecContext *context,
2989 const ExecParameters *params,
2990 const char *unit,
2991 const char *final, /* This is where the credential store shall eventually end up at */
2992 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2993 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2994 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2995 uid_t uid) {
2996
2997 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2998 * if we mounted something; false if we definitely can't mount anything */
2999 bool final_mounted;
3000 const char *where;
3001
3002 assert(context);
3003 assert(final);
3004 assert(workspace);
3005
3006 if (reuse_workspace) {
3007 r = path_is_mount_point(workspace, NULL, 0);
3008 if (r < 0)
3009 return r;
3010 if (r > 0)
3011 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3012 else
3013 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3014 } else
3015 workspace_mounted = -1; /* ditto */
3016
3017 r = path_is_mount_point(final, NULL, 0);
3018 if (r < 0)
3019 return r;
3020 if (r > 0) {
3021 /* If the final place already has something mounted, we use that. If the workspace also has
3022 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3023 * different). */
3024 final_mounted = true;
3025
3026 if (workspace_mounted < 0) {
3027 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3028 * the final version to the workspace, and make it writable, so that we can make
3029 * changes */
3030
3031 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3032 if (r < 0)
3033 return r;
3034
3035 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3036 if (r < 0)
3037 return r;
3038
3039 workspace_mounted = true;
3040 }
3041 } else
3042 final_mounted = false;
3043
3044 if (workspace_mounted < 0) {
3045 /* Nothing is mounted on the workspace yet, let's try to mount something now */
/* Every iteration either leaves the loop or returns; the third attempt always does, so this terminates. */
3046 for (int try = 0;; try++) {
3047
3048 if (try == 0) {
3049 /* Try "ramfs" first, since it's not swap backed */
3050 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3051 if (r >= 0) {
3052 workspace_mounted = true;
3053 break;
3054 }
3055
3056 } else if (try == 1) {
3057 _cleanup_free_ char *opts = NULL;
3058
3059 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
3060 return -ENOMEM;
3061
3062 /* Fall back to "tmpfs" otherwise */
3063 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3064 if (r >= 0) {
3065 workspace_mounted = true;
3066 break;
3067 }
3068
3069 } else {
3070 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3071 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3072 if (r < 0) {
3073 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3074 return r;
3075
3076 if (must_mount) /* If it's not OK to use the plain directory
3077 * fallback, propagate all errors too */
3078 return r;
3079
3080 /* If we lack privileges to bind mount stuff, then let's gracefully
3081 * proceed for compat with container envs, and just use the final dir
3082 * as is. */
3083
3084 workspace_mounted = false;
3085 break;
3086 }
3087
3088 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3089 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3090 if (r < 0)
3091 return r;
3092
3093 workspace_mounted = true;
3094 break;
3095 }
3096 }
3097 }
3098
3099 assert(!must_mount || workspace_mounted > 0);
/* Operate on the workspace if something is mounted there, otherwise directly on the final directory */
3100 where = workspace_mounted ? workspace : final;
3101
3102 (void) label_fix_full(AT_FDCWD, where, final, 0);
3103
/* workspace_mounted doubles as "ownership_ok": handing file ownership to the user is only allowed when we operate on a mount */
3104 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3105 if (r < 0)
3106 return r;
3107
3108 if (workspace_mounted) {
3109 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3110 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3111 if (r < 0)
3112 return r;
3113
3114 /* And mount it to the final place, read-only */
3115 if (final_mounted)
3116 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3117 else
3118 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3119 if (r < 0)
3120 return r;
3121 } else {
3122 _cleanup_free_ char *parent = NULL;
3123
3124 /* If we do not have our own mount but used the plain directory fallback, then we need to
3125 * open access to the top-level credential directory and the per-service directory now */
3126
3127 r = path_extract_directory(final, &parent);
3128 if (r < 0)
3129 return r;
3130 if (chmod(parent, 0755) < 0)
3131 return -errno;
3132 }
3133
3134 return 0;
3135 }
3136
3137 static int setup_credentials(
3138 const ExecContext *context,
3139 const ExecParameters *params,
3140 const char *unit,
3141 uid_t uid) {
3142
3143 _cleanup_free_ char *p = NULL, *q = NULL;
3144 int r;
3145
3146 assert(context);
3147 assert(params);
3148
3149 if (!exec_context_has_credentials(context))
3150 return 0;
3151
3152 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3153 return -EINVAL;
3154
3155 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3156 * and the subdir we mount over with a read-only file system readable by the service's user */
3157 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3158 if (!q)
3159 return -ENOMEM;
3160
3161 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3162 if (r < 0 && r != -EEXIST)
3163 return r;
3164
3165 p = path_join(q, unit);
3166 if (!p)
3167 return -ENOMEM;
3168
3169 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3170 if (r < 0 && r != -EEXIST)
3171 return r;
3172
3173 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3174 if (r < 0) {
3175 _cleanup_free_ char *t = NULL, *u = NULL;
3176
3177 /* If this is not a privilege or support issue then propagate the error */
3178 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3179 return r;
3180
3181 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3182 * it into place, so that users can't access half-initialized credential stores. */
3183 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3184 if (!t)
3185 return -ENOMEM;
3186
3187 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3188 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3189 * after it is fully set up */
3190 u = path_join(t, unit);
3191 if (!u)
3192 return -ENOMEM;
3193
3194 FOREACH_STRING(i, t, u) {
3195 r = mkdir_label(i, 0700);
3196 if (r < 0 && r != -EEXIST)
3197 return r;
3198 }
3199
3200 r = setup_credentials_internal(
3201 context,
3202 params,
3203 unit,
3204 p, /* final mount point */
3205 u, /* temporary workspace to overmount */
3206 true, /* reuse the workspace if it is already a mount */
3207 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3208 uid);
3209
3210 (void) rmdir(u); /* remove the workspace again if we can. */
3211
3212 if (r < 0)
3213 return r;
3214
3215 } else if (r == 0) {
3216
3217 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3218 * we can use the same directory for all cases, after turning off propagation. Question
3219 * though is: where do we turn off propagation exactly, and where do we place the workspace
3220 * directory? We need some place that is guaranteed to be a mount point in the host, and
3221 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3222 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3223 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3224 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3225 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3226 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3227 * propagation on the former, and then overmount the latter.
3228 *
3229 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3230 * for this purpose, but there are few other candidates that work equally well for us, and
3231 * given that the we do this in a privately namespaced short-lived single-threaded process
3232 * that no one else sees this should be OK to do. */
3233
3234 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3235 if (r < 0)
3236 goto child_fail;
3237
3238 r = setup_credentials_internal(
3239 context,
3240 params,
3241 unit,
3242 p, /* final mount point */
3243 "/dev/shm", /* temporary workspace to overmount */
3244 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3245 true, /* insist that something is mounted, do not allow fallback to plain directory */
3246 uid);
3247 if (r < 0)
3248 goto child_fail;
3249
3250 _exit(EXIT_SUCCESS);
3251
3252 child_fail:
3253 _exit(EXIT_FAILURE);
3254 }
3255
3256 return 0;
3257 }
3258
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * An explicitly configured context->smack_process_label takes precedence. Otherwise, if the manager has
 * a default process label, the label read from the executable's SMACK_ATTR_EXEC attribute is used if
 * there is one, and the manager's default if there is none. If neither is configured nothing is done.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

        if (!manager->default_smack_process_label)
                return 0;

        /* A missing attribute is fine, we then fall back to the manager's default below */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
        if (r < 0)
                return r;

        return 0;
}
#endif
3288
3289 static int compile_bind_mounts(
3290 const ExecContext *context,
3291 const ExecParameters *params,
3292 BindMount **ret_bind_mounts,
3293 size_t *ret_n_bind_mounts,
3294 char ***ret_empty_directories) {
3295
3296 _cleanup_strv_free_ char **empty_directories = NULL;
3297 BindMount *bind_mounts;
3298 size_t n, h = 0;
3299 int r;
3300
3301 assert(context);
3302 assert(params);
3303 assert(ret_bind_mounts);
3304 assert(ret_n_bind_mounts);
3305 assert(ret_empty_directories);
3306
3307 n = context->n_bind_mounts;
3308 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3309 if (!params->prefix[t])
3310 continue;
3311
3312 for (size_t i = 0; i < context->directories[t].n_items; i++)
3313 n += !context->directories[t].items[i].only_create;
3314 }
3315
3316 if (n <= 0) {
3317 *ret_bind_mounts = NULL;
3318 *ret_n_bind_mounts = 0;
3319 *ret_empty_directories = NULL;
3320 return 0;
3321 }
3322
3323 bind_mounts = new(BindMount, n);
3324 if (!bind_mounts)
3325 return -ENOMEM;
3326
3327 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3328 BindMount *item = context->bind_mounts + i;
3329 char *s, *d;
3330
3331 s = strdup(item->source);
3332 if (!s) {
3333 r = -ENOMEM;
3334 goto finish;
3335 }
3336
3337 d = strdup(item->destination);
3338 if (!d) {
3339 free(s);
3340 r = -ENOMEM;
3341 goto finish;
3342 }
3343
3344 bind_mounts[h++] = (BindMount) {
3345 .source = s,
3346 .destination = d,
3347 .read_only = item->read_only,
3348 .recursive = item->recursive,
3349 .ignore_enoent = item->ignore_enoent,
3350 };
3351 }
3352
3353 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3354 if (!params->prefix[t])
3355 continue;
3356
3357 if (context->directories[t].n_items == 0)
3358 continue;
3359
3360 if (exec_directory_is_private(context, t) &&
3361 !exec_context_with_rootfs(context)) {
3362 char *private_root;
3363
3364 /* So this is for a dynamic user, and we need to make sure the process can access its own
3365 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3366 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3367
3368 private_root = path_join(params->prefix[t], "private");
3369 if (!private_root) {
3370 r = -ENOMEM;
3371 goto finish;
3372 }
3373
3374 r = strv_consume(&empty_directories, private_root);
3375 if (r < 0)
3376 goto finish;
3377 }
3378
3379 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3380 char *s, *d;
3381
3382 /* When one of the parent directories is in the list, we cannot create the symlink
3383 * for the child directory. See also the comments in setup_exec_directory(). */
3384 if (context->directories[t].items[i].only_create)
3385 continue;
3386
3387 if (exec_directory_is_private(context, t))
3388 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3389 else
3390 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3391 if (!s) {
3392 r = -ENOMEM;
3393 goto finish;
3394 }
3395
3396 if (exec_directory_is_private(context, t) &&
3397 exec_context_with_rootfs(context))
3398 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3399 * directory is not created on the root directory. So, let's bind-mount the directory
3400 * on the 'non-private' place. */
3401 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3402 else
3403 d = strdup(s);
3404 if (!d) {
3405 free(s);
3406 r = -ENOMEM;
3407 goto finish;
3408 }
3409
3410 bind_mounts[h++] = (BindMount) {
3411 .source = s,
3412 .destination = d,
3413 .read_only = false,
3414 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3415 .recursive = true,
3416 .ignore_enoent = false,
3417 };
3418 }
3419 }
3420
3421 assert(h == n);
3422
3423 *ret_bind_mounts = bind_mounts;
3424 *ret_n_bind_mounts = n;
3425 *ret_empty_directories = TAKE_PTR(empty_directories);
3426
3427 return (int) n;
3428
3429 finish:
3430 bind_mount_free_many(bind_mounts, h);
3431 return r;
3432 }
3433
3434 /* ret_symlinks will contain a list of pairs src:dest that describes
3435 * the symlinks to create later on. For example, the symlinks needed
3436 * to safely give private directories to DynamicUser=1 users. */
3437 static int compile_symlinks(
3438 const ExecContext *context,
3439 const ExecParameters *params,
3440 char ***ret_symlinks) {
3441
3442 _cleanup_strv_free_ char **symlinks = NULL;
3443 int r;
3444
3445 assert(context);
3446 assert(params);
3447 assert(ret_symlinks);
3448
3449 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3450 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3451 _cleanup_free_ char *private_path = NULL, *path = NULL;
3452
3453 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3454 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3455
3456 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3457 dst_abs = path_join(params->prefix[dt], *symlink);
3458 if (!src_abs || !dst_abs)
3459 return -ENOMEM;
3460
3461 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3462 if (r < 0)
3463 return r;
3464 }
3465
3466 if (!exec_directory_is_private(context, dt) ||
3467 exec_context_with_rootfs(context) ||
3468 context->directories[dt].items[i].only_create)
3469 continue;
3470
3471 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3472 if (!private_path)
3473 return -ENOMEM;
3474
3475 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3476 if (!path)
3477 return -ENOMEM;
3478
3479 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3480 if (r < 0)
3481 return r;
3482 }
3483 }
3484
3485 *ret_symlinks = TAKE_PTR(symlinks);
3486
3487 return 0;
3488 }
3489
3490 static bool insist_on_sandboxing(
3491 const ExecContext *context,
3492 const char *root_dir,
3493 const char *root_image,
3494 const BindMount *bind_mounts,
3495 size_t n_bind_mounts) {
3496
3497 assert(context);
3498 assert(n_bind_mounts == 0 || bind_mounts);
3499
3500 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3501 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3502 * rearrange stuff in a way we cannot ignore gracefully. */
3503
3504 if (context->n_temporary_filesystems > 0)
3505 return true;
3506
3507 if (root_dir || root_image)
3508 return true;
3509
3510 if (context->n_mount_images > 0)
3511 return true;
3512
3513 if (context->dynamic_user)
3514 return true;
3515
3516 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3517 return true;
3518
3519 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3520 * essential. */
3521 for (size_t i = 0; i < n_bind_mounts; i++)
3522 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3523 return true;
3524
3525 if (context->log_namespace)
3526 return true;
3527
3528 return false;
3529 }
3530
3531 static int apply_mount_namespace(
3532 const Unit *u,
3533 ExecCommandFlags command_flags,
3534 const ExecContext *context,
3535 const ExecParameters *params,
3536 const ExecRuntime *runtime,
3537 char **error_path) {
3538
3539 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
3540 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
3541 const char *root_dir = NULL, *root_image = NULL;
3542 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3543 *extension_dir = NULL;
3544 NamespaceInfo ns_info;
3545 bool needs_sandboxing;
3546 BindMount *bind_mounts = NULL;
3547 size_t n_bind_mounts = 0;
3548 int r;
3549
3550 assert(context);
3551
3552 if (params->flags & EXEC_APPLY_CHROOT) {
3553 root_image = context->root_image;
3554
3555 if (!root_image)
3556 root_dir = context->root_directory;
3557 }
3558
3559 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3560 if (r < 0)
3561 return r;
3562
3563 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3564 r = compile_symlinks(context, params, &symlinks);
3565 if (r < 0)
3566 goto finalize;
3567
3568 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
3569 if (needs_sandboxing) {
3570 /* The runtime struct only contains the parent of the private /tmp,
3571 * which is non-accessible to world users. Inside of it there's a /tmp
3572 * that is sticky, and that's the one we want to use here.
3573 * This does not apply when we are using /run/systemd/empty as fallback. */
3574
3575 if (context->private_tmp && runtime) {
3576 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3577 tmp_dir = runtime->tmp_dir;
3578 else if (runtime->tmp_dir)
3579 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3580
3581 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3582 var_tmp_dir = runtime->var_tmp_dir;
3583 else if (runtime->var_tmp_dir)
3584 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
3585 }
3586
3587 ns_info = (NamespaceInfo) {
3588 .ignore_protect_paths = false,
3589 .private_dev = context->private_devices,
3590 .protect_control_groups = context->protect_control_groups,
3591 .protect_kernel_tunables = context->protect_kernel_tunables,
3592 .protect_kernel_modules = context->protect_kernel_modules,
3593 .protect_kernel_logs = context->protect_kernel_logs,
3594 .protect_hostname = context->protect_hostname,
3595 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
3596 .private_mounts = context->private_mounts,
3597 .protect_home = context->protect_home,
3598 .protect_system = context->protect_system,
3599 .protect_proc = context->protect_proc,
3600 .proc_subset = context->proc_subset,
3601 .private_ipc = context->private_ipc || context->ipc_namespace_path,
3602 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3603 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
3604 };
3605 } else if (!context->dynamic_user && root_dir)
3606 /*
3607 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3608 * sandbox info, otherwise enforce it, don't ignore protected paths and
3609 * fail if we are enable to apply the sandbox inside the mount namespace.
3610 */
3611 ns_info = (NamespaceInfo) {
3612 .ignore_protect_paths = true,
3613 };
3614 else
3615 ns_info = (NamespaceInfo) {};
3616
3617 if (context->mount_flags == MS_SHARED)
3618 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3619
3620 if (exec_context_has_credentials(context) &&
3621 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3622 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3623 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3624 if (!creds_path) {
3625 r = -ENOMEM;
3626 goto finalize;
3627 }
3628 }
3629
3630 if (MANAGER_IS_SYSTEM(u->manager)) {
3631 propagate_dir = path_join("/run/systemd/propagate/", u->id);
3632 if (!propagate_dir) {
3633 r = -ENOMEM;
3634 goto finalize;
3635 }
3636
3637 incoming_dir = strdup("/run/systemd/incoming");
3638 if (!incoming_dir) {
3639 r = -ENOMEM;
3640 goto finalize;
3641 }
3642
3643 extension_dir = strdup("/run/systemd/unit-extensions");
3644 if (!extension_dir) {
3645 r = -ENOMEM;
3646 goto finalize;
3647 }
3648 } else
3649 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3650 r = -ENOMEM;
3651 goto finalize;
3652 }
3653
3654 r = setup_namespace(root_dir, root_image, context->root_image_options,
3655 &ns_info, context->read_write_paths,
3656 needs_sandboxing ? context->read_only_paths : NULL,
3657 needs_sandboxing ? context->inaccessible_paths : NULL,
3658 needs_sandboxing ? context->exec_paths : NULL,
3659 needs_sandboxing ? context->no_exec_paths : NULL,
3660 empty_directories,
3661 symlinks,
3662 bind_mounts,
3663 n_bind_mounts,
3664 context->temporary_filesystems,
3665 context->n_temporary_filesystems,
3666 context->mount_images,
3667 context->n_mount_images,
3668 tmp_dir,
3669 var_tmp_dir,
3670 creds_path,
3671 context->log_namespace,
3672 context->mount_flags,
3673 context->root_hash, context->root_hash_size, context->root_hash_path,
3674 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3675 context->root_verity,
3676 context->extension_images,
3677 context->n_extension_images,
3678 context->extension_directories,
3679 propagate_dir,
3680 incoming_dir,
3681 extension_dir,
3682 root_dir || root_image ? params->notify_socket : NULL,
3683 error_path);
3684
3685 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
3686 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
3687 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3688 * completely different execution environment. */
3689 if (r == -ENOANO) {
3690 if (insist_on_sandboxing(
3691 context,
3692 root_dir, root_image,
3693 bind_mounts,
3694 n_bind_mounts)) {
3695 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3696 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3697 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3698
3699 r = -EOPNOTSUPP;
3700 } else {
3701 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3702 r = 0;
3703 }
3704 }
3705
3706 finalize:
3707 bind_mount_free_many(bind_mounts, n_bind_mounts);
3708 return r;
3709 }
3710
3711 static int apply_working_directory(
3712 const ExecContext *context,
3713 const ExecParameters *params,
3714 const char *home,
3715 int *exit_status) {
3716
3717 const char *d, *wd;
3718
3719 assert(context);
3720 assert(exit_status);
3721
3722 if (context->working_directory_home) {
3723
3724 if (!home) {
3725 *exit_status = EXIT_CHDIR;
3726 return -ENXIO;
3727 }
3728
3729 wd = home;
3730
3731 } else
3732 wd = empty_to_root(context->working_directory);
3733
3734 if (params->flags & EXEC_APPLY_CHROOT)
3735 d = wd;
3736 else
3737 d = prefix_roota(context->root_directory, wd);
3738
3739 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3740 *exit_status = EXIT_CHDIR;
3741 return -errno;
3742 }
3743
3744 return 0;
3745 }
3746
3747 static int apply_root_directory(
3748 const ExecContext *context,
3749 const ExecParameters *params,
3750 const bool needs_mount_ns,
3751 int *exit_status) {
3752
3753 assert(context);
3754 assert(exit_status);
3755
3756 if (params->flags & EXEC_APPLY_CHROOT)
3757 if (!needs_mount_ns && context->root_directory)
3758 if (chroot(context->root_directory) < 0) {
3759 *exit_status = EXIT_CHROOT;
3760 return -errno;
3761 }
3762
3763 return 0;
3764 }
3765
3766 static int setup_keyring(
3767 const Unit *u,
3768 const ExecContext *context,
3769 const ExecParameters *p,
3770 uid_t uid, gid_t gid) {
3771
3772 key_serial_t keyring;
3773 int r = 0;
3774 uid_t saved_uid;
3775 gid_t saved_gid;
3776
3777 assert(u);
3778 assert(context);
3779 assert(p);
3780
3781 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3782 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3783 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3784 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3785 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3786 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3787
3788 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3789 return 0;
3790
3791 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3792 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3793 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3794 * & group is just as nasty as acquiring a reference to the user keyring. */
3795
3796 saved_uid = getuid();
3797 saved_gid = getgid();
3798
3799 if (gid_is_valid(gid) && gid != saved_gid) {
3800 if (setregid(gid, -1) < 0)
3801 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3802 }
3803
3804 if (uid_is_valid(uid) && uid != saved_uid) {
3805 if (setreuid(uid, -1) < 0) {
3806 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3807 goto out;
3808 }
3809 }
3810
3811 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3812 if (keyring == -1) {
3813 if (errno == ENOSYS)
3814 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
3815 else if (ERRNO_IS_PRIVILEGE(errno))
3816 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
3817 else if (errno == EDQUOT)
3818 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
3819 else
3820 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
3821
3822 goto out;
3823 }
3824
3825 /* When requested link the user keyring into the session keyring. */
3826 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3827
3828 if (keyctl(KEYCTL_LINK,
3829 KEY_SPEC_USER_KEYRING,
3830 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3831 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3832 goto out;
3833 }
3834 }
3835
3836 /* Restore uid/gid back */
3837 if (uid_is_valid(uid) && uid != saved_uid) {
3838 if (setreuid(saved_uid, -1) < 0) {
3839 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3840 goto out;
3841 }
3842 }
3843
3844 if (gid_is_valid(gid) && gid != saved_gid) {
3845 if (setregid(saved_gid, -1) < 0)
3846 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3847 }
3848
3849 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
3850 if (!sd_id128_is_null(u->invocation_id)) {
3851 key_serial_t key;
3852
3853 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3854 if (key == -1)
3855 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
3856 else {
3857 if (keyctl(KEYCTL_SETPERM, key,
3858 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3859 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
3860 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
3861 }
3862 }
3863
3864 out:
3865 /* Revert back uid & gid for the last time, and exit */
3866 /* no extra logging, as only the first already reported error matters */
3867 if (getuid() != saved_uid)
3868 (void) setreuid(saved_uid, -1);
3869
3870 if (getgid() != saved_gid)
3871 (void) setregid(saved_gid, -1);
3872
3873 return r;
3874 }
3875
3876 static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
3877 assert(array);
3878 assert(n);
3879 assert(pair);
3880
3881 if (pair[0] >= 0)
3882 array[(*n)++] = pair[0];
3883 if (pair[1] >= 0)
3884 array[(*n)++] = pair[1];
3885 }
3886
3887 static int close_remaining_fds(
3888 const ExecParameters *params,
3889 const ExecRuntime *runtime,
3890 const DynamicCreds *dcreds,
3891 int user_lookup_fd,
3892 int socket_fd,
3893 const int *fds, size_t n_fds) {
3894
3895 size_t n_dont_close = 0;
3896 int dont_close[n_fds + 12];
3897
3898 assert(params);
3899
3900 if (params->stdin_fd >= 0)
3901 dont_close[n_dont_close++] = params->stdin_fd;
3902 if (params->stdout_fd >= 0)
3903 dont_close[n_dont_close++] = params->stdout_fd;
3904 if (params->stderr_fd >= 0)
3905 dont_close[n_dont_close++] = params->stderr_fd;
3906
3907 if (socket_fd >= 0)
3908 dont_close[n_dont_close++] = socket_fd;
3909 if (n_fds > 0) {
3910 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3911 n_dont_close += n_fds;
3912 }
3913
3914 if (runtime) {
3915 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
3916 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3917 }
3918
3919 if (dcreds) {
3920 if (dcreds->user)
3921 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3922 if (dcreds->group)
3923 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
3924 }
3925
3926 if (user_lookup_fd >= 0)
3927 dont_close[n_dont_close++] = user_lookup_fd;
3928
3929 return close_all_fds(dont_close, n_dont_close);
3930 }
3931
3932 static int send_user_lookup(
3933 Unit *unit,
3934 int user_lookup_fd,
3935 uid_t uid,
3936 gid_t gid) {
3937
3938 assert(unit);
3939
3940 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3941 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3942 * specified. */
3943
3944 if (user_lookup_fd < 0)
3945 return 0;
3946
3947 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3948 return 0;
3949
3950 if (writev(user_lookup_fd,
3951 (struct iovec[]) {
3952 IOVEC_INIT(&uid, sizeof(uid)),
3953 IOVEC_INIT(&gid, sizeof(gid)),
3954 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
3955 return -errno;
3956
3957 return 0;
3958 }
3959
3960 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3961 int r;
3962
3963 assert(c);
3964 assert(home);
3965 assert(buf);
3966
3967 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3968
3969 if (*home)
3970 return 0;
3971
3972 if (!c->working_directory_home)
3973 return 0;
3974
3975 r = get_home_dir(buf);
3976 if (r < 0)
3977 return r;
3978
3979 *home = *buf;
3980 return 1;
3981 }
3982
3983 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3984 _cleanup_strv_free_ char ** list = NULL;
3985 int r;
3986
3987 assert(c);
3988 assert(p);
3989 assert(ret);
3990
3991 assert(c->dynamic_user);
3992
3993 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3994 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3995 * directories. */
3996
3997 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3998 if (t == EXEC_DIRECTORY_CONFIGURATION)
3999 continue;
4000
4001 if (!p->prefix[t])
4002 continue;
4003
4004 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4005 char *e;
4006
4007 if (exec_directory_is_private(c, t))
4008 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4009 else
4010 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4011 if (!e)
4012 return -ENOMEM;
4013
4014 r = strv_consume(&list, e);
4015 if (r < 0)
4016 return r;
4017 }
4018 }
4019
4020 *ret = TAKE_PTR(list);
4021
4022 return 0;
4023 }
4024
4025 static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4026 bool using_subcgroup;
4027 char *p;
4028
4029 assert(params);
4030 assert(ret);
4031
4032 if (!params->cgroup_path)
4033 return -EINVAL;
4034
4035 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4036 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4037 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4038 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4039 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4040 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4041 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4042 * flag, which is only passed for the former statements, not for the latter. */
4043
4044 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4045 if (using_subcgroup)
4046 p = path_join(params->cgroup_path, ".control");
4047 else
4048 p = strdup(params->cgroup_path);
4049 if (!p)
4050 return -ENOMEM;
4051
4052 *ret = p;
4053 return using_subcgroup;
4054 }
4055
4056 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4057 _cleanup_(cpu_set_reset) CPUSet s = {};
4058 int r;
4059
4060 assert(c);
4061 assert(ret);
4062
4063 if (!c->numa_policy.nodes.set) {
4064 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4065 return 0;
4066 }
4067
4068 r = numa_to_cpu_set(&c->numa_policy, &s);
4069 if (r < 0)
4070 return r;
4071
4072 cpu_set_reset(ret);
4073
4074 return cpu_set_add_all(ret, &s);
4075 }
4076
4077 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4078 assert(c);
4079
4080 return c->cpu_affinity_from_numa;
4081 }
4082
4083 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4084 int r;
4085
4086 assert(fds);
4087 assert(n_fds);
4088 assert(*n_fds < fds_size);
4089 assert(ret_fd);
4090
4091 if (fd < 0) {
4092 *ret_fd = -EBADF;
4093 return 0;
4094 }
4095
4096 if (fd < 3 + (int) *n_fds) {
4097 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4098 * the fds we pass to the process (or which are closed only during execve). */
4099
4100 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4101 if (r < 0)
4102 return -errno;
4103
4104 close_and_replace(fd, r);
4105 }
4106
4107 *ret_fd = fds[*n_fds] = fd;
4108 (*n_fds) ++;
4109 return 1;
4110 }
4111
4112 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4113 union sockaddr_union addr = {
4114 .un.sun_family = AF_UNIX,
4115 };
4116 socklen_t sa_len;
4117 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4118 int r;
4119
4120 assert(u);
4121 assert(of);
4122 assert(ofd >= 0);
4123
4124 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4125 if (r < 0)
4126 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4127
4128 sa_len = r;
4129
4130 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4131 _cleanup_close_ int fd = -EBADF;
4132
4133 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4134 if (fd < 0)
4135 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4136
4137 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4138 if (r == -EPROTOTYPE)
4139 continue;
4140 if (r < 0)
4141 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4142
4143 return TAKE_FD(fd);
4144 }
4145
4146 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4147 }
4148
4149 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4150 struct stat st;
4151 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4152
4153 assert(u);
4154 assert(of);
4155
4156 ofd = open(of->path, O_PATH | O_CLOEXEC);
4157 if (ofd < 0)
4158 return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
4159 if (fstat(ofd, &st) < 0)
4160 return log_error_errno(errno, "Failed to stat %s: %m", of->path);
4161
4162 if (S_ISSOCK(st.st_mode)) {
4163 fd = connect_unix_harder(u, of, ofd);
4164 if (fd < 0)
4165 return fd;
4166
4167 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4168 return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
4169
4170 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4171 } else {
4172 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4173 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4174 flags |= O_APPEND;
4175 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4176 flags |= O_TRUNC;
4177
4178 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4179 if (fd < 0)
4180 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4181
4182 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4183 }
4184
4185 return TAKE_FD(fd);
4186 }
4187
4188 static int collect_open_file_fds(
4189 Unit *u,
4190 OpenFile* open_files,
4191 int **fds,
4192 char ***fdnames,
4193 size_t *n_fds) {
4194 int r;
4195
4196 assert(u);
4197 assert(fds);
4198 assert(fdnames);
4199 assert(n_fds);
4200
4201 LIST_FOREACH(open_files, of, open_files) {
4202 _cleanup_close_ int fd = -EBADF;
4203
4204 fd = get_open_file_fd(u, of);
4205 if (fd < 0) {
4206 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4207 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4208 continue;
4209 }
4210
4211 return fd;
4212 }
4213
4214 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4215 return -ENOMEM;
4216
4217 r = strv_extend(fdnames, of->fdname);
4218 if (r < 0)
4219 return r;
4220
4221 (*fds)[*n_fds] = TAKE_FD(fd);
4222
4223 (*n_fds)++;
4224 }
4225
4226 return 0;
4227 }
4228
4229 static int exec_child(
4230 Unit *unit,
4231 const ExecCommand *command,
4232 const ExecContext *context,
4233 const ExecParameters *params,
4234 ExecRuntime *runtime,
4235 DynamicCreds *dcreds,
4236 int socket_fd,
4237 const int named_iofds[static 3],
4238 int *params_fds,
4239 size_t n_socket_fds,
4240 size_t n_storage_fds,
4241 char **files_env,
4242 int user_lookup_fd,
4243 int *exit_status) {
4244
4245 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4246 int r, ngids = 0, exec_fd;
4247 _cleanup_free_ gid_t *supplementary_gids = NULL;
4248 const char *username = NULL, *groupname = NULL;
4249 _cleanup_free_ char *home_buffer = NULL;
4250 const char *home = NULL, *shell = NULL;
4251 char **final_argv = NULL;
4252 dev_t journal_stream_dev = 0;
4253 ino_t journal_stream_ino = 0;
4254 bool userns_set_up = false;
4255 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4256 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4257 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4258 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4259 #if HAVE_SELINUX
4260 _cleanup_free_ char *mac_selinux_context_net = NULL;
4261 bool use_selinux = false;
4262 #endif
4263 #if ENABLE_SMACK
4264 bool use_smack = false;
4265 #endif
4266 #if HAVE_APPARMOR
4267 bool use_apparmor = false;
4268 #endif
4269 uid_t saved_uid = getuid();
4270 gid_t saved_gid = getgid();
4271 uid_t uid = UID_INVALID;
4272 gid_t gid = GID_INVALID;
4273 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4274 n_keep_fds; /* total number of fds not to close */
4275 int secure_bits;
4276 _cleanup_free_ gid_t *gids_after_pam = NULL;
4277 int ngids_after_pam = 0;
4278 _cleanup_free_ int *fds = NULL;
4279 _cleanup_strv_free_ char **fdnames = NULL;
4280
4281 assert(unit);
4282 assert(command);
4283 assert(context);
4284 assert(params);
4285 assert(exit_status);
4286
4287 /* Explicitly test for CVE-2021-4034 inspired invocations */
4288 assert(command->path);
4289 assert(!strv_isempty(command->argv));
4290
4291 rename_process_from_path(command->path);
4292
4293 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4294 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4295 * both of which will be demoted to SIG_DFL. */
4296 (void) default_signals(SIGNALS_CRASH_HANDLER,
4297 SIGNALS_IGNORE);
4298
4299 if (context->ignore_sigpipe)
4300 (void) ignore_signals(SIGPIPE);
4301
4302 r = reset_signal_mask();
4303 if (r < 0) {
4304 *exit_status = EXIT_SIGNAL_MASK;
4305 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4306 }
4307
4308 if (params->idle_pipe)
4309 do_idle_pipe_dance(params->idle_pipe);
4310
4311 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4312 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4313 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4314 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4315
4316 log_forget_fds();
4317 log_set_open_when_needed(true);
4318
4319 /* In case anything used libc syslog(), close this here, too */
4320 closelog();
4321
4322 fds = newdup(int, params_fds, n_fds);
4323 if (!fds) {
4324 *exit_status = EXIT_MEMORY;
4325 return log_oom();
4326 }
4327
4328 fdnames = strv_copy((char**) params->fd_names);
4329 if (!fdnames) {
4330 *exit_status = EXIT_MEMORY;
4331 return log_oom();
4332 }
4333
4334 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4335 if (r < 0) {
4336 *exit_status = EXIT_FDS;
4337 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4338 }
4339
4340 int keep_fds[n_fds + 3];
4341 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4342 n_keep_fds = n_fds;
4343
4344 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4345 if (r < 0) {
4346 *exit_status = EXIT_FDS;
4347 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4348 }
4349
4350 #if HAVE_LIBBPF
4351 if (unit->manager->restrict_fs) {
4352 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4353 if (bpf_map_fd < 0) {
4354 *exit_status = EXIT_FDS;
4355 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4356 }
4357
4358 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4359 if (r < 0) {
4360 *exit_status = EXIT_FDS;
4361 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4362 }
4363 }
4364 #endif
4365
4366 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4367 if (r < 0) {
4368 *exit_status = EXIT_FDS;
4369 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4370 }
4371
4372 if (!context->same_pgrp &&
4373 setsid() < 0) {
4374 *exit_status = EXIT_SETSID;
4375 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4376 }
4377
4378 exec_context_tty_reset(context, params);
4379
4380 if (unit_shall_confirm_spawn(unit)) {
4381 _cleanup_free_ char *cmdline = NULL;
4382
4383 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4384 if (!cmdline) {
4385 *exit_status = EXIT_MEMORY;
4386 return log_oom();
4387 }
4388
4389 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4390 if (r != CONFIRM_EXECUTE) {
4391 if (r == CONFIRM_PRETEND_SUCCESS) {
4392 *exit_status = EXIT_SUCCESS;
4393 return 0;
4394 }
4395 *exit_status = EXIT_CONFIRM;
4396 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4397 "Execution cancelled by the user");
4398 }
4399 }
4400
4401 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4402 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4403 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4404 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4405 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4406 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4407 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4408 *exit_status = EXIT_MEMORY;
4409 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4410 }
4411
4412 if (context->dynamic_user && dcreds) {
4413 _cleanup_strv_free_ char **suggested_paths = NULL;
4414
4415 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4416 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4417 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4418 *exit_status = EXIT_USER;
4419 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4420 }
4421
4422 r = compile_suggested_paths(context, params, &suggested_paths);
4423 if (r < 0) {
4424 *exit_status = EXIT_MEMORY;
4425 return log_oom();
4426 }
4427
4428 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
4429 if (r < 0) {
4430 *exit_status = EXIT_USER;
4431 if (r == -EILSEQ)
4432 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4433 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4434 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4435 }
4436
4437 if (!uid_is_valid(uid)) {
4438 *exit_status = EXIT_USER;
4439 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4440 }
4441
4442 if (!gid_is_valid(gid)) {
4443 *exit_status = EXIT_USER;
4444 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4445 }
4446
4447 if (dcreds->user)
4448 username = dcreds->user->name;
4449
4450 } else {
4451 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4452 if (r < 0) {
4453 *exit_status = EXIT_USER;
4454 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4455 }
4456
4457 r = get_fixed_group(context, &groupname, &gid);
4458 if (r < 0) {
4459 *exit_status = EXIT_GROUP;
4460 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4461 }
4462 }
4463
4464 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4465 r = get_supplementary_groups(context, username, groupname, gid,
4466 &supplementary_gids, &ngids);
4467 if (r < 0) {
4468 *exit_status = EXIT_GROUP;
4469 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
4470 }
4471
4472 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4473 if (r < 0) {
4474 *exit_status = EXIT_USER;
4475 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
4476 }
4477
4478 user_lookup_fd = safe_close(user_lookup_fd);
4479
4480 r = acquire_home(context, uid, &home, &home_buffer);
4481 if (r < 0) {
4482 *exit_status = EXIT_CHDIR;
4483 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
4484 }
4485
4486 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4487 * must be sure to drop O_NONBLOCK */
4488 if (socket_fd >= 0)
4489 (void) fd_nonblock(socket_fd, false);
4490
4491 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4492 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4493 if (params->cgroup_path) {
4494 _cleanup_free_ char *p = NULL;
4495
4496 r = exec_parameters_get_cgroup_path(params, &p);
4497 if (r < 0) {
4498 *exit_status = EXIT_CGROUP;
4499 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4500 }
4501
4502 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4503 if (r == -EUCLEAN) {
4504 *exit_status = EXIT_CGROUP;
4505 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4506 "because the cgroup or one of its parents or "
4507 "siblings is in the threaded mode: %m", p);
4508 }
4509 if (r < 0) {
4510 *exit_status = EXIT_CGROUP;
4511 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4512 }
4513 }
4514
4515 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
4516 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
4517 if (r < 0) {
4518 *exit_status = EXIT_NETWORK;
4519 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4520 }
4521 }
4522
4523 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4524 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4525 if (r < 0) {
4526 *exit_status = EXIT_NAMESPACE;
4527 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4528 }
4529 }
4530
4531 r = setup_input(context, params, socket_fd, named_iofds);
4532 if (r < 0) {
4533 *exit_status = EXIT_STDIN;
4534 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
4535 }
4536
4537 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4538 if (r < 0) {
4539 *exit_status = EXIT_STDOUT;
4540 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
4541 }
4542
4543 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
4544 if (r < 0) {
4545 *exit_status = EXIT_STDERR;
4546 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
4547 }
4548
4549 if (context->oom_score_adjust_set) {
4550 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4551 * prohibit write access to this file, and we shouldn't trip up over that. */
4552 r = set_oom_score_adjust(context->oom_score_adjust);
4553 if (ERRNO_IS_PRIVILEGE(r))
4554 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4555 else if (r < 0) {
4556 *exit_status = EXIT_OOM_ADJUST;
4557 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
4558 }
4559 }
4560
4561 if (context->coredump_filter_set) {
4562 r = set_coredump_filter(context->coredump_filter);
4563 if (ERRNO_IS_PRIVILEGE(r))
4564 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4565 else if (r < 0)
4566 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4567 }
4568
4569 if (context->nice_set) {
4570 r = setpriority_closest(context->nice);
4571 if (r < 0)
4572 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4573 }
4574
4575 if (context->cpu_sched_set) {
4576 struct sched_param param = {
4577 .sched_priority = context->cpu_sched_priority,
4578 };
4579
4580 r = sched_setscheduler(0,
4581 context->cpu_sched_policy |
4582 (context->cpu_sched_reset_on_fork ?
4583 SCHED_RESET_ON_FORK : 0),
4584 &param);
4585 if (r < 0) {
4586 *exit_status = EXIT_SETSCHEDULER;
4587 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
4588 }
4589 }
4590
4591 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4592 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4593 const CPUSet *cpu_set;
4594
4595 if (context->cpu_affinity_from_numa) {
4596 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4597 if (r < 0) {
4598 *exit_status = EXIT_CPUAFFINITY;
4599 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4600 }
4601
4602 cpu_set = &converted_cpu_set;
4603 } else
4604 cpu_set = &context->cpu_set;
4605
4606 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
4607 *exit_status = EXIT_CPUAFFINITY;
4608 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
4609 }
4610 }
4611
4612 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4613 r = apply_numa_policy(&context->numa_policy);
4614 if (r == -EOPNOTSUPP)
4615 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4616 else if (r < 0) {
4617 *exit_status = EXIT_NUMA_POLICY;
4618 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4619 }
4620 }
4621
4622 if (context->ioprio_set)
4623 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
4624 *exit_status = EXIT_IOPRIO;
4625 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
4626 }
4627
4628 if (context->timer_slack_nsec != NSEC_INFINITY)
4629 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
4630 *exit_status = EXIT_TIMERSLACK;
4631 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4632 }
4633
4634 if (context->personality != PERSONALITY_INVALID) {
4635 r = safe_personality(context->personality);
4636 if (r < 0) {
4637 *exit_status = EXIT_PERSONALITY;
4638 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4639 }
4640 }
4641
4642 if (context->utmp_id) {
4643 const char *line = context->tty_path ?
4644 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4645 NULL;
4646 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
4647 line,
4648 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4649 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4650 USER_PROCESS,
4651 username);
4652 }
4653
4654 if (uid_is_valid(uid)) {
4655 r = chown_terminal(STDIN_FILENO, uid);
4656 if (r < 0) {
4657 *exit_status = EXIT_STDIN;
4658 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
4659 }
4660 }
4661
4662 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4663 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4664 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4665 * touch a single hierarchy too. */
4666 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
4667 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4668 if (r < 0) {
4669 *exit_status = EXIT_CGROUP;
4670 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4671 }
4672 }
4673
4674 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4675
4676 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
4677 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
4678 if (r < 0)
4679 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
4680 }
4681
4682 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4683 r = setup_credentials(context, params, unit->id, uid);
4684 if (r < 0) {
4685 *exit_status = EXIT_CREDENTIALS;
4686 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4687 }
4688 }
4689
4690 r = build_environment(
4691 unit,
4692 context,
4693 params,
4694 n_fds,
4695 fdnames,
4696 home,
4697 username,
4698 shell,
4699 journal_stream_dev,
4700 journal_stream_ino,
4701 &our_env);
4702 if (r < 0) {
4703 *exit_status = EXIT_MEMORY;
4704 return log_oom();
4705 }
4706
4707 r = build_pass_environment(context, &pass_env);
4708 if (r < 0) {
4709 *exit_status = EXIT_MEMORY;
4710 return log_oom();
4711 }
4712
4713 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4714 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4715 * not specify PATH but the unit has ExecSearchPath. */
4716 if (!strv_isempty(context->exec_search_path)) {
4717 _cleanup_free_ char *joined = NULL;
4718
4719 joined = strv_join(context->exec_search_path, ":");
4720 if (!joined) {
4721 *exit_status = EXIT_MEMORY;
4722 return log_oom();
4723 }
4724
4725 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4726 if (r < 0) {
4727 *exit_status = EXIT_MEMORY;
4728 return log_oom();
4729 }
4730 }
4731
4732 accum_env = strv_env_merge(params->environment,
4733 our_env,
4734 joined_exec_search_path,
4735 pass_env,
4736 context->environment,
4737 files_env);
4738 if (!accum_env) {
4739 *exit_status = EXIT_MEMORY;
4740 return log_oom();
4741 }
4742 accum_env = strv_env_clean(accum_env);
4743
4744 (void) umask(context->umask);
4745
4746 r = setup_keyring(unit, context, params, uid, gid);
4747 if (r < 0) {
4748 *exit_status = EXIT_KEYRING;
4749 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
4750 }
4751
4752 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4753 * from it. */
4754 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4755
4756 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4757 * for it, and the kernel doesn't actually support ambient caps. */
4758 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
4759
4760 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4761 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4762 * desired. */
4763 if (needs_ambient_hack)
4764 needs_setuid = false;
4765 else
4766 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4767
4768 if (needs_sandboxing) {
4769 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4770 * /sys being present. The actual MAC context application will happen later, as late as
4771 * possible, to avoid impacting our own code paths. */
4772
4773 #if HAVE_SELINUX
4774 use_selinux = mac_selinux_use();
4775 #endif
4776 #if ENABLE_SMACK
4777 use_smack = mac_smack_use();
4778 #endif
4779 #if HAVE_APPARMOR
4780 use_apparmor = mac_apparmor_use();
4781 #endif
4782 }
4783
4784 if (needs_sandboxing) {
4785 int which_failed;
4786
4787 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4788 * is set here. (See below.) */
4789
4790 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4791 if (r < 0) {
4792 *exit_status = EXIT_LIMITS;
4793 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4794 }
4795 }
4796
4797 if (needs_setuid && context->pam_name && username) {
4798 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4799 * wins here. (See above.) */
4800
4801 /* All fds passed in the fds array will be closed in the pam child process. */
4802 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4803 if (r < 0) {
4804 *exit_status = EXIT_PAM;
4805 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
4806 }
4807
4808 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4809 if (ngids_after_pam < 0) {
4810 *exit_status = EXIT_MEMORY;
4811 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
4812 }
4813 }
4814
4815 if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
4816 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4817 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4818 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
4819
4820 userns_set_up = true;
4821 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4822 if (r < 0) {
4823 *exit_status = EXIT_USER;
4824 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
4825 }
4826 }
4827
4828 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4829
4830 if (ns_type_supported(NAMESPACE_NET)) {
4831 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
4832 if (r == -EPERM)
4833 log_unit_warning_errno(unit, r,
4834 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4835 else if (r < 0) {
4836 *exit_status = EXIT_NETWORK;
4837 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4838 }
4839 } else if (context->network_namespace_path) {
4840 *exit_status = EXIT_NETWORK;
4841 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4842 "NetworkNamespacePath= is not supported, refusing.");
4843 } else
4844 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
4845 }
4846
4847 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4848
4849 if (ns_type_supported(NAMESPACE_IPC)) {
4850 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4851 if (r == -EPERM)
4852 log_unit_warning_errno(unit, r,
4853 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4854 else if (r < 0) {
4855 *exit_status = EXIT_NAMESPACE;
4856 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4857 }
4858 } else if (context->ipc_namespace_path) {
4859 *exit_status = EXIT_NAMESPACE;
4860 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4861 "IPCNamespacePath= is not supported, refusing.");
4862 } else
4863 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4864 }
4865
4866 if (needs_mount_namespace) {
4867 _cleanup_free_ char *error_path = NULL;
4868
4869 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
4870 if (r < 0) {
4871 *exit_status = EXIT_NAMESPACE;
4872 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4873 error_path ? ": " : "", strempty(error_path));
4874 }
4875 }
4876
4877 if (needs_sandboxing) {
4878 r = apply_protect_hostname(unit, context, exit_status);
4879 if (r < 0)
4880 return r;
4881 }
4882
4883 /* Drop groups as early as possible.
4884 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4885 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
4886 if (needs_setuid) {
4887 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4888 int ngids_to_enforce = 0;
4889
4890 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4891 ngids,
4892 gids_after_pam,
4893 ngids_after_pam,
4894 &gids_to_enforce);
4895 if (ngids_to_enforce < 0) {
4896 *exit_status = EXIT_MEMORY;
4897 return log_unit_error_errno(unit,
4898 ngids_to_enforce,
4899 "Failed to merge group lists. Group membership might be incorrect: %m");
4900 }
4901
4902 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
4903 if (r < 0) {
4904 *exit_status = EXIT_GROUP;
4905 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
4906 }
4907 }
4908
4909 /* If the user namespace was not set up above, try to do it now.
4910 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4911 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4912 * case of mount namespaces being less privileged when the mount point list is copied from a
4913 * different user namespace). */
4914
4915 if (needs_sandboxing && context->private_users && !userns_set_up) {
4916 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4917 if (r < 0) {
4918 *exit_status = EXIT_USER;
4919 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
4920 }
4921 }
4922
4923 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4924 * shall execute. */
4925
4926 _cleanup_free_ char *executable = NULL;
4927 _cleanup_close_ int executable_fd = -EBADF;
4928 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
4929 if (r < 0) {
4930 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
4931 log_unit_struct_errno(unit, LOG_INFO, r,
4932 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4933 LOG_UNIT_INVOCATION_ID(unit),
4934 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4935 command->path),
4936 "EXECUTABLE=%s", command->path);
4937 return 0;
4938 }
4939
4940 *exit_status = EXIT_EXEC;
4941
4942 return log_unit_struct_errno(unit, LOG_INFO, r,
4943 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4944 LOG_UNIT_INVOCATION_ID(unit),
4945 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4946 command->path),
4947 "EXECUTABLE=%s", command->path);
4948 }
4949
4950 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4951 if (r < 0) {
4952 *exit_status = EXIT_FDS;
4953 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4954 }
4955
4956 #if HAVE_SELINUX
4957 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4958 int fd = -EBADF;
4959
4960 if (socket_fd >= 0)
4961 fd = socket_fd;
4962 else if (params->n_socket_fds == 1)
4963 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4964 * use context from that fd to compute the label. */
4965 fd = params->fds[0];
4966
4967 if (fd >= 0) {
4968 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4969 if (r < 0) {
4970 if (!context->selinux_context_ignore) {
4971 *exit_status = EXIT_SELINUX_CONTEXT;
4972 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4973 }
4974 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
4975 }
4976 }
4977 }
4978 #endif
4979
4980 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
4981 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
4982 * however if we have it as we want to keep it open until the final execve(). */
4983
4984 r = close_all_fds(keep_fds, n_keep_fds);
4985 if (r >= 0)
4986 r = shift_fds(fds, n_fds);
4987 if (r >= 0)
4988 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
4989 if (r < 0) {
4990 *exit_status = EXIT_FDS;
4991 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
4992 }
4993
4994 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4995 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4996 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4997 * came this far. */
4998
4999 secure_bits = context->secure_bits;
5000
5001 if (needs_sandboxing) {
5002 uint64_t bset;
5003
5004 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5005 * requested. (Note this is placed after the general resource limit initialization, see
5006 * above, in order to take precedence.) */
5007 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5008 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5009 *exit_status = EXIT_LIMITS;
5010 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5011 }
5012 }
5013
5014 #if ENABLE_SMACK
5015 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5016 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5017 if (use_smack) {
5018 r = setup_smack(unit->manager, context, executable_fd);
5019 if (r < 0 && !context->smack_process_label_ignore) {
5020 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5021 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5022 }
5023 }
5024 #endif
5025
5026 bset = context->capability_bounding_set;
5027 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5028 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5029 * instead of us doing that */
5030 if (needs_ambient_hack)
5031 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5032 (UINT64_C(1) << CAP_SETUID) |
5033 (UINT64_C(1) << CAP_SETGID);
5034
5035 if (!cap_test_all(bset)) {
5036 r = capability_bounding_set_drop(bset, false);
5037 if (r < 0) {
5038 *exit_status = EXIT_CAPABILITIES;
5039 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5040 }
5041 }
5042
5043 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5044 * keep-caps set.
5045 * To be able to raise the ambient capabilities after setresuid() they have to be
5046 * added to the inherited set and keep caps has to be set (done in enforce_user()).
5047 * After setresuid() the ambient capabilities can be raised as they are present in
5048 * the permitted and inheritable set. However it is possible that someone wants to
5049 * set ambient capabilities without changing the user, so we also set the ambient
5050 * capabilities here.
5051 * The requested ambient capabilities are raised in the inheritable set if the
5052 * second argument is true. */
5053 if (!needs_ambient_hack) {
5054 r = capability_ambient_set_apply(context->capability_ambient_set, true);
5055 if (r < 0) {
5056 *exit_status = EXIT_CAPABILITIES;
5057 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5058 }
5059 }
5060 }
5061
5062 /* chroot to root directory first, before we lose the ability to chroot */
5063 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5064 if (r < 0)
5065 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5066
5067 if (needs_setuid) {
5068 if (uid_is_valid(uid)) {
5069 r = enforce_user(context, uid);
5070 if (r < 0) {
5071 *exit_status = EXIT_USER;
5072 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5073 }
5074
5075 if (!needs_ambient_hack &&
5076 context->capability_ambient_set != 0) {
5077
5078 /* Raise the ambient capabilities after user change. */
5079 r = capability_ambient_set_apply(context->capability_ambient_set, false);
5080 if (r < 0) {
5081 *exit_status = EXIT_CAPABILITIES;
5082 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5083 }
5084 }
5085 }
5086 }
5087
5088 /* Apply working directory here, because the working directory might be on NFS and only the user running
5089 * this service might have the correct privilege to change to the working directory */
5090 r = apply_working_directory(context, params, home, exit_status);
5091 if (r < 0)
5092 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5093
5094 if (needs_sandboxing) {
5095 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5096 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5097 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5098 * are restricted. */
5099
5100 #if HAVE_SELINUX
5101 if (use_selinux) {
5102 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5103
5104 if (exec_context) {
5105 r = setexeccon(exec_context);
5106 if (r < 0) {
5107 if (!context->selinux_context_ignore) {
5108 *exit_status = EXIT_SELINUX_CONTEXT;
5109 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5110 }
5111 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5112 }
5113 }
5114 }
5115 #endif
5116
5117 #if HAVE_APPARMOR
5118 if (use_apparmor && context->apparmor_profile) {
5119 r = aa_change_onexec(context->apparmor_profile);
5120 if (r < 0 && !context->apparmor_profile_ignore) {
5121 *exit_status = EXIT_APPARMOR_PROFILE;
5122 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5123 }
5124 }
5125 #endif
5126
5127 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
5128 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
5129 * CAP_SETPCAP. */
5130 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5131 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5132 * effective set here.
5133 * The effective set is overwritten during execve with the following values:
5134 * - ambient set (for non-root processes)
5135 * - (inheritable | bounding) set for root processes)
5136 *
5137 * Hence there is no security impact to raise it in the effective set before execve
5138 */
5139 r = capability_gain_cap_setpcap(NULL);
5140 if (r < 0) {
5141 *exit_status = EXIT_CAPABILITIES;
5142 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5143 }
5144 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5145 *exit_status = EXIT_SECUREBITS;
5146 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5147 }
5148 }
5149
5150 if (context_has_no_new_privileges(context))
5151 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5152 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5153 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5154 }
5155
5156 #if HAVE_SECCOMP
5157 r = apply_address_families(unit, context);
5158 if (r < 0) {
5159 *exit_status = EXIT_ADDRESS_FAMILIES;
5160 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5161 }
5162
5163 r = apply_memory_deny_write_execute(unit, context);
5164 if (r < 0) {
5165 *exit_status = EXIT_SECCOMP;
5166 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5167 }
5168
5169 r = apply_restrict_realtime(unit, context);
5170 if (r < 0) {
5171 *exit_status = EXIT_SECCOMP;
5172 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5173 }
5174
5175 r = apply_restrict_suid_sgid(unit, context);
5176 if (r < 0) {
5177 *exit_status = EXIT_SECCOMP;
5178 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5179 }
5180
5181 r = apply_restrict_namespaces(unit, context);
5182 if (r < 0) {
5183 *exit_status = EXIT_SECCOMP;
5184 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5185 }
5186
5187 r = apply_protect_sysctl(unit, context);
5188 if (r < 0) {
5189 *exit_status = EXIT_SECCOMP;
5190 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5191 }
5192
5193 r = apply_protect_kernel_modules(unit, context);
5194 if (r < 0) {
5195 *exit_status = EXIT_SECCOMP;
5196 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5197 }
5198
5199 r = apply_protect_kernel_logs(unit, context);
5200 if (r < 0) {
5201 *exit_status = EXIT_SECCOMP;
5202 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5203 }
5204
5205 r = apply_protect_clock(unit, context);
5206 if (r < 0) {
5207 *exit_status = EXIT_SECCOMP;
5208 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5209 }
5210
5211 r = apply_private_devices(unit, context);
5212 if (r < 0) {
5213 *exit_status = EXIT_SECCOMP;
5214 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5215 }
5216
5217 r = apply_syscall_archs(unit, context);
5218 if (r < 0) {
5219 *exit_status = EXIT_SECCOMP;
5220 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5221 }
5222
5223 r = apply_lock_personality(unit, context);
5224 if (r < 0) {
5225 *exit_status = EXIT_SECCOMP;
5226 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5227 }
5228
5229 r = apply_syscall_log(unit, context);
5230 if (r < 0) {
5231 *exit_status = EXIT_SECCOMP;
5232 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5233 }
5234
5235 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5236 * by the filter as little as possible. */
5237 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5238 if (r < 0) {
5239 *exit_status = EXIT_SECCOMP;
5240 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5241 }
5242 #endif
5243
5244 #if HAVE_LIBBPF
5245 r = apply_restrict_filesystems(unit, context);
5246 if (r < 0) {
5247 *exit_status = EXIT_BPF;
5248 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5249 }
5250 #endif
5251
5252 }
5253
5254 if (!strv_isempty(context->unset_environment)) {
5255 char **ee = NULL;
5256
5257 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5258 if (!ee) {
5259 *exit_status = EXIT_MEMORY;
5260 return log_oom();
5261 }
5262
5263 strv_free_and_replace(accum_env, ee);
5264 }
5265
5266 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5267 replaced_argv = replace_env_argv(command->argv, accum_env);
5268 if (!replaced_argv) {
5269 *exit_status = EXIT_MEMORY;
5270 return log_oom();
5271 }
5272 final_argv = replaced_argv;
5273 } else
5274 final_argv = command->argv;
5275
5276 if (DEBUG_LOGGING) {
5277 _cleanup_free_ char *line = NULL;
5278
5279 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
5280 if (!line) {
5281 *exit_status = EXIT_MEMORY;
5282 return log_oom();
5283 }
5284
5285 log_unit_struct(unit, LOG_DEBUG,
5286 "EXECUTABLE=%s", executable,
5287 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
5288 }
5289
5290 if (exec_fd >= 0) {
5291 uint8_t hot = 1;
5292
5293 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5294 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5295
5296 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5297 *exit_status = EXIT_EXEC;
5298 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5299 }
5300 }
5301
5302 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5303
5304 if (exec_fd >= 0) {
5305 uint8_t hot = 0;
5306
5307 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5308 * that POLLHUP on it no longer means execve() succeeded. */
5309
5310 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5311 *exit_status = EXIT_EXEC;
5312 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5313 }
5314 }
5315
5316 *exit_status = EXIT_EXEC;
5317 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5318 }
5319
5320 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
5321 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
5322
5323 int exec_spawn(Unit *unit,
5324 ExecCommand *command,
5325 const ExecContext *context,
5326 const ExecParameters *params,
5327 ExecRuntime *runtime,
5328 DynamicCreds *dcreds,
5329 pid_t *ret) {
5330
5331 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5332 _cleanup_free_ char *subcgroup_path = NULL;
5333 _cleanup_strv_free_ char **files_env = NULL;
5334 size_t n_storage_fds = 0, n_socket_fds = 0;
5335 _cleanup_free_ char *line = NULL;
5336 pid_t pid;
5337
5338 assert(unit);
5339 assert(command);
5340 assert(context);
5341 assert(ret);
5342 assert(params);
5343 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5344
5345 if (context->std_input == EXEC_INPUT_SOCKET ||
5346 context->std_output == EXEC_OUTPUT_SOCKET ||
5347 context->std_error == EXEC_OUTPUT_SOCKET) {
5348
5349 if (params->n_socket_fds > 1)
5350 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5351
5352 if (params->n_socket_fds == 0)
5353 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5354
5355 socket_fd = params->fds[0];
5356 } else {
5357 socket_fd = -EBADF;
5358 fds = params->fds;
5359 n_socket_fds = params->n_socket_fds;
5360 n_storage_fds = params->n_storage_fds;
5361 }
5362
5363 r = exec_context_named_iofds(context, params, named_iofds);
5364 if (r < 0)
5365 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5366
5367 r = exec_context_load_environment(unit, context, &files_env);
5368 if (r < 0)
5369 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
5370
5371 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
5372 if (!line)
5373 return log_oom();
5374
5375 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5376 and, until the next SELinux policy changes, we save further reloads in future children. */
5377 mac_selinux_maybe_reload();
5378
5379 log_unit_struct(unit, LOG_DEBUG,
5380 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5381 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5382 the mount namespace in the child, but we want to log
5383 from the parent, so we need to use the (possibly
5384 inaccurate) path here. */
5385 LOG_UNIT_INVOCATION_ID(unit));
5386
5387 if (params->cgroup_path) {
5388 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5389 if (r < 0)
5390 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5391 if (r > 0) { /* We are using a child cgroup */
5392 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5393 if (r < 0)
5394 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
5395
5396 /* Normally we would not propagate the xattrs to children but since we created this
5397 * sub-cgroup internally we should do it. */
5398 cgroup_oomd_xattr_apply(unit, subcgroup_path);
5399 cgroup_log_xattr_apply(unit, subcgroup_path);
5400 }
5401 }
5402
5403 pid = fork();
5404 if (pid < 0)
5405 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
5406
5407 if (pid == 0) {
5408 int exit_status = EXIT_SUCCESS;
5409
5410 r = exec_child(unit,
5411 command,
5412 context,
5413 params,
5414 runtime,
5415 dcreds,
5416 socket_fd,
5417 named_iofds,
5418 fds,
5419 n_socket_fds,
5420 n_storage_fds,
5421 files_env,
5422 unit->manager->user_lookup_fds[1],
5423 &exit_status);
5424
5425 if (r < 0) {
5426 const char *status =
5427 exit_status_to_string(exit_status,
5428 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
5429
5430 log_unit_struct_errno(unit, LOG_ERR, r,
5431 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5432 LOG_UNIT_INVOCATION_ID(unit),
5433 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5434 status, command->path),
5435 "EXECUTABLE=%s", command->path);
5436 }
5437
5438 _exit(exit_status);
5439 }
5440
5441 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
5442
5443 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5444 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5445 * process will be killed too). */
5446 if (subcgroup_path)
5447 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
5448
5449 exec_status_start(&command->exec_status, pid);
5450
5451 *ret = pid;
5452 return 0;
5453 }
5454
5455 void exec_context_init(ExecContext *c) {
5456 assert(c);
5457
5458 c->umask = 0022;
5459 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
5460 c->cpu_sched_policy = SCHED_OTHER;
5461 c->syslog_priority = LOG_DAEMON|LOG_INFO;
5462 c->syslog_level_prefix = true;
5463 c->ignore_sigpipe = true;
5464 c->timer_slack_nsec = NSEC_INFINITY;
5465 c->personality = PERSONALITY_INVALID;
5466 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5467 c->directories[t].mode = 0755;
5468 c->timeout_clean_usec = USEC_INFINITY;
5469 c->capability_bounding_set = CAP_ALL;
5470 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5471 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
5472 c->log_level_max = -1;
5473 #if HAVE_SECCOMP
5474 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5475 #endif
5476 c->tty_rows = UINT_MAX;
5477 c->tty_cols = UINT_MAX;
5478 numa_policy_reset(&c->numa_policy);
5479 }
5480
5481 void exec_context_done(ExecContext *c) {
5482 assert(c);
5483
5484 c->environment = strv_free(c->environment);
5485 c->environment_files = strv_free(c->environment_files);
5486 c->pass_environment = strv_free(c->pass_environment);
5487 c->unset_environment = strv_free(c->unset_environment);
5488
5489 rlimit_free_all(c->rlimit);
5490
5491 for (size_t l = 0; l < 3; l++) {
5492 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
5493 c->stdio_file[l] = mfree(c->stdio_file[l]);
5494 }
5495
5496 c->working_directory = mfree(c->working_directory);
5497 c->root_directory = mfree(c->root_directory);
5498 c->root_image = mfree(c->root_image);
5499 c->root_image_options = mount_options_free_all(c->root_image_options);
5500 c->root_hash = mfree(c->root_hash);
5501 c->root_hash_size = 0;
5502 c->root_hash_path = mfree(c->root_hash_path);
5503 c->root_hash_sig = mfree(c->root_hash_sig);
5504 c->root_hash_sig_size = 0;
5505 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
5506 c->root_verity = mfree(c->root_verity);
5507 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
5508 c->extension_directories = strv_free(c->extension_directories);
5509 c->tty_path = mfree(c->tty_path);
5510 c->syslog_identifier = mfree(c->syslog_identifier);
5511 c->user = mfree(c->user);
5512 c->group = mfree(c->group);
5513
5514 c->supplementary_groups = strv_free(c->supplementary_groups);
5515
5516 c->pam_name = mfree(c->pam_name);
5517
5518 c->read_only_paths = strv_free(c->read_only_paths);
5519 c->read_write_paths = strv_free(c->read_write_paths);
5520 c->inaccessible_paths = strv_free(c->inaccessible_paths);
5521 c->exec_paths = strv_free(c->exec_paths);
5522 c->no_exec_paths = strv_free(c->no_exec_paths);
5523 c->exec_search_path = strv_free(c->exec_search_path);
5524
5525 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
5526 c->bind_mounts = NULL;
5527 c->n_bind_mounts = 0;
5528 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5529 c->temporary_filesystems = NULL;
5530 c->n_temporary_filesystems = 0;
5531 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
5532
5533 cpu_set_reset(&c->cpu_set);
5534 numa_policy_reset(&c->numa_policy);
5535
5536 c->utmp_id = mfree(c->utmp_id);
5537 c->selinux_context = mfree(c->selinux_context);
5538 c->apparmor_profile = mfree(c->apparmor_profile);
5539 c->smack_process_label = mfree(c->smack_process_label);
5540
5541 c->restrict_filesystems = set_free(c->restrict_filesystems);
5542
5543 c->syscall_filter = hashmap_free(c->syscall_filter);
5544 c->syscall_archs = set_free(c->syscall_archs);
5545 c->address_families = set_free(c->address_families);
5546
5547 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5548 exec_directory_done(&c->directories[t]);
5549
5550 c->log_level_max = -1;
5551
5552 exec_context_free_log_extra_fields(c);
5553 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5554 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
5555
5556 c->log_ratelimit_interval_usec = 0;
5557 c->log_ratelimit_burst = 0;
5558
5559 c->stdin_data = mfree(c->stdin_data);
5560 c->stdin_data_size = 0;
5561
5562 c->network_namespace_path = mfree(c->network_namespace_path);
5563 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
5564
5565 c->log_namespace = mfree(c->log_namespace);
5566
5567 c->load_credentials = hashmap_free(c->load_credentials);
5568 c->set_credentials = hashmap_free(c->set_credentials);
5569 }
5570
5571 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
5572 assert(c);
5573
5574 if (!runtime_prefix)
5575 return 0;
5576
5577 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
5578 _cleanup_free_ char *p = NULL;
5579
5580 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5581 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5582 else
5583 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
5584 if (!p)
5585 return -ENOMEM;
5586
5587 /* We execute this synchronously, since we need to be sure this is gone when we start the
5588 * service next. */
5589 (void) rm_rf(p, REMOVE_ROOT);
5590
5591 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5592 _cleanup_free_ char *symlink_abs = NULL;
5593
5594 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5595 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5596 else
5597 symlink_abs = path_join(runtime_prefix, *symlink);
5598 if (!symlink_abs)
5599 return -ENOMEM;
5600
5601 (void) unlink(symlink_abs);
5602 }
5603
5604 }
5605
5606 return 0;
5607 }
5608
5609 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5610 _cleanup_free_ char *p = NULL;
5611
5612 assert(c);
5613
5614 if (!runtime_prefix || !unit)
5615 return 0;
5616
5617 p = path_join(runtime_prefix, "credentials", unit);
5618 if (!p)
5619 return -ENOMEM;
5620
5621 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5622 * unmount it, and afterwards remove the mount point */
5623 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5624 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5625
5626 return 0;
5627 }
5628
5629 static void exec_command_done(ExecCommand *c) {
5630 assert(c);
5631
5632 c->path = mfree(c->path);
5633 c->argv = strv_free(c->argv);
5634 }
5635
5636 void exec_command_done_array(ExecCommand *c, size_t n) {
5637 for (size_t i = 0; i < n; i++)
5638 exec_command_done(c+i);
5639 }
5640
5641 ExecCommand* exec_command_free_list(ExecCommand *c) {
5642 ExecCommand *i;
5643
5644 while ((i = c)) {
5645 LIST_REMOVE(command, c, i);
5646 exec_command_done(i);
5647 free(i);
5648 }
5649
5650 return NULL;
5651 }
5652
5653 void exec_command_free_array(ExecCommand **c, size_t n) {
5654 for (size_t i = 0; i < n; i++)
5655 c[i] = exec_command_free_list(c[i]);
5656 }
5657
5658 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5659 for (size_t i = 0; i < n; i++)
5660 exec_status_reset(&c[i].exec_status);
5661 }
5662
5663 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5664 for (size_t i = 0; i < n; i++)
5665 LIST_FOREACH(command, z, c[i])
5666 exec_status_reset(&z->exec_status);
5667 }
5668
5669 typedef struct InvalidEnvInfo {
5670 const Unit *unit;
5671 const char *path;
5672 } InvalidEnvInfo;
5673
5674 static void invalid_env(const char *p, void *userdata) {
5675 InvalidEnvInfo *info = userdata;
5676
5677 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
5678 }
5679
5680 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5681 assert(c);
5682
5683 switch (fd_index) {
5684
5685 case STDIN_FILENO:
5686 if (c->std_input != EXEC_INPUT_NAMED_FD)
5687 return NULL;
5688
5689 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5690
5691 case STDOUT_FILENO:
5692 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5693 return NULL;
5694
5695 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5696
5697 case STDERR_FILENO:
5698 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5699 return NULL;
5700
5701 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5702
5703 default:
5704 return NULL;
5705 }
5706 }
5707
5708 static int exec_context_named_iofds(
5709 const ExecContext *c,
5710 const ExecParameters *p,
5711 int named_iofds[static 3]) {
5712
5713 size_t targets;
5714 const char* stdio_fdname[3];
5715 size_t n_fds;
5716
5717 assert(c);
5718 assert(p);
5719 assert(named_iofds);
5720
5721 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5722 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5723 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5724
5725 for (size_t i = 0; i < 3; i++)
5726 stdio_fdname[i] = exec_context_fdname(c, i);
5727
5728 n_fds = p->n_storage_fds + p->n_socket_fds;
5729
5730 for (size_t i = 0; i < n_fds && targets > 0; i++)
5731 if (named_iofds[STDIN_FILENO] < 0 &&
5732 c->std_input == EXEC_INPUT_NAMED_FD &&
5733 stdio_fdname[STDIN_FILENO] &&
5734 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5735
5736 named_iofds[STDIN_FILENO] = p->fds[i];
5737 targets--;
5738
5739 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5740 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5741 stdio_fdname[STDOUT_FILENO] &&
5742 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5743
5744 named_iofds[STDOUT_FILENO] = p->fds[i];
5745 targets--;
5746
5747 } else if (named_iofds[STDERR_FILENO] < 0 &&
5748 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5749 stdio_fdname[STDERR_FILENO] &&
5750 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5751
5752 named_iofds[STDERR_FILENO] = p->fds[i];
5753 targets--;
5754 }
5755
5756 return targets == 0 ? 0 : -ENOENT;
5757 }
5758
5759 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5760 _cleanup_strv_free_ char **v = NULL;
5761 int r;
5762
5763 assert(c);
5764 assert(ret);
5765
5766 STRV_FOREACH(i, c->environment_files) {
5767 _cleanup_globfree_ glob_t pglob = {};
5768 bool ignore = false;
5769 char *fn = *i;
5770
5771 if (fn[0] == '-') {
5772 ignore = true;
5773 fn++;
5774 }
5775
5776 if (!path_is_absolute(fn)) {
5777 if (ignore)
5778 continue;
5779 return -EINVAL;
5780 }
5781
5782 /* Filename supports globbing, take all matching files */
5783 r = safe_glob(fn, 0, &pglob);
5784 if (r < 0) {
5785 if (ignore)
5786 continue;
5787 return r;
5788 }
5789
5790 /* When we don't match anything, -ENOENT should be returned */
5791 assert(pglob.gl_pathc > 0);
5792
5793 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
5794 _cleanup_strv_free_ char **p = NULL;
5795
5796 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5797 if (r < 0) {
5798 if (ignore)
5799 continue;
5800 return r;
5801 }
5802
5803 /* Log invalid environment variables with filename */
5804 if (p) {
5805 InvalidEnvInfo info = {
5806 .unit = unit,
5807 .path = pglob.gl_pathv[n]
5808 };
5809
5810 p = strv_env_clean_with_callback(p, invalid_env, &info);
5811 }
5812
5813 if (!v)
5814 v = TAKE_PTR(p);
5815 else {
5816 char **m = strv_env_merge(v, p);
5817 if (!m)
5818 return -ENOMEM;
5819
5820 strv_free_and_replace(v, m);
5821 }
5822 }
5823 }
5824
5825 *ret = TAKE_PTR(v);
5826
5827 return 0;
5828 }
5829
5830 static bool tty_may_match_dev_console(const char *tty) {
5831 _cleanup_free_ char *resolved = NULL;
5832
5833 if (!tty)
5834 return true;
5835
5836 tty = skip_dev_prefix(tty);
5837
5838 /* trivial identity? */
5839 if (streq(tty, "console"))
5840 return true;
5841
5842 if (resolve_dev_console(&resolved) < 0)
5843 return true; /* if we could not resolve, assume it may */
5844
5845 /* "tty0" means the active VC, so it may be the same sometimes */
5846 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
5847 }
5848
5849 static bool exec_context_may_touch_tty(const ExecContext *ec) {
5850 assert(ec);
5851
5852 return ec->tty_reset ||
5853 ec->tty_vhangup ||
5854 ec->tty_vt_disallocate ||
5855 is_terminal_input(ec->std_input) ||
5856 is_terminal_output(ec->std_output) ||
5857 is_terminal_output(ec->std_error);
5858 }
5859
5860 bool exec_context_may_touch_console(const ExecContext *ec) {
5861
5862 return exec_context_may_touch_tty(ec) &&
5863 tty_may_match_dev_console(exec_context_tty_path(ec));
5864 }
5865
5866 static void strv_fprintf(FILE *f, char **l) {
5867 assert(f);
5868
5869 STRV_FOREACH(g, l)
5870 fprintf(f, " %s", *g);
5871 }
5872
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing is written if the string
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5884
5885 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5886 int r;
5887
5888 assert(c);
5889 assert(f);
5890
5891 prefix = strempty(prefix);
5892
5893 fprintf(f,
5894 "%sUMask: %04o\n"
5895 "%sWorkingDirectory: %s\n"
5896 "%sRootDirectory: %s\n"
5897 "%sNonBlocking: %s\n"
5898 "%sPrivateTmp: %s\n"
5899 "%sPrivateDevices: %s\n"
5900 "%sProtectKernelTunables: %s\n"
5901 "%sProtectKernelModules: %s\n"
5902 "%sProtectKernelLogs: %s\n"
5903 "%sProtectClock: %s\n"
5904 "%sProtectControlGroups: %s\n"
5905 "%sPrivateNetwork: %s\n"
5906 "%sPrivateUsers: %s\n"
5907 "%sProtectHome: %s\n"
5908 "%sProtectSystem: %s\n"
5909 "%sMountAPIVFS: %s\n"
5910 "%sIgnoreSIGPIPE: %s\n"
5911 "%sMemoryDenyWriteExecute: %s\n"
5912 "%sRestrictRealtime: %s\n"
5913 "%sRestrictSUIDSGID: %s\n"
5914 "%sKeyringMode: %s\n"
5915 "%sProtectHostname: %s\n"
5916 "%sProtectProc: %s\n"
5917 "%sProcSubset: %s\n",
5918 prefix, c->umask,
5919 prefix, empty_to_root(c->working_directory),
5920 prefix, empty_to_root(c->root_directory),
5921 prefix, yes_no(c->non_blocking),
5922 prefix, yes_no(c->private_tmp),
5923 prefix, yes_no(c->private_devices),
5924 prefix, yes_no(c->protect_kernel_tunables),
5925 prefix, yes_no(c->protect_kernel_modules),
5926 prefix, yes_no(c->protect_kernel_logs),
5927 prefix, yes_no(c->protect_clock),
5928 prefix, yes_no(c->protect_control_groups),
5929 prefix, yes_no(c->private_network),
5930 prefix, yes_no(c->private_users),
5931 prefix, protect_home_to_string(c->protect_home),
5932 prefix, protect_system_to_string(c->protect_system),
5933 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
5934 prefix, yes_no(c->ignore_sigpipe),
5935 prefix, yes_no(c->memory_deny_write_execute),
5936 prefix, yes_no(c->restrict_realtime),
5937 prefix, yes_no(c->restrict_suid_sgid),
5938 prefix, exec_keyring_mode_to_string(c->keyring_mode),
5939 prefix, yes_no(c->protect_hostname),
5940 prefix, protect_proc_to_string(c->protect_proc),
5941 prefix, proc_subset_to_string(c->proc_subset));
5942
5943 if (c->root_image)
5944 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5945
5946 if (c->root_image_options) {
5947 fprintf(f, "%sRootImageOptions:", prefix);
5948 LIST_FOREACH(mount_options, o, c->root_image_options)
5949 if (!isempty(o->options))
5950 fprintf(f, " %s:%s",
5951 partition_designator_to_string(o->partition_designator),
5952 o->options);
5953 fprintf(f, "\n");
5954 }
5955
5956 if (c->root_hash) {
5957 _cleanup_free_ char *encoded = NULL;
5958 encoded = hexmem(c->root_hash, c->root_hash_size);
5959 if (encoded)
5960 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5961 }
5962
5963 if (c->root_hash_path)
5964 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5965
5966 if (c->root_hash_sig) {
5967 _cleanup_free_ char *encoded = NULL;
5968 ssize_t len;
5969 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5970 if (len)
5971 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5972 }
5973
5974 if (c->root_hash_sig_path)
5975 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5976
5977 if (c->root_verity)
5978 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5979
5980 STRV_FOREACH(e, c->environment)
5981 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5982
5983 STRV_FOREACH(e, c->environment_files)
5984 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
5985
5986 STRV_FOREACH(e, c->pass_environment)
5987 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5988
5989 STRV_FOREACH(e, c->unset_environment)
5990 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5991
5992 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5993
5994 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5995 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5996
5997 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5998 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5999
6000 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6001 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6002 }
6003 }
6004
6005 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6006
6007 if (c->nice_set)
6008 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6009
6010 if (c->oom_score_adjust_set)
6011 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6012
6013 if (c->coredump_filter_set)
6014 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6015
6016 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6017 if (c->rlimit[i]) {
6018 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6019 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6020 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6021 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6022 }
6023
6024 if (c->ioprio_set) {
6025 _cleanup_free_ char *class_str = NULL;
6026
6027 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6028 if (r >= 0)
6029 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6030
6031 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6032 }
6033
6034 if (c->cpu_sched_set) {
6035 _cleanup_free_ char *policy_str = NULL;
6036
6037 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6038 if (r >= 0)
6039 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6040
6041 fprintf(f,
6042 "%sCPUSchedulingPriority: %i\n"
6043 "%sCPUSchedulingResetOnFork: %s\n",
6044 prefix, c->cpu_sched_priority,
6045 prefix, yes_no(c->cpu_sched_reset_on_fork));
6046 }
6047
6048 if (c->cpu_set.set) {
6049 _cleanup_free_ char *affinity = NULL;
6050
6051 affinity = cpu_set_to_range_string(&c->cpu_set);
6052 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6053 }
6054
6055 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6056 _cleanup_free_ char *nodes = NULL;
6057
6058 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6059 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6060 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6061 }
6062
6063 if (c->timer_slack_nsec != NSEC_INFINITY)
6064 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6065
6066 fprintf(f,
6067 "%sStandardInput: %s\n"
6068 "%sStandardOutput: %s\n"
6069 "%sStandardError: %s\n",
6070 prefix, exec_input_to_string(c->std_input),
6071 prefix, exec_output_to_string(c->std_output),
6072 prefix, exec_output_to_string(c->std_error));
6073
6074 if (c->std_input == EXEC_INPUT_NAMED_FD)
6075 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6076 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6077 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6078 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6079 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6080
6081 if (c->std_input == EXEC_INPUT_FILE)
6082 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6083 if (c->std_output == EXEC_OUTPUT_FILE)
6084 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6085 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6086 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6087 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6088 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6089 if (c->std_error == EXEC_OUTPUT_FILE)
6090 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6091 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6092 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6093 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6094 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6095
6096 if (c->tty_path)
6097 fprintf(f,
6098 "%sTTYPath: %s\n"
6099 "%sTTYReset: %s\n"
6100 "%sTTYVHangup: %s\n"
6101 "%sTTYVTDisallocate: %s\n"
6102 "%sTTYRows: %u\n"
6103 "%sTTYColumns: %u\n",
6104 prefix, c->tty_path,
6105 prefix, yes_no(c->tty_reset),
6106 prefix, yes_no(c->tty_vhangup),
6107 prefix, yes_no(c->tty_vt_disallocate),
6108 prefix, c->tty_rows,
6109 prefix, c->tty_cols);
6110
6111 if (IN_SET(c->std_output,
6112 EXEC_OUTPUT_KMSG,
6113 EXEC_OUTPUT_JOURNAL,
6114 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6115 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6116 IN_SET(c->std_error,
6117 EXEC_OUTPUT_KMSG,
6118 EXEC_OUTPUT_JOURNAL,
6119 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6120 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6121
6122 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6123
6124 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6125 if (r >= 0)
6126 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6127
6128 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6129 if (r >= 0)
6130 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6131 }
6132
6133 if (c->log_level_max >= 0) {
6134 _cleanup_free_ char *t = NULL;
6135
6136 (void) log_level_to_string_alloc(c->log_level_max, &t);
6137
6138 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6139 }
6140
6141 if (c->log_ratelimit_interval_usec > 0)
6142 fprintf(f,
6143 "%sLogRateLimitIntervalSec: %s\n",
6144 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6145
6146 if (c->log_ratelimit_burst > 0)
6147 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6148
6149 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6150 fprintf(f, "%sLogFilterPatterns:", prefix);
6151
6152 char *pattern;
6153 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6154 fprintf(f, " %s", pattern);
6155 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6156 fprintf(f, " ~%s", pattern);
6157 fputc('\n', f);
6158 }
6159
6160 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6161 fprintf(f, "%sLogExtraFields: ", prefix);
6162 fwrite(c->log_extra_fields[j].iov_base,
6163 1, c->log_extra_fields[j].iov_len,
6164 f);
6165 fputc('\n', f);
6166 }
6167
6168 if (c->log_namespace)
6169 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6170
6171 if (c->secure_bits) {
6172 _cleanup_free_ char *str = NULL;
6173
6174 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6175 if (r >= 0)
6176 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6177 }
6178
6179 if (c->capability_bounding_set != CAP_ALL) {
6180 _cleanup_free_ char *str = NULL;
6181
6182 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6183 if (r >= 0)
6184 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6185 }
6186
6187 if (c->capability_ambient_set != 0) {
6188 _cleanup_free_ char *str = NULL;
6189
6190 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6191 if (r >= 0)
6192 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6193 }
6194
6195 if (c->user)
6196 fprintf(f, "%sUser: %s\n", prefix, c->user);
6197 if (c->group)
6198 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6199
6200 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6201
6202 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6203
6204 if (c->pam_name)
6205 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6206
6207 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6208 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6209 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6210 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6211 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6212 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6213
6214 for (size_t i = 0; i < c->n_bind_mounts; i++)
6215 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6216 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6217 c->bind_mounts[i].ignore_enoent ? "-": "",
6218 c->bind_mounts[i].source,
6219 c->bind_mounts[i].destination,
6220 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6221
6222 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6223 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6224
6225 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6226 t->path,
6227 isempty(t->options) ? "" : ":",
6228 strempty(t->options));
6229 }
6230
6231 if (c->utmp_id)
6232 fprintf(f,
6233 "%sUtmpIdentifier: %s\n",
6234 prefix, c->utmp_id);
6235
6236 if (c->selinux_context)
6237 fprintf(f,
6238 "%sSELinuxContext: %s%s\n",
6239 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6240
6241 if (c->apparmor_profile)
6242 fprintf(f,
6243 "%sAppArmorProfile: %s%s\n",
6244 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6245
6246 if (c->smack_process_label)
6247 fprintf(f,
6248 "%sSmackProcessLabel: %s%s\n",
6249 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6250
6251 if (c->personality != PERSONALITY_INVALID)
6252 fprintf(f,
6253 "%sPersonality: %s\n",
6254 prefix, strna(personality_to_string(c->personality)));
6255
6256 fprintf(f,
6257 "%sLockPersonality: %s\n",
6258 prefix, yes_no(c->lock_personality));
6259
6260 if (c->syscall_filter) {
6261 fprintf(f,
6262 "%sSystemCallFilter: ",
6263 prefix);
6264
6265 if (!c->syscall_allow_list)
6266 fputc('~', f);
6267
6268 #if HAVE_SECCOMP
6269 void *id, *val;
6270 bool first = true;
6271 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6272 _cleanup_free_ char *name = NULL;
6273 const char *errno_name = NULL;
6274 int num = PTR_TO_INT(val);
6275
6276 if (first)
6277 first = false;
6278 else
6279 fputc(' ', f);
6280
6281 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6282 fputs(strna(name), f);
6283
6284 if (num >= 0) {
6285 errno_name = seccomp_errno_or_action_to_string(num);
6286 if (errno_name)
6287 fprintf(f, ":%s", errno_name);
6288 else
6289 fprintf(f, ":%d", num);
6290 }
6291 }
6292 #endif
6293
6294 fputc('\n', f);
6295 }
6296
6297 if (c->syscall_archs) {
6298 fprintf(f,
6299 "%sSystemCallArchitectures:",
6300 prefix);
6301
6302 #if HAVE_SECCOMP
6303 void *id;
6304 SET_FOREACH(id, c->syscall_archs)
6305 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6306 #endif
6307 fputc('\n', f);
6308 }
6309
6310 if (exec_context_restrict_namespaces_set(c)) {
6311 _cleanup_free_ char *s = NULL;
6312
6313 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6314 if (r >= 0)
6315 fprintf(f, "%sRestrictNamespaces: %s\n",
6316 prefix, strna(s));
6317 }
6318
6319 #if HAVE_LIBBPF
6320 if (exec_context_restrict_filesystems_set(c)) {
6321 char *fs;
6322 SET_FOREACH(fs, c->restrict_filesystems)
6323 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6324 }
6325 #endif
6326
6327 if (c->network_namespace_path)
6328 fprintf(f,
6329 "%sNetworkNamespacePath: %s\n",
6330 prefix, c->network_namespace_path);
6331
6332 if (c->syscall_errno > 0) {
6333 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6334
6335 #if HAVE_SECCOMP
6336 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6337 if (errno_name)
6338 fputs(errno_name, f);
6339 else
6340 fprintf(f, "%d", c->syscall_errno);
6341 #endif
6342 fputc('\n', f);
6343 }
6344
6345 for (size_t i = 0; i < c->n_mount_images; i++) {
6346 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6347 c->mount_images[i].ignore_enoent ? "-": "",
6348 c->mount_images[i].source,
6349 c->mount_images[i].destination);
6350 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
6351 fprintf(f, ":%s:%s",
6352 partition_designator_to_string(o->partition_designator),
6353 strempty(o->options));
6354 fprintf(f, "\n");
6355 }
6356
6357 for (size_t i = 0; i < c->n_extension_images; i++) {
6358 fprintf(f, "%sExtensionImages: %s%s", prefix,
6359 c->extension_images[i].ignore_enoent ? "-": "",
6360 c->extension_images[i].source);
6361 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6362 fprintf(f, ":%s:%s",
6363 partition_designator_to_string(o->partition_designator),
6364 strempty(o->options));
6365 fprintf(f, "\n");
6366 }
6367
6368 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
6369 }
6370
6371 bool exec_context_maintains_privileges(const ExecContext *c) {
6372 assert(c);
6373
6374 /* Returns true if the process forked off would run under
6375 * an unchanged UID or as root. */
6376
6377 if (!c->user)
6378 return true;
6379
6380 if (streq(c->user, "root") || streq(c->user, "0"))
6381 return true;
6382
6383 return false;
6384 }
6385
6386 int exec_context_get_effective_ioprio(const ExecContext *c) {
6387 int p;
6388
6389 assert(c);
6390
6391 if (c->ioprio_set)
6392 return c->ioprio;
6393
6394 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6395 if (p < 0)
6396 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
6397
6398 return ioprio_normalize(p);
6399 }
6400
6401 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6402 assert(c);
6403
6404 /* Explicit setting wins */
6405 if (c->mount_apivfs_set)
6406 return c->mount_apivfs;
6407
6408 /* Default to "yes" if root directory or image are specified */
6409 if (exec_context_with_rootfs(c))
6410 return true;
6411
6412 return false;
6413 }
6414
6415 void exec_context_free_log_extra_fields(ExecContext *c) {
6416 assert(c);
6417
6418 for (size_t l = 0; l < c->n_log_extra_fields; l++)
6419 free(c->log_extra_fields[l].iov_base);
6420 c->log_extra_fields = mfree(c->log_extra_fields);
6421 c->n_log_extra_fields = 0;
6422 }
6423
6424 void exec_context_revert_tty(ExecContext *c) {
6425 _cleanup_close_ int fd = -EBADF;
6426 const char *path;
6427 struct stat st;
6428 int r;
6429
6430 assert(c);
6431
6432 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6433 exec_context_tty_reset(c, NULL);
6434
6435 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6436 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6437 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
6438 if (!exec_context_may_touch_tty(c))
6439 return;
6440
6441 path = exec_context_tty_path(c);
6442 if (!path)
6443 return;
6444
6445 fd = open(path, O_PATH|O_CLOEXEC);
6446 if (fd < 0)
6447 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6448 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6449 path);
6450
6451 if (fstat(fd, &st) < 0)
6452 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6453
6454 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6455 * if things are a character device, since a proper check either means we'd have to open the TTY and
6456 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6457 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6458 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6459 if (!S_ISCHR(st.st_mode))
6460 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6461
6462 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6463 if (r < 0)
6464 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6465 }
6466
6467 int exec_context_get_clean_directories(
6468 ExecContext *c,
6469 char **prefix,
6470 ExecCleanMask mask,
6471 char ***ret) {
6472
6473 _cleanup_strv_free_ char **l = NULL;
6474 int r;
6475
6476 assert(c);
6477 assert(prefix);
6478 assert(ret);
6479
6480 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6481 if (!FLAGS_SET(mask, 1U << t))
6482 continue;
6483
6484 if (!prefix[t])
6485 continue;
6486
6487 for (size_t i = 0; i < c->directories[t].n_items; i++) {
6488 char *j;
6489
6490 j = path_join(prefix[t], c->directories[t].items[i].path);
6491 if (!j)
6492 return -ENOMEM;
6493
6494 r = strv_consume(&l, j);
6495 if (r < 0)
6496 return r;
6497
6498 /* Also remove private directories unconditionally. */
6499 if (t != EXEC_DIRECTORY_CONFIGURATION) {
6500 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6501 if (!j)
6502 return -ENOMEM;
6503
6504 r = strv_consume(&l, j);
6505 if (r < 0)
6506 return r;
6507 }
6508
6509 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6510 j = path_join(prefix[t], *symlink);
6511 if (!j)
6512 return -ENOMEM;
6513
6514 r = strv_consume(&l, j);
6515 if (r < 0)
6516 return r;
6517 }
6518 }
6519 }
6520
6521 *ret = TAKE_PTR(l);
6522 return 0;
6523 }
6524
6525 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6526 ExecCleanMask mask = 0;
6527
6528 assert(c);
6529 assert(ret);
6530
6531 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6532 if (c->directories[t].n_items > 0)
6533 mask |= 1U << t;
6534
6535 *ret = mask;
6536 return 0;
6537 }
6538
6539 void exec_status_start(ExecStatus *s, pid_t pid) {
6540 assert(s);
6541
6542 *s = (ExecStatus) {
6543 .pid = pid,
6544 };
6545
6546 dual_timestamp_get(&s->start_timestamp);
6547 }
6548
6549 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
6550 assert(s);
6551
6552 if (s->pid != pid)
6553 *s = (ExecStatus) {
6554 .pid = pid,
6555 };
6556
6557 dual_timestamp_get(&s->exit_timestamp);
6558
6559 s->code = code;
6560 s->status = status;
6561
6562 if (context && context->utmp_id)
6563 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
6564 }
6565
6566 void exec_status_reset(ExecStatus *s) {
6567 assert(s);
6568
6569 *s = (ExecStatus) {};
6570 }
6571
6572 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
6573 assert(s);
6574 assert(f);
6575
6576 if (s->pid <= 0)
6577 return;
6578
6579 prefix = strempty(prefix);
6580
6581 fprintf(f,
6582 "%sPID: "PID_FMT"\n",
6583 prefix, s->pid);
6584
6585 if (dual_timestamp_is_set(&s->start_timestamp))
6586 fprintf(f,
6587 "%sStart Timestamp: %s\n",
6588 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
6589
6590 if (dual_timestamp_is_set(&s->exit_timestamp))
6591 fprintf(f,
6592 "%sExit Timestamp: %s\n"
6593 "%sExit Code: %s\n"
6594 "%sExit Status: %i\n",
6595 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
6596 prefix, sigchld_code_to_string(s->code),
6597 prefix, s->status);
6598 }
6599
6600 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
6601 _cleanup_free_ char *cmd = NULL;
6602 const char *prefix2;
6603
6604 assert(c);
6605 assert(f);
6606
6607 prefix = strempty(prefix);
6608 prefix2 = strjoina(prefix, "\t");
6609
6610 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
6611
6612 fprintf(f,
6613 "%sCommand Line: %s\n",
6614 prefix, strnull(cmd));
6615
6616 exec_status_dump(&c->exec_status, f, prefix2);
6617 }
6618
6619 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6620 assert(f);
6621
6622 prefix = strempty(prefix);
6623
6624 LIST_FOREACH(command, i, c)
6625 exec_command_dump(i, f, prefix);
6626 }
6627
6628 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6629 ExecCommand *end;
6630
6631 assert(l);
6632 assert(e);
6633
6634 if (*l) {
6635 /* It's kind of important, that we keep the order here */
6636 LIST_FIND_TAIL(command, *l, end);
6637 LIST_INSERT_AFTER(command, *l, end, e);
6638 } else
6639 *l = e;
6640 }
6641
6642 int exec_command_set(ExecCommand *c, const char *path, ...) {
6643 va_list ap;
6644 char **l, *p;
6645
6646 assert(c);
6647 assert(path);
6648
6649 va_start(ap, path);
6650 l = strv_new_ap(path, ap);
6651 va_end(ap);
6652
6653 if (!l)
6654 return -ENOMEM;
6655
6656 p = strdup(path);
6657 if (!p) {
6658 strv_free(l);
6659 return -ENOMEM;
6660 }
6661
6662 free_and_replace(c->path, p);
6663
6664 return strv_free_and_replace(c->argv, l);
6665 }
6666
6667 int exec_command_append(ExecCommand *c, const char *path, ...) {
6668 _cleanup_strv_free_ char **l = NULL;
6669 va_list ap;
6670 int r;
6671
6672 assert(c);
6673 assert(path);
6674
6675 va_start(ap, path);
6676 l = strv_new_ap(path, ap);
6677 va_end(ap);
6678
6679 if (!l)
6680 return -ENOMEM;
6681
6682 r = strv_extend_strv(&c->argv, l, false);
6683 if (r < 0)
6684 return r;
6685
6686 return 0;
6687 }
6688
6689 static void *remove_tmpdir_thread(void *p) {
6690 _cleanup_free_ char *path = p;
6691
6692 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6693 return NULL;
6694 }
6695
6696 static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6697 int r;
6698
6699 if (!rt)
6700 return NULL;
6701
6702 if (rt->manager)
6703 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6704
6705 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
6706
6707 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
6708 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6709
6710 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
6711 if (r < 0)
6712 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
6713 else
6714 rt->tmp_dir = NULL;
6715 }
6716
6717 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
6718 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6719
6720 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
6721 if (r < 0)
6722 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
6723 else
6724 rt->var_tmp_dir = NULL;
6725 }
6726
6727 rt->id = mfree(rt->id);
6728 rt->tmp_dir = mfree(rt->tmp_dir);
6729 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6730 safe_close_pair(rt->netns_storage_socket);
6731 safe_close_pair(rt->ipcns_storage_socket);
6732 return mfree(rt);
6733 }
6734
6735 static void exec_runtime_freep(ExecRuntime **rt) {
6736 (void) exec_runtime_free(*rt, false);
6737 }
6738
6739 static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6740 _cleanup_free_ char *id_copy = NULL;
6741 ExecRuntime *n;
6742
6743 assert(ret);
6744
6745 id_copy = strdup(id);
6746 if (!id_copy)
6747 return -ENOMEM;
6748
6749 n = new(ExecRuntime, 1);
6750 if (!n)
6751 return -ENOMEM;
6752
6753 *n = (ExecRuntime) {
6754 .id = TAKE_PTR(id_copy),
6755 .netns_storage_socket = PIPE_EBADF,
6756 .ipcns_storage_socket = PIPE_EBADF,
6757 };
6758
6759 *ret = n;
6760 return 0;
6761 }
6762
6763 static int exec_runtime_add(
6764 Manager *m,
6765 const char *id,
6766 char **tmp_dir,
6767 char **var_tmp_dir,
6768 int netns_storage_socket[2],
6769 int ipcns_storage_socket[2],
6770 ExecRuntime **ret) {
6771
6772 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
6773 int r;
6774
6775 assert(m);
6776 assert(id);
6777
6778 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
6779
6780 r = exec_runtime_allocate(&rt, id);
6781 if (r < 0)
6782 return r;
6783
6784 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
6785 if (r < 0)
6786 return r;
6787
6788 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6789 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6790 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
6791
6792 if (netns_storage_socket) {
6793 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6794 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
6795 }
6796
6797 if (ipcns_storage_socket) {
6798 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6799 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6800 }
6801
6802 rt->manager = m;
6803
6804 if (ret)
6805 *ret = rt;
6806 /* do not remove created ExecRuntime object when the operation succeeds. */
6807 TAKE_PTR(rt);
6808 return 0;
6809 }
6810
6811 static int exec_runtime_make(
6812 Manager *m,
6813 const ExecContext *c,
6814 const char *id,
6815 ExecRuntime **ret) {
6816
6817 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
6818 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
6819 int r;
6820
6821 assert(m);
6822 assert(c);
6823 assert(id);
6824
6825 /* It is not necessary to create ExecRuntime object. */
6826 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
6827 *ret = NULL;
6828 return 0;
6829 }
6830
6831 if (c->private_tmp &&
6832 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6833 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6834 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
6835 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
6836 if (r < 0)
6837 return r;
6838 }
6839
6840 if (c->private_network || c->network_namespace_path) {
6841 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6842 return -errno;
6843 }
6844
6845 if (c->private_ipc || c->ipc_namespace_path) {
6846 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6847 return -errno;
6848 }
6849
6850 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
6851 if (r < 0)
6852 return r;
6853
6854 return 1;
6855 }
6856
6857 int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6858 ExecRuntime *rt;
6859 int r;
6860
6861 assert(m);
6862 assert(id);
6863 assert(ret);
6864
6865 rt = hashmap_get(m->exec_runtime_by_id, id);
6866 if (rt)
6867 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
6868 goto ref;
6869
6870 if (!create) {
6871 *ret = NULL;
6872 return 0;
6873 }
6874
6875 /* If not found, then create a new object. */
6876 r = exec_runtime_make(m, c, id, &rt);
6877 if (r < 0)
6878 return r;
6879 if (r == 0) {
6880 /* When r == 0, it is not necessary to create ExecRuntime object. */
6881 *ret = NULL;
6882 return 0;
6883 }
6884
6885 ref:
6886 /* increment reference counter. */
6887 rt->n_ref++;
6888 *ret = rt;
6889 return 1;
6890 }
6891
6892 ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6893 if (!rt)
6894 return NULL;
6895
6896 assert(rt->n_ref > 0);
6897
6898 rt->n_ref--;
6899 if (rt->n_ref > 0)
6900 return NULL;
6901
6902 return exec_runtime_free(rt, destroy);
6903 }
6904
6905 int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6906 ExecRuntime *rt;
6907
6908 assert(m);
6909 assert(f);
6910 assert(fds);
6911
6912 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
6913 fprintf(f, "exec-runtime=%s", rt->id);
6914
6915 if (rt->tmp_dir)
6916 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
6917
6918 if (rt->var_tmp_dir)
6919 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
6920
6921 if (rt->netns_storage_socket[0] >= 0) {
6922 int copy;
6923
6924 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6925 if (copy < 0)
6926 return copy;
6927
6928 fprintf(f, " netns-socket-0=%i", copy);
6929 }
6930
6931 if (rt->netns_storage_socket[1] >= 0) {
6932 int copy;
6933
6934 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6935 if (copy < 0)
6936 return copy;
6937
6938 fprintf(f, " netns-socket-1=%i", copy);
6939 }
6940
6941 if (rt->ipcns_storage_socket[0] >= 0) {
6942 int copy;
6943
6944 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6945 if (copy < 0)
6946 return copy;
6947
6948 fprintf(f, " ipcns-socket-0=%i", copy);
6949 }
6950
6951 if (rt->ipcns_storage_socket[1] >= 0) {
6952 int copy;
6953
6954 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6955 if (copy < 0)
6956 return copy;
6957
6958 fprintf(f, " ipcns-socket-1=%i", copy);
6959 }
6960
6961 fputc('\n', f);
6962 }
6963
6964 return 0;
6965 }
6966
6967 int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6968 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6969 ExecRuntime *rt;
6970 int r;
6971
6972 /* This is for the migration from old (v237 or earlier) deserialization text.
6973 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6974 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6975 * so or not from the serialized text, then we always creates a new object owned by this. */
6976
6977 assert(u);
6978 assert(key);
6979 assert(value);
6980
6981 /* Manager manages ExecRuntime objects by the unit id.
6982 * So, we omit the serialized text when the unit does not have id (yet?)... */
6983 if (isempty(u->id)) {
6984 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6985 return 0;
6986 }
6987
6988 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6989 return log_oom();
6990
6991 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6992 if (!rt) {
6993 if (exec_runtime_allocate(&rt_create, u->id) < 0)
6994 return log_oom();
6995
6996 rt = rt_create;
6997 }
6998
6999 if (streq(key, "tmp-dir")) {
7000 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7001 return -ENOMEM;
7002
7003 } else if (streq(key, "var-tmp-dir")) {
7004 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7005 return -ENOMEM;
7006
7007 } else if (streq(key, "netns-socket-0")) {
7008 int fd;
7009
7010 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7011 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7012 return 0;
7013 }
7014
7015 safe_close(rt->netns_storage_socket[0]);
7016 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7017
7018 } else if (streq(key, "netns-socket-1")) {
7019 int fd;
7020
7021 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
7022 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7023 return 0;
7024 }
7025
7026 safe_close(rt->netns_storage_socket[1]);
7027 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7028
7029 } else
7030 return 0;
7031
7032 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7033 if (rt_create) {
7034 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
7035 if (r < 0) {
7036 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7037 return 0;
7038 }
7039
7040 rt_create->manager = u->manager;
7041
7042 /* Avoid cleanup */
7043 TAKE_PTR(rt_create);
7044 }
7045
7046 return 1;
7047 }
7048
7049 int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7050 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7051 char *id = NULL;
7052 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7053 const char *p, *v = ASSERT_PTR(value);
7054 size_t n;
7055
7056 assert(m);
7057 assert(fds);
7058
7059 n = strcspn(v, " ");
7060 id = strndupa_safe(v, n);
7061 if (v[n] != ' ')
7062 goto finalize;
7063 p = v + n + 1;
7064
7065 v = startswith(p, "tmp-dir=");
7066 if (v) {
7067 n = strcspn(v, " ");
7068 tmp_dir = strndup(v, n);
7069 if (!tmp_dir)
7070 return log_oom();
7071 if (v[n] != ' ')
7072 goto finalize;
7073 p = v + n + 1;
7074 }
7075
7076 v = startswith(p, "var-tmp-dir=");
7077 if (v) {
7078 n = strcspn(v, " ");
7079 var_tmp_dir = strndup(v, n);
7080 if (!var_tmp_dir)
7081 return log_oom();
7082 if (v[n] != ' ')
7083 goto finalize;
7084 p = v + n + 1;
7085 }
7086
7087 v = startswith(p, "netns-socket-0=");
7088 if (v) {
7089 char *buf;
7090
7091 n = strcspn(v, " ");
7092 buf = strndupa_safe(v, n);
7093
7094 r = safe_atoi(buf, &netns_fdpair[0]);
7095 if (r < 0)
7096 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7097 if (!fdset_contains(fds, netns_fdpair[0]))
7098 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7099 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7100 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7101 if (v[n] != ' ')
7102 goto finalize;
7103 p = v + n + 1;
7104 }
7105
7106 v = startswith(p, "netns-socket-1=");
7107 if (v) {
7108 char *buf;
7109
7110 n = strcspn(v, " ");
7111 buf = strndupa_safe(v, n);
7112
7113 r = safe_atoi(buf, &netns_fdpair[1]);
7114 if (r < 0)
7115 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7116 if (!fdset_contains(fds, netns_fdpair[1]))
7117 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7118 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7119 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7120 if (v[n] != ' ')
7121 goto finalize;
7122 p = v + n + 1;
7123 }
7124
7125 v = startswith(p, "ipcns-socket-0=");
7126 if (v) {
7127 char *buf;
7128
7129 n = strcspn(v, " ");
7130 buf = strndupa_safe(v, n);
7131
7132 r = safe_atoi(buf, &ipcns_fdpair[0]);
7133 if (r < 0)
7134 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7135 if (!fdset_contains(fds, ipcns_fdpair[0]))
7136 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7137 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7138 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7139 if (v[n] != ' ')
7140 goto finalize;
7141 p = v + n + 1;
7142 }
7143
7144 v = startswith(p, "ipcns-socket-1=");
7145 if (v) {
7146 char *buf;
7147
7148 n = strcspn(v, " ");
7149 buf = strndupa_safe(v, n);
7150
7151 r = safe_atoi(buf, &ipcns_fdpair[1]);
7152 if (r < 0)
7153 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7154 if (!fdset_contains(fds, ipcns_fdpair[1]))
7155 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7156 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7157 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7158 }
7159
7160 finalize:
7161 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7162 if (r < 0)
7163 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7164 return 0;
7165 }
7166
7167 void exec_runtime_vacuum(Manager *m) {
7168 ExecRuntime *rt;
7169
7170 assert(m);
7171
7172 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7173
7174 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
7175 if (rt->n_ref > 0)
7176 continue;
7177
7178 (void) exec_runtime_free(rt, false);
7179 }
7180 }
7181
7182 void exec_params_clear(ExecParameters *p) {
7183 if (!p)
7184 return;
7185
7186 p->environment = strv_free(p->environment);
7187 p->fd_names = strv_free(p->fd_names);
7188 p->fds = mfree(p->fds);
7189 p->exec_fd = safe_close(p->exec_fd);
7190 }
7191
7192 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7193 if (!sc)
7194 return NULL;
7195
7196 free(sc->id);
7197 free(sc->data);
7198 return mfree(sc);
7199 }
7200
7201 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7202 if (!lc)
7203 return NULL;
7204
7205 free(lc->id);
7206 free(lc->path);
7207 return mfree(lc);
7208 }
7209
7210 void exec_directory_done(ExecDirectory *d) {
7211 if (!d)
7212 return;
7213
7214 for (size_t i = 0; i < d->n_items; i++) {
7215 free(d->items[i].path);
7216 strv_free(d->items[i].symlinks);
7217 }
7218
7219 d->items = mfree(d->items);
7220 d->n_items = 0;
7221 d->mode = 0755;
7222 }
7223
7224 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7225 assert(d);
7226 assert(path);
7227
7228 for (size_t i = 0; i < d->n_items; i++)
7229 if (path_equal(d->items[i].path, path))
7230 return &d->items[i];
7231
7232 return NULL;
7233 }
7234
7235 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7236 _cleanup_strv_free_ char **s = NULL;
7237 _cleanup_free_ char *p = NULL;
7238 ExecDirectoryItem *existing;
7239 int r;
7240
7241 assert(d);
7242 assert(path);
7243
7244 existing = exec_directory_find(d, path);
7245 if (existing) {
7246 r = strv_extend(&existing->symlinks, symlink);
7247 if (r < 0)
7248 return r;
7249
7250 return 0; /* existing item is updated */
7251 }
7252
7253 p = strdup(path);
7254 if (!p)
7255 return -ENOMEM;
7256
7257 if (symlink) {
7258 s = strv_new(symlink);
7259 if (!s)
7260 return -ENOMEM;
7261 }
7262
7263 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7264 return -ENOMEM;
7265
7266 d->items[d->n_items++] = (ExecDirectoryItem) {
7267 .path = TAKE_PTR(p),
7268 .symlinks = TAKE_PTR(s),
7269 };
7270
7271 return 1; /* new item is added */
7272 }
7273
7274 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7275 assert(a);
7276 assert(b);
7277
7278 return path_compare(a->path, b->path);
7279 }
7280
7281 void exec_directory_sort(ExecDirectory *d) {
7282 assert(d);
7283
7284 /* Sort the exec directories to make always parent directories processed at first in
7285 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7286 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7287 * list. See also comments in setup_exec_directory() and issue #24783. */
7288
7289 if (d->n_items <= 1)
7290 return;
7291
7292 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7293
7294 for (size_t i = 1; i < d->n_items; i++)
7295 for (size_t j = 0; j < i; j++)
7296 if (path_startswith(d->items[i].path, d->items[j].path)) {
7297 d->items[i].only_create = true;
7298 break;
7299 }
7300 }
7301
7302 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
7303 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
7304
7305 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7306 [EXEC_INPUT_NULL] = "null",
7307 [EXEC_INPUT_TTY] = "tty",
7308 [EXEC_INPUT_TTY_FORCE] = "tty-force",
7309 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
7310 [EXEC_INPUT_SOCKET] = "socket",
7311 [EXEC_INPUT_NAMED_FD] = "fd",
7312 [EXEC_INPUT_DATA] = "data",
7313 [EXEC_INPUT_FILE] = "file",
7314 };
7315
7316 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7317
7318 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
7319 [EXEC_OUTPUT_INHERIT] = "inherit",
7320 [EXEC_OUTPUT_NULL] = "null",
7321 [EXEC_OUTPUT_TTY] = "tty",
7322 [EXEC_OUTPUT_KMSG] = "kmsg",
7323 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
7324 [EXEC_OUTPUT_JOURNAL] = "journal",
7325 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
7326 [EXEC_OUTPUT_SOCKET] = "socket",
7327 [EXEC_OUTPUT_NAMED_FD] = "fd",
7328 [EXEC_OUTPUT_FILE] = "file",
7329 [EXEC_OUTPUT_FILE_APPEND] = "append",
7330 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
7331 };
7332
7333 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
7334
7335 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7336 [EXEC_UTMP_INIT] = "init",
7337 [EXEC_UTMP_LOGIN] = "login",
7338 [EXEC_UTMP_USER] = "user",
7339 };
7340
7341 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
7342
7343 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7344 [EXEC_PRESERVE_NO] = "no",
7345 [EXEC_PRESERVE_YES] = "yes",
7346 [EXEC_PRESERVE_RESTART] = "restart",
7347 };
7348
7349 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
7350
7351 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
7352 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7353 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7354 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7355 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7356 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7357 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7358 };
7359
7360 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
7361
7362 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7363 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7364 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7365 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7366 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7367 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7368 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7369 };
7370
7371 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7372
7373 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7374 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7375 * directories, specifically .timer units with their timestamp touch file. */
7376 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7377 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7378 [EXEC_DIRECTORY_STATE] = "state",
7379 [EXEC_DIRECTORY_CACHE] = "cache",
7380 [EXEC_DIRECTORY_LOGS] = "logs",
7381 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7382 };
7383
7384 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7385
7386 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7387 * the service payload in. */
7388 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7389 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7390 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7391 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7392 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7393 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7394 };
7395
7396 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7397
7398 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7399 [EXEC_KEYRING_INHERIT] = "inherit",
7400 [EXEC_KEYRING_PRIVATE] = "private",
7401 [EXEC_KEYRING_SHARED] = "shared",
7402 };
7403
7404 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);