]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/execute.c
core: copy the host's os-release for /run/host/os-release
[thirdparty/systemd.git] / src / core / execute.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <poll.h>
6 #include <sys/eventfd.h>
7 #include <sys/ioctl.h>
8 #include <sys/mman.h>
9 #include <sys/mount.h>
10 #include <sys/personality.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13 #include <sys/types.h>
14 #include <sys/un.h>
15 #include <unistd.h>
16 #include <utmpx.h>
17
18 #include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
20 #if HAVE_PAM
21 #include <security/pam_appl.h>
22 #endif
23
24 #if HAVE_SELINUX
25 #include <selinux/selinux.h>
26 #endif
27
28 #if HAVE_SECCOMP
29 #include <seccomp.h>
30 #endif
31
32 #if HAVE_APPARMOR
33 #include <sys/apparmor.h>
34 #endif
35
36 #include "sd-messages.h"
37
38 #include "acl-util.h"
39 #include "af-list.h"
40 #include "alloc-util.h"
41 #if HAVE_APPARMOR
42 #include "apparmor-util.h"
43 #endif
44 #include "argv-util.h"
45 #include "async.h"
46 #include "barrier.h"
47 #include "bpf-lsm.h"
48 #include "btrfs-util.h"
49 #include "cap-list.h"
50 #include "capability-util.h"
51 #include "chattr-util.h"
52 #include "cgroup-setup.h"
53 #include "chase.h"
54 #include "chown-recursive.h"
55 #include "constants.h"
56 #include "cpu-set-util.h"
57 #include "creds-util.h"
58 #include "data-fd-util.h"
59 #include "env-file.h"
60 #include "env-util.h"
61 #include "errno-list.h"
62 #include "escape.h"
63 #include "execute.h"
64 #include "exit-status.h"
65 #include "fd-util.h"
66 #include "fileio.h"
67 #include "format-util.h"
68 #include "glob-util.h"
69 #include "hexdecoct.h"
70 #include "io-util.h"
71 #include "ioprio-util.h"
72 #include "label-util.h"
73 #include "lock-util.h"
74 #include "log.h"
75 #include "macro.h"
76 #include "manager.h"
77 #include "manager-dump.h"
78 #include "memory-util.h"
79 #include "missing_fs.h"
80 #include "missing_ioprio.h"
81 #include "missing_prctl.h"
82 #include "mkdir-label.h"
83 #include "mount-util.h"
84 #include "mountpoint-util.h"
85 #include "namespace.h"
86 #include "parse-util.h"
87 #include "path-util.h"
88 #include "proc-cmdline.h"
89 #include "process-util.h"
90 #include "psi-util.h"
91 #include "random-util.h"
92 #include "recurse-dir.h"
93 #include "rlimit-util.h"
94 #include "rm-rf.h"
95 #if HAVE_SECCOMP
96 #include "seccomp-util.h"
97 #endif
98 #include "securebits-util.h"
99 #include "selinux-util.h"
100 #include "signal-util.h"
101 #include "smack-util.h"
102 #include "socket-util.h"
103 #include "sort-util.h"
104 #include "special.h"
105 #include "stat-util.h"
106 #include "string-table.h"
107 #include "string-util.h"
108 #include "strv.h"
109 #include "syslog-util.h"
110 #include "terminal-util.h"
111 #include "tmpfile-util.h"
112 #include "umask-util.h"
113 #include "unit-serialize.h"
114 #include "user-util.h"
115 #include "utmp-wtmp.h"
116
/* Two timeouts, expressed in microseconds. The code that consumes them is not part of this excerpt. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
121
/* Moves the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array is laid
 * out contiguously right after stdin/stdout/stderr. The array is modified in place. Returns 0 on success
 * or a negative errno-style error if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Every pass walks the array from 'first' on. If some fd could not be placed at its final number
         * (because that number was still occupied), we remember the earliest such index and do another
         * pass from there. */
        while (first >= 0) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int dup_fd;

                        if (fds[idx] == wanted)
                                continue; /* already where it belongs */

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= wanted, which may be a different one */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        }

        return 0;
}
161
/* Adjusts the fd flags of all passed fds before they are handed to a child: FD_CLOEXEC is dropped from
 * every fd, and O_NONBLOCK is set or cleared (according to 'nonblock') on the first n_socket_fds entries
 * only. Returns 0 on success, or the negative error reported by the failing helper. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < n_fds; k++) {
                const bool is_socket_fd = k < n_socket_fds;
                int rc;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (is_socket_fd) {
                        rc = fd_nonblock(fds[k], nonblock);
                        if (rc < 0)
                                return rc;
                }

                /* These fds are meant to be inherited, hence close-on-exec always goes away */
                rc = fd_cloexec(fds[k], false);
                if (rc < 0)
                        return rc;
        }

        return 0;
}
197
198 static const char *exec_context_tty_path(const ExecContext *context) {
199 assert(context);
200
201 if (context->stdio_as_fds)
202 return NULL;
203
204 if (context->tty_path)
205 return context->tty_path;
206
207 return "/dev/console";
208 }
209
210 static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
211 unsigned rows, cols;
212 const char *tty;
213
214 assert(context);
215 assert(ret_rows);
216 assert(ret_cols);
217
218 rows = context->tty_rows;
219 cols = context->tty_cols;
220
221 tty = exec_context_tty_path(context);
222 if (tty)
223 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
224
225 *ret_rows = rows;
226 *ret_cols = cols;
227
228 return 0;
229 }
230
231 static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
232 _cleanup_close_ int fd = -EBADF;
233 const char *path = exec_context_tty_path(ASSERT_PTR(context));
234
235 /* Take a lock around the device for the duration of the setup that we do here.
236 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
237 * We open a new fd that will be closed automatically, and operate on it for convenience.
238 */
239
240 if (p && p->stdin_fd >= 0) {
241 fd = xopenat_lock(p->stdin_fd, NULL,
242 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
243 if (fd < 0)
244 return;
245 } else if (path) {
246 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
247 if (fd < 0)
248 return;
249
250 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
251 return;
252 } else
253 return; /* nothing to do */
254
255 if (context->tty_vhangup)
256 (void) terminal_vhangup_fd(fd);
257
258 if (context->tty_reset)
259 (void) reset_terminal_fd(fd, true);
260
261 if (p && p->stdin_fd >= 0) {
262 unsigned rows = context->tty_rows, cols = context->tty_cols;
263
264 (void) exec_context_tty_size(context, &rows, &cols);
265 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
266 }
267
268 if (context->tty_vt_disallocate && path)
269 (void) vt_disallocate(path);
270 }
271
272 static bool is_terminal_input(ExecInput i) {
273 return IN_SET(i,
274 EXEC_INPUT_TTY,
275 EXEC_INPUT_TTY_FORCE,
276 EXEC_INPUT_TTY_FAIL);
277 }
278
279 static bool is_terminal_output(ExecOutput o) {
280 return IN_SET(o,
281 EXEC_OUTPUT_TTY,
282 EXEC_OUTPUT_KMSG_AND_CONSOLE,
283 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
284 }
285
286 static bool is_kmsg_output(ExecOutput o) {
287 return IN_SET(o,
288 EXEC_OUTPUT_KMSG,
289 EXEC_OUTPUT_KMSG_AND_CONSOLE);
290 }
291
292 static bool exec_context_needs_term(const ExecContext *c) {
293 assert(c);
294
295 /* Return true if the execution context suggests we should set $TERM to something useful. */
296
297 if (is_terminal_input(c->std_input))
298 return true;
299
300 if (is_terminal_output(c->std_output))
301 return true;
302
303 if (is_terminal_output(c->std_error))
304 return true;
305
306 return !!c->tty_path;
307 }
308
/* Opens /dev/null with the specified access flags (plus O_NOCTTY) and moves the resulting fd to fd
 * number 'nfd'. Returns -errno if the open fails, otherwise the result of move_fd(). */
static int open_null_as(int flags, int nfd) {
        assert(nfd >= 0);

        int null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
320
321 static int connect_journal_socket(
322 int fd,
323 const char *log_namespace,
324 uid_t uid,
325 gid_t gid) {
326
327 uid_t olduid = UID_INVALID;
328 gid_t oldgid = GID_INVALID;
329 const char *j;
330 int r;
331
332 j = log_namespace ?
333 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
334 "/run/systemd/journal/stdout";
335
336 if (gid_is_valid(gid)) {
337 oldgid = getgid();
338
339 if (setegid(gid) < 0)
340 return -errno;
341 }
342
343 if (uid_is_valid(uid)) {
344 olduid = getuid();
345
346 if (seteuid(uid) < 0) {
347 r = -errno;
348 goto restore_gid;
349 }
350 }
351
352 r = connect_unix_path(fd, AT_FDCWD, j);
353
354 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
355 an LSM interferes. */
356
357 if (uid_is_valid(uid))
358 (void) seteuid(olduid);
359
360 restore_gid:
361 if (gid_is_valid(gid))
362 (void) setegid(oldgid);
363
364 return r;
365 }
366
367 static int connect_logger_as(
368 const Unit *unit,
369 const ExecContext *context,
370 const ExecParameters *params,
371 ExecOutput output,
372 const char *ident,
373 int nfd,
374 uid_t uid,
375 gid_t gid) {
376
377 _cleanup_close_ int fd = -EBADF;
378 int r;
379
380 assert(context);
381 assert(params);
382 assert(output < _EXEC_OUTPUT_MAX);
383 assert(ident);
384 assert(nfd >= 0);
385
386 fd = socket(AF_UNIX, SOCK_STREAM, 0);
387 if (fd < 0)
388 return -errno;
389
390 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
391 if (r < 0)
392 return r;
393
394 if (shutdown(fd, SHUT_RD) < 0)
395 return -errno;
396
397 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
398
399 if (dprintf(fd,
400 "%s\n"
401 "%s\n"
402 "%i\n"
403 "%i\n"
404 "%i\n"
405 "%i\n"
406 "%i\n",
407 context->syslog_identifier ?: ident,
408 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
409 context->syslog_priority,
410 !!context->syslog_level_prefix,
411 false,
412 is_kmsg_output(output),
413 is_terminal_output(output)) < 0)
414 return -errno;
415
416 return move_fd(TAKE_FD(fd), nfd, false);
417 }
418
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the fd to fd number
 * 'nfd'. Returns the error of open_terminal() on failure, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        assert(path);
        assert(nfd >= 0);

        int tty_fd = open_terminal(path, flags|O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
431
432 static int acquire_path(const char *path, int flags, mode_t mode) {
433 _cleanup_close_ int fd = -EBADF;
434 int r;
435
436 assert(path);
437
438 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
439 flags |= O_CREAT;
440
441 fd = open(path, flags|O_NOCTTY, mode);
442 if (fd >= 0)
443 return TAKE_FD(fd);
444
445 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
446 return -errno;
447
448 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
449
450 fd = socket(AF_UNIX, SOCK_STREAM, 0);
451 if (fd < 0)
452 return -errno;
453
454 r = connect_unix_path(fd, AT_FDCWD, path);
455 if (IN_SET(r, -ENOTSOCK, -EINVAL))
456 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
457 * wasn't an AF_UNIX socket after all */
458 return -ENXIO;
459 if (r < 0)
460 return r;
461
462 if ((flags & O_ACCMODE) == O_RDONLY)
463 r = shutdown(fd, SHUT_WR);
464 else if ((flags & O_ACCMODE) == O_WRONLY)
465 r = shutdown(fd, SHUT_RD);
466 else
467 r = 0;
468 if (r < 0)
469 return -errno;
470
471 return TAKE_FD(fd);
472 }
473
474 static int fixup_input(
475 const ExecContext *context,
476 int socket_fd,
477 bool apply_tty_stdin) {
478
479 ExecInput std_input;
480
481 assert(context);
482
483 std_input = context->std_input;
484
485 if (is_terminal_input(std_input) && !apply_tty_stdin)
486 return EXEC_INPUT_NULL;
487
488 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
489 return EXEC_INPUT_NULL;
490
491 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
492 return EXEC_INPUT_NULL;
493
494 return std_input;
495 }
496
497 static int fixup_output(ExecOutput output, int socket_fd) {
498
499 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
500 return EXEC_OUTPUT_INHERIT;
501
502 return output;
503 }
504
505 static int setup_input(
506 const ExecContext *context,
507 const ExecParameters *params,
508 int socket_fd,
509 const int named_iofds[static 3]) {
510
511 ExecInput i;
512 int r;
513
514 assert(context);
515 assert(params);
516 assert(named_iofds);
517
518 if (params->stdin_fd >= 0) {
519 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
520 return -errno;
521
522 /* Try to make this the controlling tty, if it is a tty, and reset it */
523 if (isatty(STDIN_FILENO)) {
524 unsigned rows = context->tty_rows, cols = context->tty_cols;
525
526 (void) exec_context_tty_size(context, &rows, &cols);
527 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
528 (void) reset_terminal_fd(STDIN_FILENO, true);
529 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
530 }
531
532 return STDIN_FILENO;
533 }
534
535 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
536
537 switch (i) {
538
539 case EXEC_INPUT_NULL:
540 return open_null_as(O_RDONLY, STDIN_FILENO);
541
542 case EXEC_INPUT_TTY:
543 case EXEC_INPUT_TTY_FORCE:
544 case EXEC_INPUT_TTY_FAIL: {
545 unsigned rows, cols;
546 int fd;
547
548 fd = acquire_terminal(exec_context_tty_path(context),
549 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
550 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
551 ACQUIRE_TERMINAL_WAIT,
552 USEC_INFINITY);
553 if (fd < 0)
554 return fd;
555
556 r = exec_context_tty_size(context, &rows, &cols);
557 if (r < 0)
558 return r;
559
560 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
561 if (r < 0)
562 return r;
563
564 return move_fd(fd, STDIN_FILENO, false);
565 }
566
567 case EXEC_INPUT_SOCKET:
568 assert(socket_fd >= 0);
569
570 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
571
572 case EXEC_INPUT_NAMED_FD:
573 assert(named_iofds[STDIN_FILENO] >= 0);
574
575 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
576 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
577
578 case EXEC_INPUT_DATA: {
579 int fd;
580
581 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
582 if (fd < 0)
583 return fd;
584
585 return move_fd(fd, STDIN_FILENO, false);
586 }
587
588 case EXEC_INPUT_FILE: {
589 bool rw;
590 int fd;
591
592 assert(context->stdio_file[STDIN_FILENO]);
593
594 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
595 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
596
597 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
598 if (fd < 0)
599 return fd;
600
601 return move_fd(fd, STDIN_FILENO, false);
602 }
603
604 default:
605 assert_not_reached();
606 }
607 }
608
609 static bool can_inherit_stderr_from_stdout(
610 const ExecContext *context,
611 ExecOutput o,
612 ExecOutput e) {
613
614 assert(context);
615
616 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
617 * stderr fd */
618
619 if (e == EXEC_OUTPUT_INHERIT)
620 return true;
621 if (e != o)
622 return false;
623
624 if (e == EXEC_OUTPUT_NAMED_FD)
625 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
626
627 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
628 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
629
630 return true;
631 }
632
633 static int setup_output(
634 const Unit *unit,
635 const ExecContext *context,
636 const ExecParameters *params,
637 int fileno,
638 int socket_fd,
639 const int named_iofds[static 3],
640 const char *ident,
641 uid_t uid,
642 gid_t gid,
643 dev_t *journal_stream_dev,
644 ino_t *journal_stream_ino) {
645
646 ExecOutput o;
647 ExecInput i;
648 int r;
649
650 assert(unit);
651 assert(context);
652 assert(params);
653 assert(ident);
654 assert(journal_stream_dev);
655 assert(journal_stream_ino);
656
657 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
658
659 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
660 return -errno;
661
662 return STDOUT_FILENO;
663 }
664
665 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
666 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
667 return -errno;
668
669 return STDERR_FILENO;
670 }
671
672 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
673 o = fixup_output(context->std_output, socket_fd);
674
675 if (fileno == STDERR_FILENO) {
676 ExecOutput e;
677 e = fixup_output(context->std_error, socket_fd);
678
679 /* This expects the input and output are already set up */
680
681 /* Don't change the stderr file descriptor if we inherit all
682 * the way and are not on a tty */
683 if (e == EXEC_OUTPUT_INHERIT &&
684 o == EXEC_OUTPUT_INHERIT &&
685 i == EXEC_INPUT_NULL &&
686 !is_terminal_input(context->std_input) &&
687 getppid() != 1)
688 return fileno;
689
690 /* Duplicate from stdout if possible */
691 if (can_inherit_stderr_from_stdout(context, o, e))
692 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
693
694 o = e;
695
696 } else if (o == EXEC_OUTPUT_INHERIT) {
697 /* If input got downgraded, inherit the original value */
698 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
699 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
700
701 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
702 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
703 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
704
705 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
706 if (getppid() != 1)
707 return fileno;
708
709 /* We need to open /dev/null here anew, to get the right access mode. */
710 return open_null_as(O_WRONLY, fileno);
711 }
712
713 switch (o) {
714
715 case EXEC_OUTPUT_NULL:
716 return open_null_as(O_WRONLY, fileno);
717
718 case EXEC_OUTPUT_TTY:
719 if (is_terminal_input(i))
720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
721
722 /* We don't reset the terminal if this is just about output */
723 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
724
725 case EXEC_OUTPUT_KMSG:
726 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
727 case EXEC_OUTPUT_JOURNAL:
728 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
729 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
730 if (r < 0) {
731 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
732 fileno == STDOUT_FILENO ? "stdout" : "stderr");
733 r = open_null_as(O_WRONLY, fileno);
734 } else {
735 struct stat st;
736
737 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
738 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
739 * services to detect whether they are connected to the journal or not.
740 *
741 * If both stdout and stderr are connected to a stream then let's make sure to store the data
742 * about STDERR as that's usually the best way to do logging. */
743
744 if (fstat(fileno, &st) >= 0 &&
745 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
746 *journal_stream_dev = st.st_dev;
747 *journal_stream_ino = st.st_ino;
748 }
749 }
750 return r;
751
752 case EXEC_OUTPUT_SOCKET:
753 assert(socket_fd >= 0);
754
755 return RET_NERRNO(dup2(socket_fd, fileno));
756
757 case EXEC_OUTPUT_NAMED_FD:
758 assert(named_iofds[fileno] >= 0);
759
760 (void) fd_nonblock(named_iofds[fileno], false);
761 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
762
763 case EXEC_OUTPUT_FILE:
764 case EXEC_OUTPUT_FILE_APPEND:
765 case EXEC_OUTPUT_FILE_TRUNCATE: {
766 bool rw;
767 int fd, flags;
768
769 assert(context->stdio_file[fileno]);
770
771 rw = context->std_input == EXEC_INPUT_FILE &&
772 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
773
774 if (rw)
775 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
776
777 flags = O_WRONLY;
778 if (o == EXEC_OUTPUT_FILE_APPEND)
779 flags |= O_APPEND;
780 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
781 flags |= O_TRUNC;
782
783 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
784 if (fd < 0)
785 return fd;
786
787 return move_fd(fd, fileno, 0);
788 }
789
790 default:
791 assert_not_reached();
792 }
793 }
794
795 static int chown_terminal(int fd, uid_t uid) {
796 int r;
797
798 assert(fd >= 0);
799
800 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
801 if (isatty(fd) < 1) {
802 if (IN_SET(errno, EINVAL, ENOTTY))
803 return 0; /* not a tty */
804
805 return -errno;
806 }
807
808 /* This might fail. What matters are the results. */
809 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
810 if (r < 0)
811 return r;
812
813 return 1;
814 }
815
816 static int setup_confirm_stdio(
817 const ExecContext *context,
818 const char *vc,
819 int *ret_saved_stdin,
820 int *ret_saved_stdout) {
821
822 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
823 unsigned rows, cols;
824 int r;
825
826 assert(ret_saved_stdin);
827 assert(ret_saved_stdout);
828
829 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
830 if (saved_stdin < 0)
831 return -errno;
832
833 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
834 if (saved_stdout < 0)
835 return -errno;
836
837 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
838 if (fd < 0)
839 return fd;
840
841 r = chown_terminal(fd, getuid());
842 if (r < 0)
843 return r;
844
845 r = reset_terminal_fd(fd, true);
846 if (r < 0)
847 return r;
848
849 r = exec_context_tty_size(context, &rows, &cols);
850 if (r < 0)
851 return r;
852
853 r = terminal_set_size_fd(fd, vc, rows, cols);
854 if (r < 0)
855 return r;
856
857 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
858 TAKE_FD(fd);
859 if (r < 0)
860 return r;
861
862 *ret_saved_stdin = TAKE_FD(saved_stdin);
863 *ret_saved_stdout = TAKE_FD(saved_stdout);
864 return 0;
865 }
866
867 static void write_confirm_error_fd(int err, int fd, const Unit *u) {
868 assert(err < 0);
869
870 if (err == -ETIMEDOUT)
871 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
872 else {
873 errno = -err;
874 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
875 }
876 }
877
878 static void write_confirm_error(int err, const char *vc, const Unit *u) {
879 _cleanup_close_ int fd = -EBADF;
880
881 assert(vc);
882
883 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
884 if (fd < 0)
885 return;
886
887 write_confirm_error_fd(err, fd, u);
888 }
889
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds back in place as stdin and
 * stdout, and closes the saved copies. Returns 0 on success; if restoring an fd failed a negative errno
 * is returned (that of the last failure), but both saved fds are closed in any case. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
911
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
917
918 static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
919 int saved_stdout = -1, saved_stdin = -1, r;
920 _cleanup_free_ char *e = NULL;
921 char c;
922
923 /* For any internal errors, assume a positive response. */
924 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
925 if (r < 0) {
926 write_confirm_error(r, vc, u);
927 return CONFIRM_EXECUTE;
928 }
929
930 /* confirm_spawn might have been disabled while we were sleeping. */
931 if (manager_is_confirm_spawn_disabled(u->manager)) {
932 r = 1;
933 goto restore_stdio;
934 }
935
936 e = ellipsize(cmdline, 60, 100);
937 if (!e) {
938 log_oom();
939 r = CONFIRM_EXECUTE;
940 goto restore_stdio;
941 }
942
943 for (;;) {
944 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
945 if (r < 0) {
946 write_confirm_error_fd(r, STDOUT_FILENO, u);
947 r = CONFIRM_EXECUTE;
948 goto restore_stdio;
949 }
950
951 switch (c) {
952 case 'c':
953 printf("Resuming normal execution.\n");
954 manager_disable_confirm_spawn();
955 r = 1;
956 break;
957 case 'D':
958 unit_dump(u, stdout, " ");
959 continue; /* ask again */
960 case 'f':
961 printf("Failing execution.\n");
962 r = CONFIRM_PRETEND_FAILURE;
963 break;
964 case 'h':
965 printf(" c - continue, proceed without asking anymore\n"
966 " D - dump, show the state of the unit\n"
967 " f - fail, don't execute the command and pretend it failed\n"
968 " h - help\n"
969 " i - info, show a short summary of the unit\n"
970 " j - jobs, show jobs that are in progress\n"
971 " s - skip, don't execute the command and pretend it succeeded\n"
972 " y - yes, execute the command\n");
973 continue; /* ask again */
974 case 'i':
975 printf(" Description: %s\n"
976 " Unit: %s\n"
977 " Command: %s\n",
978 u->id, u->description, cmdline);
979 continue; /* ask again */
980 case 'j':
981 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
982 continue; /* ask again */
983 case 'n':
984 /* 'n' was removed in favor of 'f'. */
985 printf("Didn't understand 'n', did you mean 'f'?\n");
986 continue; /* ask again */
987 case 's':
988 printf("Skipping execution.\n");
989 r = CONFIRM_PRETEND_SUCCESS;
990 break;
991 case 'y':
992 r = CONFIRM_EXECUTE;
993 break;
994 default:
995 assert_not_reached();
996 }
997 break;
998 }
999
1000 restore_stdio:
1001 restore_confirm_stdio(&saved_stdin, &saved_stdout);
1002 return r;
1003 }
1004
1005 static int get_fixed_user(const ExecContext *c, const char **user,
1006 uid_t *uid, gid_t *gid,
1007 const char **home, const char **shell) {
1008 int r;
1009 const char *name;
1010
1011 assert(c);
1012
1013 if (!c->user)
1014 return 0;
1015
1016 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1017 * (i.e. are "/" or "/bin/nologin"). */
1018
1019 name = c->user;
1020 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
1021 if (r < 0)
1022 return r;
1023
1024 *user = name;
1025 return 0;
1026 }
1027
1028 static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1029 int r;
1030 const char *name;
1031
1032 assert(c);
1033
1034 if (!c->group)
1035 return 0;
1036
1037 name = c->group;
1038 r = get_group_creds(&name, gid, 0);
1039 if (r < 0)
1040 return r;
1041
1042 *group = name;
1043 return 0;
1044 }
1045
1046 static int get_supplementary_groups(const ExecContext *c, const char *user,
1047 const char *group, gid_t gid,
1048 gid_t **supplementary_gids, int *ngids) {
1049 int r, k = 0;
1050 int ngroups_max;
1051 bool keep_groups = false;
1052 gid_t *groups = NULL;
1053 _cleanup_free_ gid_t *l_gids = NULL;
1054
1055 assert(c);
1056
1057 /*
1058 * If user is given, then lookup GID and supplementary groups list.
1059 * We avoid NSS lookups for gid=0. Also we have to initialize groups
1060 * here and as early as possible so we keep the list of supplementary
1061 * groups of the caller.
1062 */
1063 if (user && gid_is_valid(gid) && gid != 0) {
1064 /* First step, initialize groups from /etc/groups */
1065 if (initgroups(user, gid) < 0)
1066 return -errno;
1067
1068 keep_groups = true;
1069 }
1070
1071 if (strv_isempty(c->supplementary_groups))
1072 return 0;
1073
1074 /*
1075 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1076 * be positive, otherwise fail.
1077 */
1078 errno = 0;
1079 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
1080 if (ngroups_max <= 0)
1081 return errno_or_else(EOPNOTSUPP);
1082
1083 l_gids = new(gid_t, ngroups_max);
1084 if (!l_gids)
1085 return -ENOMEM;
1086
1087 if (keep_groups) {
1088 /*
1089 * Lookup the list of groups that the user belongs to, we
1090 * avoid NSS lookups here too for gid=0.
1091 */
1092 k = ngroups_max;
1093 if (getgrouplist(user, gid, l_gids, &k) < 0)
1094 return -EINVAL;
1095 } else
1096 k = 0;
1097
1098 STRV_FOREACH(i, c->supplementary_groups) {
1099 const char *g;
1100
1101 if (k >= ngroups_max)
1102 return -E2BIG;
1103
1104 g = *i;
1105 r = get_group_creds(&g, l_gids+k, 0);
1106 if (r < 0)
1107 return r;
1108
1109 k++;
1110 }
1111
1112 /*
1113 * Sets ngids to zero to drop all supplementary groups, happens
1114 * when we are under root and SupplementaryGroups= is empty.
1115 */
1116 if (k == 0) {
1117 *ngids = 0;
1118 return 0;
1119 }
1120
1121 /* Otherwise get the final list of supplementary groups */
1122 groups = memdup(l_gids, sizeof(gid_t) * k);
1123 if (!groups)
1124 return -ENOMEM;
1125
1126 *supplementary_gids = groups;
1127 *ngids = k;
1128
1129 groups = NULL;
1130
1131 return 0;
1132 }
1133
/* Applies the supplementary groups (if the list is non-empty) and then switches the real, effective and
 * saved GID to 'gid' (if it is valid). Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= handling, only if there is anything to set */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Now switch over to the GID itself */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1152
/* Updates the securebits of the calling thread: the bits in 'mask' are cleared, then the bits in 'bits'
 * are set. Returns 0 if the securebits already had the requested value (nothing is changed then), 1 if
 * they were modified, and -errno if querying or setting them failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned before, after;
        int queried;

        queried = prctl(PR_GET_SECUREBITS);
        if (queried < 0)
                return -errno;

        before = (unsigned) queried;
        after = (before & ~mask) | bits;

        if (after == before)
                return 0; /* nothing to change */

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1171
1172 static int enforce_user(
1173 const ExecContext *context,
1174 uid_t uid,
1175 uint64_t capability_ambient_set) {
1176 assert(context);
1177 int r;
1178
1179 if (!uid_is_valid(uid))
1180 return 0;
1181
1182 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1183 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1184 * case. */
1185
1186 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
1187
1188 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1189 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1190 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1191 if (r < 0)
1192 return r;
1193 }
1194
1195 /* Second step: actually set the uids */
1196 if (setresuid(uid, uid, uid) < 0)
1197 return -errno;
1198
1199 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1200 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1201 * outside of this call. */
1202 return 0;
1203 }
1204
#if HAVE_PAM

/* PAM conversation callback that refuses every conversation: none of the arguments are looked at and
 * PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        (void) num_msg;
        (void) msg;
        (void) resp;
        (void) appdata_ptr;

        /* Conversations are not supported */
        return PAM_CONV_ERR;
}

#endif
1219
/* Opens a PAM session and forks off the "(sd-pam)" helper process that tears the session down again.
 *
 * name      PAM service name handed to pam_start()
 * user      user name handed to pam_start()
 * uid, gid  credentials the helper child switches to before it starts waiting
 * tty       value for PAM_TTY; if NULL the TTY name behind stdin is used, if one can be determined
 * env       environment block: every entry is exported to PAM via pam_putenv(), and on success the block
 *           is replaced by the list returned by pam_getenvlist()
 * fds       file descriptors (n_fds of them) that are closed inside the helper child
 *
 * On success the result of strv_free_and_replace() is returned. On failure a negative errno-style value is
 * returned; failures reported by PAM itself are mapped to -EPERM. When built without PAM this is a no-op
 * that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* A failure to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again. */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Unlike most system calls sigwait() reports failure by returning a
                                 * positive error number; it never returns a negative value and does not
                                 * set errno. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
1436
1437 static void rename_process_from_path(const char *path) {
1438 _cleanup_free_ char *buf = NULL;
1439 const char *p;
1440
1441 assert(path);
1442
1443 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1444 * /bin/ps */
1445
1446 if (path_extract_filename(path, &buf) < 0) {
1447 rename_process("(...)");
1448 return;
1449 }
1450
1451 size_t l = strlen(buf);
1452 if (l > 8) {
1453 /* The end of the process name is usually more interesting, since the first bit might just be
1454 * "systemd-" */
1455 p = buf + l - 8;
1456 l = 8;
1457 } else
1458 p = buf;
1459
1460 char process_name[11];
1461 process_name[0] = '(';
1462 memcpy(process_name+1, p, l);
1463 process_name[1+l] = ')';
1464 process_name[1+l+1] = 0;
1465
1466 rename_process(process_name);
1467 }
1468
1469 static bool context_has_address_families(const ExecContext *c) {
1470 assert(c);
1471
1472 return c->address_families_allow_list ||
1473 !set_isempty(c->address_families);
1474 }
1475
1476 static bool context_has_syscall_filters(const ExecContext *c) {
1477 assert(c);
1478
1479 return c->syscall_allow_list ||
1480 !hashmap_isempty(c->syscall_filter);
1481 }
1482
1483 static bool context_has_syscall_logs(const ExecContext *c) {
1484 assert(c);
1485
1486 return c->syscall_log_allow_list ||
1487 !hashmap_isempty(c->syscall_log);
1488 }
1489
1490 static bool context_has_no_new_privileges(const ExecContext *c) {
1491 assert(c);
1492
1493 if (c->no_new_privileges)
1494 return true;
1495
1496 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
1497 return false;
1498
1499 /* We need NNP if we have any form of seccomp and are unprivileged */
1500 return c->lock_personality ||
1501 c->memory_deny_write_execute ||
1502 c->private_devices ||
1503 c->protect_clock ||
1504 c->protect_hostname ||
1505 c->protect_kernel_tunables ||
1506 c->protect_kernel_modules ||
1507 c->protect_kernel_logs ||
1508 context_has_address_families(c) ||
1509 exec_context_restrict_namespaces_set(c) ||
1510 c->restrict_realtime ||
1511 c->restrict_suid_sgid ||
1512 !set_isempty(c->syscall_archs) ||
1513 context_has_syscall_filters(c) ||
1514 context_has_syscall_logs(c);
1515 }
1516
1517 bool exec_context_has_credentials(const ExecContext *context) {
1518
1519 assert(context);
1520
1521 return !hashmap_isempty(context->set_credentials) ||
1522 !hashmap_isempty(context->load_credentials) ||
1523 !set_isempty(context->import_credentials);
1524 }
1525
1526 #if HAVE_SECCOMP
1527
1528 static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
1529
1530 if (is_seccomp_available())
1531 return false;
1532
1533 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
1534 return true;
1535 }
1536
1537 static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
1538 uint32_t negative_action, default_action, action;
1539 int r;
1540
1541 assert(u);
1542 assert(c);
1543
1544 if (!context_has_syscall_filters(c))
1545 return 0;
1546
1547 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1548 return 0;
1549
1550 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
1551
1552 if (c->syscall_allow_list) {
1553 default_action = negative_action;
1554 action = SCMP_ACT_ALLOW;
1555 } else {
1556 default_action = SCMP_ACT_ALLOW;
1557 action = negative_action;
1558 }
1559
1560 if (needs_ambient_hack) {
1561 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1562 if (r < 0)
1563 return r;
1564 }
1565
1566 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
1567 }
1568
1569 static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1570 #ifdef SCMP_ACT_LOG
1571 uint32_t default_action, action;
1572 #endif
1573
1574 assert(u);
1575 assert(c);
1576
1577 if (!context_has_syscall_logs(c))
1578 return 0;
1579
1580 #ifdef SCMP_ACT_LOG
1581 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1582 return 0;
1583
1584 if (c->syscall_log_allow_list) {
1585 /* Log nothing but the ones listed */
1586 default_action = SCMP_ACT_ALLOW;
1587 action = SCMP_ACT_LOG;
1588 } else {
1589 /* Log everything but the ones listed */
1590 default_action = SCMP_ACT_LOG;
1591 action = SCMP_ACT_ALLOW;
1592 }
1593
1594 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1595 #else
1596 /* old libseccomp */
1597 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1598 return 0;
1599 #endif
1600 }
1601
1602 static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1603 assert(u);
1604 assert(c);
1605
1606 if (set_isempty(c->syscall_archs))
1607 return 0;
1608
1609 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1610 return 0;
1611
1612 return seccomp_restrict_archs(c->syscall_archs);
1613 }
1614
1615 static int apply_address_families(const Unit* u, const ExecContext *c) {
1616 assert(u);
1617 assert(c);
1618
1619 if (!context_has_address_families(c))
1620 return 0;
1621
1622 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1623 return 0;
1624
1625 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
1626 }
1627
1628 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
1629 int r;
1630
1631 assert(u);
1632 assert(c);
1633
1634 if (!c->memory_deny_write_execute)
1635 return 0;
1636
1637 /* use prctl() if kernel supports it (6.3) */
1638 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1639 if (r == 0) {
1640 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1641 return 0;
1642 }
1643 if (r < 0 && errno != EINVAL)
1644 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1645 /* else use seccomp */
1646 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1647
1648 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1649 return 0;
1650
1651 return seccomp_memory_deny_write_execute();
1652 }
1653
1654 static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
1655 assert(u);
1656 assert(c);
1657
1658 if (!c->restrict_realtime)
1659 return 0;
1660
1661 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1662 return 0;
1663
1664 return seccomp_restrict_realtime();
1665 }
1666
1667 static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1668 assert(u);
1669 assert(c);
1670
1671 if (!c->restrict_suid_sgid)
1672 return 0;
1673
1674 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1675 return 0;
1676
1677 return seccomp_restrict_suid_sgid();
1678 }
1679
1680 static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
1681 assert(u);
1682 assert(c);
1683
1684 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1685 * let's protect even those systems where this is left on in the kernel. */
1686
1687 if (!c->protect_kernel_tunables)
1688 return 0;
1689
1690 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1691 return 0;
1692
1693 return seccomp_protect_sysctl();
1694 }
1695
1696 static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
1697 assert(u);
1698 assert(c);
1699
1700 /* Turn off module syscalls on ProtectKernelModules=yes */
1701
1702 if (!c->protect_kernel_modules)
1703 return 0;
1704
1705 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1706 return 0;
1707
1708 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
1709 }
1710
1711 static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1712 assert(u);
1713 assert(c);
1714
1715 if (!c->protect_kernel_logs)
1716 return 0;
1717
1718 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1719 return 0;
1720
1721 return seccomp_protect_syslog();
1722 }
1723
1724 static int apply_protect_clock(const Unit *u, const ExecContext *c) {
1725 assert(u);
1726 assert(c);
1727
1728 if (!c->protect_clock)
1729 return 0;
1730
1731 if (skip_seccomp_unavailable(u, "ProtectClock="))
1732 return 0;
1733
1734 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1735 }
1736
1737 static int apply_private_devices(const Unit *u, const ExecContext *c) {
1738 assert(u);
1739 assert(c);
1740
1741 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
1742
1743 if (!c->private_devices)
1744 return 0;
1745
1746 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1747 return 0;
1748
1749 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
1750 }
1751
1752 static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
1753 assert(u);
1754 assert(c);
1755
1756 if (!exec_context_restrict_namespaces_set(c))
1757 return 0;
1758
1759 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1760 return 0;
1761
1762 return seccomp_restrict_namespaces(c->restrict_namespaces);
1763 }
1764
1765 static int apply_lock_personality(const Unit* u, const ExecContext *c) {
1766 unsigned long personality;
1767 int r;
1768
1769 assert(u);
1770 assert(c);
1771
1772 if (!c->lock_personality)
1773 return 0;
1774
1775 if (skip_seccomp_unavailable(u, "LockPersonality="))
1776 return 0;
1777
1778 personality = c->personality;
1779
1780 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1781 if (personality == PERSONALITY_INVALID) {
1782
1783 r = opinionated_personality(&personality);
1784 if (r < 0)
1785 return r;
1786 }
1787
1788 return seccomp_lock_personality(personality);
1789 }
1790
1791 #endif
1792
1793 #if HAVE_LIBBPF
1794 static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1795 assert(u);
1796 assert(c);
1797
1798 if (!exec_context_restrict_filesystems_set(c))
1799 return 0;
1800
1801 if (!u->manager->restrict_fs) {
1802 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1803 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
1804 return 0;
1805 }
1806
1807 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1808 }
1809 #endif
1810
1811 static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
1812 assert(u);
1813 assert(c);
1814
1815 if (!c->protect_hostname)
1816 return 0;
1817
1818 if (ns_type_supported(NAMESPACE_UTS)) {
1819 if (unshare(CLONE_NEWUTS) < 0) {
1820 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1821 *ret_exit_status = EXIT_NAMESPACE;
1822 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1823 }
1824
1825 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1826 }
1827 } else
1828 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1829
1830 #if HAVE_SECCOMP
1831 int r;
1832
1833 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1834 return 0;
1835
1836 r = seccomp_protect_hostname();
1837 if (r < 0) {
1838 *ret_exit_status = EXIT_SECCOMP;
1839 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1840 }
1841 #endif
1842
1843 return 0;
1844 }
1845
1846 static void do_idle_pipe_dance(int idle_pipe[static 4]) {
1847 assert(idle_pipe);
1848
1849 idle_pipe[1] = safe_close(idle_pipe[1]);
1850 idle_pipe[2] = safe_close(idle_pipe[2]);
1851
1852 if (idle_pipe[0] >= 0) {
1853 int r;
1854
1855 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1856
1857 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
1858 ssize_t n;
1859
1860 /* Signal systemd that we are bored and want to continue. */
1861 n = write(idle_pipe[3], "x", 1);
1862 if (n > 0)
1863 /* Wait for systemd to react to the signal above. */
1864 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
1865 }
1866
1867 idle_pipe[0] = safe_close(idle_pipe[0]);
1868
1869 }
1870
1871 idle_pipe[3] = safe_close(idle_pipe[3]);
1872 }
1873
1874 static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1875
1876 static int build_environment(
1877 const Unit *u,
1878 const ExecContext *c,
1879 const ExecParameters *p,
1880 const CGroupContext *cgroup_context,
1881 size_t n_fds,
1882 char **fdnames,
1883 const char *home,
1884 const char *username,
1885 const char *shell,
1886 dev_t journal_stream_dev,
1887 ino_t journal_stream_ino,
1888 const char *memory_pressure_path,
1889 char ***ret) {
1890
1891 _cleanup_strv_free_ char **our_env = NULL;
1892 size_t n_env = 0;
1893 char *x;
1894 int r;
1895
1896 assert(u);
1897 assert(c);
1898 assert(p);
1899 assert(ret);
1900
1901 #define N_ENV_VARS 19
1902 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1903 if (!our_env)
1904 return -ENOMEM;
1905
1906 if (n_fds > 0) {
1907 _cleanup_free_ char *joined = NULL;
1908
1909 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
1910 return -ENOMEM;
1911 our_env[n_env++] = x;
1912
1913 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
1914 return -ENOMEM;
1915 our_env[n_env++] = x;
1916
1917 joined = strv_join(fdnames, ":");
1918 if (!joined)
1919 return -ENOMEM;
1920
1921 x = strjoin("LISTEN_FDNAMES=", joined);
1922 if (!x)
1923 return -ENOMEM;
1924 our_env[n_env++] = x;
1925 }
1926
1927 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
1928 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
1929 return -ENOMEM;
1930 our_env[n_env++] = x;
1931
1932 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
1933 return -ENOMEM;
1934 our_env[n_env++] = x;
1935 }
1936
1937 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1938 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1939 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1940 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1941 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
1942 if (!x)
1943 return -ENOMEM;
1944 our_env[n_env++] = x;
1945 }
1946
1947 if (home) {
1948 x = strjoin("HOME=", home);
1949 if (!x)
1950 return -ENOMEM;
1951
1952 path_simplify(x + 5);
1953 our_env[n_env++] = x;
1954 }
1955
1956 if (username) {
1957 x = strjoin("LOGNAME=", username);
1958 if (!x)
1959 return -ENOMEM;
1960 our_env[n_env++] = x;
1961
1962 x = strjoin("USER=", username);
1963 if (!x)
1964 return -ENOMEM;
1965 our_env[n_env++] = x;
1966 }
1967
1968 if (shell) {
1969 x = strjoin("SHELL=", shell);
1970 if (!x)
1971 return -ENOMEM;
1972
1973 path_simplify(x + 6);
1974 our_env[n_env++] = x;
1975 }
1976
1977 if (!sd_id128_is_null(u->invocation_id)) {
1978 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1979 return -ENOMEM;
1980
1981 our_env[n_env++] = x;
1982 }
1983
1984 if (exec_context_needs_term(c)) {
1985 _cleanup_free_ char *cmdline = NULL;
1986 const char *tty_path, *term = NULL;
1987
1988 tty_path = exec_context_tty_path(c);
1989
1990 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1991 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1992 * container manager passes to PID 1 ends up all the way in the console login shown. */
1993
1994 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
1995 term = getenv("TERM");
1996 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1997 _cleanup_free_ char *key = NULL;
1998
1999 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
2000 if (!key)
2001 return -ENOMEM;
2002
2003 r = proc_cmdline_get_key(key, 0, &cmdline);
2004 if (r < 0)
2005 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2006 else if (r > 0)
2007 term = cmdline;
2008 }
2009
2010 if (!term)
2011 term = default_term_for_tty(tty_path);
2012
2013 x = strjoin("TERM=", term);
2014 if (!x)
2015 return -ENOMEM;
2016 our_env[n_env++] = x;
2017 }
2018
2019 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2020 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2021 return -ENOMEM;
2022
2023 our_env[n_env++] = x;
2024 }
2025
2026 if (c->log_namespace) {
2027 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2028 if (!x)
2029 return -ENOMEM;
2030
2031 our_env[n_env++] = x;
2032 }
2033
2034 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2035 _cleanup_free_ char *joined = NULL;
2036 const char *n;
2037
2038 if (!p->prefix[t])
2039 continue;
2040
2041 if (c->directories[t].n_items == 0)
2042 continue;
2043
2044 n = exec_directory_env_name_to_string(t);
2045 if (!n)
2046 continue;
2047
2048 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2049 _cleanup_free_ char *prefixed = NULL;
2050
2051 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2052 if (!prefixed)
2053 return -ENOMEM;
2054
2055 if (!strextend_with_separator(&joined, ":", prefixed))
2056 return -ENOMEM;
2057 }
2058
2059 x = strjoin(n, "=", joined);
2060 if (!x)
2061 return -ENOMEM;
2062
2063 our_env[n_env++] = x;
2064 }
2065
2066 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2067 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2068 if (!x)
2069 return -ENOMEM;
2070
2071 our_env[n_env++] = x;
2072 }
2073
2074 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2075 return -ENOMEM;
2076
2077 our_env[n_env++] = x;
2078
2079 if (memory_pressure_path) {
2080 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2081 if (!x)
2082 return -ENOMEM;
2083
2084 our_env[n_env++] = x;
2085
2086 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2087 _cleanup_free_ char *b = NULL, *e = NULL;
2088
2089 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2090 MEMORY_PRESSURE_DEFAULT_TYPE,
2091 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2092 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2093 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2094 return -ENOMEM;
2095
2096 if (base64mem(b, strlen(b) + 1, &e) < 0)
2097 return -ENOMEM;
2098
2099 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2100 if (!x)
2101 return -ENOMEM;
2102
2103 our_env[n_env++] = x;
2104 }
2105 }
2106
2107 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2108 #undef N_ENV_VARS
2109
2110 *ret = TAKE_PTR(our_env);
2111
2112 return 0;
2113 }
2114
2115 static int build_pass_environment(const ExecContext *c, char ***ret) {
2116 _cleanup_strv_free_ char **pass_env = NULL;
2117 size_t n_env = 0;
2118
2119 STRV_FOREACH(i, c->pass_environment) {
2120 _cleanup_free_ char *x = NULL;
2121 char *v;
2122
2123 v = getenv(*i);
2124 if (!v)
2125 continue;
2126 x = strjoin(*i, "=", v);
2127 if (!x)
2128 return -ENOMEM;
2129
2130 if (!GREEDY_REALLOC(pass_env, n_env + 2))
2131 return -ENOMEM;
2132
2133 pass_env[n_env++] = TAKE_PTR(x);
2134 pass_env[n_env] = NULL;
2135 }
2136
2137 *ret = TAKE_PTR(pass_env);
2138
2139 return 0;
2140 }
2141
2142 bool exec_needs_network_namespace(const ExecContext *context) {
2143 assert(context);
2144
2145 return context->private_network || context->network_namespace_path;
2146 }
2147
2148 static bool exec_needs_ephemeral(const ExecContext *context) {
2149 return (context->root_image || context->root_directory) && context->root_ephemeral;
2150 }
2151
2152 static bool exec_needs_ipc_namespace(const ExecContext *context) {
2153 assert(context);
2154
2155 return context->private_ipc || context->ipc_namespace_path;
2156 }
2157
2158 bool exec_needs_mount_namespace(
2159 const ExecContext *context,
2160 const ExecParameters *params,
2161 const ExecRuntime *runtime) {
2162
2163 assert(context);
2164
2165 if (context->root_image)
2166 return true;
2167
2168 if (!strv_isempty(context->read_write_paths) ||
2169 !strv_isempty(context->read_only_paths) ||
2170 !strv_isempty(context->inaccessible_paths) ||
2171 !strv_isempty(context->exec_paths) ||
2172 !strv_isempty(context->no_exec_paths))
2173 return true;
2174
2175 if (context->n_bind_mounts > 0)
2176 return true;
2177
2178 if (context->n_temporary_filesystems > 0)
2179 return true;
2180
2181 if (context->n_mount_images > 0)
2182 return true;
2183
2184 if (context->n_extension_images > 0)
2185 return true;
2186
2187 if (!strv_isempty(context->extension_directories))
2188 return true;
2189
2190 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
2191 return true;
2192
2193 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
2194 return true;
2195
2196 if (context->private_devices ||
2197 context->private_mounts > 0 ||
2198 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
2199 context->protect_system != PROTECT_SYSTEM_NO ||
2200 context->protect_home != PROTECT_HOME_NO ||
2201 context->protect_kernel_tunables ||
2202 context->protect_kernel_modules ||
2203 context->protect_kernel_logs ||
2204 context->protect_control_groups ||
2205 context->protect_proc != PROTECT_PROC_DEFAULT ||
2206 context->proc_subset != PROC_SUBSET_ALL ||
2207 exec_needs_ipc_namespace(context))
2208 return true;
2209
2210 if (context->root_directory) {
2211 if (exec_context_get_effective_mount_apivfs(context))
2212 return true;
2213
2214 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2215 if (params && !params->prefix[t])
2216 continue;
2217
2218 if (context->directories[t].n_items > 0)
2219 return true;
2220 }
2221 }
2222
2223 if (context->dynamic_user &&
2224 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2225 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2226 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
2227 return true;
2228
2229 if (context->log_namespace)
2230 return true;
2231
2232 return false;
2233 }
2234
2235 static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
2236 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2237 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
2238 _cleanup_close_ int unshare_ready_fd = -EBADF;
2239 _cleanup_(sigkill_waitp) pid_t pid = 0;
2240 uint64_t c = 1;
2241 ssize_t n;
2242 int r;
2243
2244 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2245 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
2246 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2247 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2248 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2249 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
2250 * continues execution normally.
2251 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2252 * does not need CAP_SETUID to write the single line mapping to itself. */
2253
2254 /* Can only set up multiple mappings with CAP_SETUID. */
2255 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
2256 r = asprintf(&uid_map,
2257 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
2258 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
2259 ouid, ouid, uid, uid);
2260 else
2261 r = asprintf(&uid_map,
2262 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2263 ouid, ouid);
2264
2265 if (r < 0)
2266 return -ENOMEM;
2267
2268 /* Can only set up multiple mappings with CAP_SETGID. */
2269 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
2270 r = asprintf(&gid_map,
2271 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
2272 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
2273 ogid, ogid, gid, gid);
2274 else
2275 r = asprintf(&gid_map,
2276 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2277 ogid, ogid);
2278
2279 if (r < 0)
2280 return -ENOMEM;
2281
2282 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2283 * namespace. */
2284 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2285 if (unshare_ready_fd < 0)
2286 return -errno;
2287
2288 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2289 * failed. */
2290 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2291 return -errno;
2292
2293 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2294 if (r < 0)
2295 return r;
2296 if (r == 0) {
2297 _cleanup_close_ int fd = -EBADF;
2298 const char *a;
2299 pid_t ppid;
2300
2301 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2302 * here, after the parent opened its own user namespace. */
2303
2304 ppid = getppid();
2305 errno_pipe[0] = safe_close(errno_pipe[0]);
2306
2307 /* Wait until the parent unshared the user namespace */
2308 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2309 r = -errno;
2310 goto child_fail;
2311 }
2312
2313 /* Disable the setgroups() system call in the child user namespace, for good. */
2314 a = procfs_file_alloca(ppid, "setgroups");
2315 fd = open(a, O_WRONLY|O_CLOEXEC);
2316 if (fd < 0) {
2317 if (errno != ENOENT) {
2318 r = -errno;
2319 goto child_fail;
2320 }
2321
2322 /* If the file is missing the kernel is too old, let's continue anyway. */
2323 } else {
2324 if (write(fd, "deny\n", 5) < 0) {
2325 r = -errno;
2326 goto child_fail;
2327 }
2328
2329 fd = safe_close(fd);
2330 }
2331
2332 /* First write the GID map */
2333 a = procfs_file_alloca(ppid, "gid_map");
2334 fd = open(a, O_WRONLY|O_CLOEXEC);
2335 if (fd < 0) {
2336 r = -errno;
2337 goto child_fail;
2338 }
2339 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2340 r = -errno;
2341 goto child_fail;
2342 }
2343 fd = safe_close(fd);
2344
2345 /* Then write the UID map */
2346 a = procfs_file_alloca(ppid, "uid_map");
2347 fd = open(a, O_WRONLY|O_CLOEXEC);
2348 if (fd < 0) {
2349 r = -errno;
2350 goto child_fail;
2351 }
2352 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2353 r = -errno;
2354 goto child_fail;
2355 }
2356
2357 _exit(EXIT_SUCCESS);
2358
2359 child_fail:
2360 (void) write(errno_pipe[1], &r, sizeof(r));
2361 _exit(EXIT_FAILURE);
2362 }
2363
2364 errno_pipe[1] = safe_close(errno_pipe[1]);
2365
2366 if (unshare(CLONE_NEWUSER) < 0)
2367 return -errno;
2368
2369 /* Let the child know that the namespace is ready now */
2370 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2371 return -errno;
2372
2373 /* Try to read an error code from the child */
2374 n = read(errno_pipe[0], &r, sizeof(r));
2375 if (n < 0)
2376 return -errno;
2377 if (n == sizeof(r)) { /* an error code was sent to us */
2378 if (r < 0)
2379 return r;
2380 return -EIO;
2381 }
2382 if (n != 0) /* on success we should have read 0 bytes */
2383 return -EIO;
2384
2385 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
2386 if (r < 0)
2387 return r;
2388 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
2389 return -EIO;
2390
2391 return 0;
2392 }
2393
2394 static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2395 assert(context);
2396
2397 if (!context->dynamic_user)
2398 return false;
2399
2400 if (type == EXEC_DIRECTORY_CONFIGURATION)
2401 return false;
2402
2403 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2404 return false;
2405
2406 return true;
2407 }
2408
2409 static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2410 _cleanup_free_ char *src_abs = NULL;
2411 int r;
2412
2413 assert(source);
2414
2415 src_abs = path_join(root, source);
2416 if (!src_abs)
2417 return -ENOMEM;
2418
2419 STRV_FOREACH(dst, symlinks) {
2420 _cleanup_free_ char *dst_abs = NULL;
2421
2422 dst_abs = path_join(root, *dst);
2423 if (!dst_abs)
2424 return -ENOMEM;
2425
2426 r = mkdir_parents_label(dst_abs, 0755);
2427 if (r < 0)
2428 return r;
2429
2430 r = symlink_idempotent(src_abs, dst_abs, true);
2431 if (r < 0)
2432 return r;
2433 }
2434
2435 return 0;
2436 }
2437
2438 static int setup_exec_directory(
2439 Unit *u,
2440 const ExecContext *context,
2441 const ExecParameters *params,
2442 uid_t uid,
2443 gid_t gid,
2444 ExecDirectoryType type,
2445 bool needs_mount_namespace,
2446 int *exit_status) {
2447
2448 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
2449 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2450 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2451 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2452 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2453 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2454 };
2455 int r;
2456
2457 assert(context);
2458 assert(params);
2459 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
2460 assert(exit_status);
2461
2462 if (!params->prefix[type])
2463 return 0;
2464
2465 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
2466 if (!uid_is_valid(uid))
2467 uid = 0;
2468 if (!gid_is_valid(gid))
2469 gid = 0;
2470 }
2471
2472 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2473 _cleanup_free_ char *p = NULL, *pp = NULL;
2474
2475 p = path_join(params->prefix[type], context->directories[type].items[i].path);
2476 if (!p) {
2477 r = -ENOMEM;
2478 goto fail;
2479 }
2480
2481 r = mkdir_parents_label(p, 0755);
2482 if (r < 0)
2483 goto fail;
2484
2485 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2486
2487 /* If we are in user mode, and a configuration directory exists but a state directory
2488 * doesn't exist, then we likely are upgrading from an older systemd version that
2489 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2490 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2491 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
2492 * separated. If a service has both dirs configured but only the configuration dir
2493 * exists and the state dir does not, we assume we are looking at an update
2494 * situation. Hence, create a compatibility symlink, so that all expectations are
2495 * met.
2496 *
2497 * (We also do something similar with the log directory, which still doesn't exist in
2498 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2499
2500 /* this assumes the state dir is always created before the configuration dir */
2501 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2502 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2503
2504 r = laccess(p, F_OK);
2505 if (r == -ENOENT) {
2506 _cleanup_free_ char *q = NULL;
2507
2508 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2509 * under the configuration hierarchy. */
2510
2511 if (type == EXEC_DIRECTORY_STATE)
2512 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2513 else if (type == EXEC_DIRECTORY_LOGS)
2514 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2515 else
2516 assert_not_reached();
2517 if (!q) {
2518 r = -ENOMEM;
2519 goto fail;
2520 }
2521
2522 r = laccess(q, F_OK);
2523 if (r >= 0) {
2524 /* It does exist! This hence looks like an update. Symlink the
2525 * configuration directory into the state directory. */
2526
2527 r = symlink_idempotent(q, p, /* make_relative= */ true);
2528 if (r < 0)
2529 goto fail;
2530
2531 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
2532 continue;
2533 } else if (r != -ENOENT)
2534 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
2535
2536 } else if (r < 0)
2537 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
2538 }
2539
2540 if (exec_directory_is_private(context, type)) {
2541 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2542 * case we want to avoid leaving a directory around fully accessible that is owned by
2543 * a dynamic user whose UID is later on reused. To lock this down we use the same
2544 * trick used by container managers to prohibit host users to get access to files of
2545 * the same UID in containers: we place everything inside a directory that has an
2546 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2547 * for unprivileged host code. We then use fs namespacing to make this directory
2548 * permeable for the service itself.
2549 *
2550 * Specifically: for a service which wants a special directory "foo/" we first create
2551 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2552 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2553 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2554 * unprivileged host users can't look into it. Inside of the namespace of the unit
2555 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2556 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2557 * for the service and making sure it only gets access to the dirs it needs but no
2558 * others. Tricky? Yes, absolutely, but it works!
2559 *
2560 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2561 * to be owned by the service itself.
2562 *
2563 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2564 * for sharing files or sockets with other services. */
2565
2566 pp = path_join(params->prefix[type], "private");
2567 if (!pp) {
2568 r = -ENOMEM;
2569 goto fail;
2570 }
2571
2572 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
2573 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
2574 if (r < 0)
2575 goto fail;
2576
2577 if (!path_extend(&pp, context->directories[type].items[i].path)) {
2578 r = -ENOMEM;
2579 goto fail;
2580 }
2581
2582 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2583 r = mkdir_parents_label(pp, 0755);
2584 if (r < 0)
2585 goto fail;
2586
2587 if (is_dir(p, false) > 0 &&
2588 (laccess(pp, F_OK) == -ENOENT)) {
2589
2590 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2591 * it over. Most likely the service has been upgraded from one that didn't use
2592 * DynamicUser=1, to one that does. */
2593
2594 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2595 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2596 exec_directory_type_to_string(type), p, pp);
2597
2598 r = RET_NERRNO(rename(p, pp));
2599 if (r < 0)
2600 goto fail;
2601 } else {
2602 /* Otherwise, create the actual directory for the service */
2603
2604 r = mkdir_label(pp, context->directories[type].mode);
2605 if (r < 0 && r != -EEXIST)
2606 goto fail;
2607 }
2608
2609 if (!context->directories[type].items[i].only_create) {
2610 /* And link it up from the original place.
2611 * Notes
2612 * 1) If a mount namespace is going to be used, then this symlink remains on
2613 * the host, and a new one for the child namespace will be created later.
2614 * 2) It is not necessary to create this symlink when one of its parent
2615 * directories is specified and already created. E.g.
2616 * StateDirectory=foo foo/bar
2617 * In that case, the inode points to pp and p for "foo/bar" are the same:
2618 * pp = "/var/lib/private/foo/bar"
2619 * p = "/var/lib/foo/bar"
2620 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2621 * we do not need to create the symlink, but we cannot create the symlink.
2622 * See issue #24783. */
2623 r = symlink_idempotent(pp, p, true);
2624 if (r < 0)
2625 goto fail;
2626 }
2627
2628 } else {
2629 _cleanup_free_ char *target = NULL;
2630
2631 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2632 readlink_and_make_absolute(p, &target) >= 0) {
2633 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
2634
2635 /* This already exists and is a symlink? Interesting. Maybe it's one created
2636 * by DynamicUser=1 (see above)?
2637 *
2638 * We do this for all directory types except for ConfigurationDirectory=,
2639 * since they all support the private/ symlink logic at least in some
2640 * configurations, see above. */
2641
2642 r = chase(target, NULL, 0, &target_resolved, NULL);
2643 if (r < 0)
2644 goto fail;
2645
2646 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
2647 if (!q) {
2648 r = -ENOMEM;
2649 goto fail;
2650 }
2651
2652 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2653 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2654 if (r < 0)
2655 goto fail;
2656
2657 if (path_equal(q_resolved, target_resolved)) {
2658
2659 /* Hmm, apparently DynamicUser= was once turned on for this service,
2660 * but is no longer. Let's move the directory back up. */
2661
2662 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2663 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2664 exec_directory_type_to_string(type), q, p);
2665
2666 r = RET_NERRNO(unlink(p));
2667 if (r < 0)
2668 goto fail;
2669
2670 r = RET_NERRNO(rename(q, p));
2671 if (r < 0)
2672 goto fail;
2673 }
2674 }
2675
2676 r = mkdir_label(p, context->directories[type].mode);
2677 if (r < 0) {
2678 if (r != -EEXIST)
2679 goto fail;
2680
2681 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2682 struct stat st;
2683
2684 /* Don't change the owner/access mode of the configuration directory,
2685 * as in the common case it is not written to by a service, and shall
2686 * not be writable. */
2687
2688 r = RET_NERRNO(stat(p, &st));
2689 if (r < 0)
2690 goto fail;
2691
2692 /* Still complain if the access mode doesn't match */
2693 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2694 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2695 "(File system: %o %sMode: %o)",
2696 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2697 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2698
2699 continue;
2700 }
2701 }
2702 }
2703
2704 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
2705 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
2706 * current UID/GID ownership.) */
2707 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2708 if (r < 0)
2709 goto fail;
2710
2711 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2712 * available to user code anyway */
2713 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2714 continue;
2715
2716 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2717 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2718 * assignments to exist. */
2719 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
2720 if (r < 0)
2721 goto fail;
2722 }
2723
2724 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2725 * they are set up later, to allow configuring empty var/run/etc. */
2726 if (!needs_mount_namespace)
2727 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2728 r = create_many_symlinks(params->prefix[type],
2729 context->directories[type].items[i].path,
2730 context->directories[type].items[i].symlinks);
2731 if (r < 0)
2732 goto fail;
2733 }
2734
2735 return 0;
2736
2737 fail:
2738 *exit_status = exit_status_table[type];
2739 return r;
2740 }
2741
2742 static int write_credential(
2743 int dfd,
2744 const char *id,
2745 const void *data,
2746 size_t size,
2747 uid_t uid,
2748 bool ownership_ok) {
2749
2750 _cleanup_(unlink_and_freep) char *tmp = NULL;
2751 _cleanup_close_ int fd = -EBADF;
2752 int r;
2753
2754 r = tempfn_random_child("", "cred", &tmp);
2755 if (r < 0)
2756 return r;
2757
2758 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2759 if (fd < 0) {
2760 tmp = mfree(tmp);
2761 return -errno;
2762 }
2763
2764 r = loop_write(fd, data, size, /* do_poll = */ false);
2765 if (r < 0)
2766 return r;
2767
2768 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2769 return -errno;
2770
2771 if (uid_is_valid(uid) && uid != getuid()) {
2772 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
2773 if (r < 0) {
2774 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2775 return r;
2776
2777 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2778 * to express: that the user gets read access and nothing
2779 * else. But if the backing fs can't support that (e.g. ramfs)
2780 * then we can use file ownership instead. But that's only safe if
2781 * we can then re-mount the whole thing read-only, so that the
2782 * user can no longer chmod() the file to gain write access. */
2783 return r;
2784
2785 if (fchown(fd, uid, GID_INVALID) < 0)
2786 return -errno;
2787 }
2788 }
2789
2790 if (renameat(dfd, tmp, dfd, id) < 0)
2791 return -errno;
2792
2793 tmp = mfree(tmp);
2794 return 0;
2795 }
2796
/* Selects which set of directories credential_search_path() shall assemble. */
typedef enum CredentialSearchPath {
        CREDENTIAL_SEARCH_PATH_TRUSTED = 0,      /* Received credentials directory + "credstore" directories */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED,        /* Received encrypted credentials directory + "credstore.encrypted" directories */
        CREDENTIAL_SEARCH_PATH_ALL,              /* Both of the above, the encrypted ones first */
        _CREDENTIAL_SEARCH_PATH_MAX,
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
2804
2805 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
2806
2807 _cleanup_strv_free_ char **l = NULL;
2808
2809 assert(params);
2810 assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
2811
2812 /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
2813 * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
2814 * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
2815
2816 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2817 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2818 return NULL;
2819
2820 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2821 return NULL;
2822 }
2823
2824 if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
2825 if (params->received_credentials_directory)
2826 if (strv_extend(&l, params->received_credentials_directory) < 0)
2827 return NULL;
2828
2829 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2830 return NULL;
2831 }
2832
2833 if (DEBUG_LOGGING) {
2834 _cleanup_free_ char *t = strv_join(l, ":");
2835
2836 log_debug("Credential search path is: %s", strempty(t));
2837 }
2838
2839 return TAKE_PTR(l);
2840 }
2841
2842 static int maybe_decrypt_and_write_credential(
2843 int dir_fd,
2844 const char *id,
2845 bool encrypted,
2846 uid_t uid,
2847 bool ownership_ok,
2848 const char *data,
2849 size_t size,
2850 uint64_t *left) {
2851
2852 _cleanup_free_ void *plaintext = NULL;
2853 size_t add;
2854 int r;
2855
2856 if (encrypted) {
2857 size_t plaintext_size = 0;
2858
2859 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
2860 &plaintext, &plaintext_size);
2861 if (r < 0)
2862 return r;
2863
2864 data = plaintext;
2865 size = plaintext_size;
2866 }
2867
2868 add = strlen(id) + size;
2869 if (add > *left)
2870 return -E2BIG;
2871
2872 r = write_credential(dir_fd, id, data, size, uid, ownership_ok);
2873 if (r < 0)
2874 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
2875
2876 *left -= add;
2877 return 0;
2878 }
2879
2880 static int load_credential_glob(
2881 const char *path,
2882 bool encrypted,
2883 char **search_path,
2884 ReadFullFileFlags flags,
2885 int write_dfd,
2886 uid_t uid,
2887 bool ownership_ok,
2888 uint64_t *left) {
2889
2890 int r;
2891
2892 STRV_FOREACH(d, search_path) {
2893 _cleanup_globfree_ glob_t pglob = {};
2894 _cleanup_free_ char *j = NULL;
2895
2896 j = path_join(*d, path);
2897 if (!j)
2898 return -ENOMEM;
2899
2900 r = safe_glob(j, 0, &pglob);
2901 if (r == -ENOENT)
2902 continue;
2903 if (r < 0)
2904 return r;
2905
2906 for (size_t n = 0; n < pglob.gl_pathc; n++) {
2907 _cleanup_free_ char *fn = NULL;
2908 _cleanup_(erase_and_freep) char *data = NULL;
2909 size_t size;
2910
2911 /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2912 r = read_full_file_full(
2913 AT_FDCWD,
2914 pglob.gl_pathv[n],
2915 UINT64_MAX,
2916 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2917 flags,
2918 NULL,
2919 &data, &size);
2920 if (r < 0)
2921 return log_debug_errno(r, "Failed to read credential '%s': %m",
2922 pglob.gl_pathv[n]);
2923
2924 r = path_extract_filename(pglob.gl_pathv[n], &fn);
2925 if (r < 0)
2926 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
2927 pglob.gl_pathv[n]);
2928
2929 r = maybe_decrypt_and_write_credential(
2930 write_dfd,
2931 fn,
2932 encrypted,
2933 uid,
2934 ownership_ok,
2935 data, size,
2936 left);
2937 if (r == -EEXIST)
2938 continue;
2939 if (r < 0)
2940 return r;
2941 }
2942 }
2943
2944 return 0;
2945 }
2946
2947 static int load_credential(
2948 const ExecContext *context,
2949 const ExecParameters *params,
2950 const char *id,
2951 const char *path,
2952 bool encrypted,
2953 const char *unit,
2954 int read_dfd,
2955 int write_dfd,
2956 uid_t uid,
2957 bool ownership_ok,
2958 uint64_t *left) {
2959
2960 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2961 _cleanup_strv_free_ char **search_path = NULL;
2962 _cleanup_(erase_and_freep) char *data = NULL;
2963 _cleanup_free_ char *bindname = NULL;
2964 const char *source = NULL;
2965 bool missing_ok = true;
2966 size_t size, maxsz;
2967 int r;
2968
2969 assert(context);
2970 assert(params);
2971 assert(id);
2972 assert(path);
2973 assert(unit);
2974 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
2975 assert(write_dfd >= 0);
2976 assert(left);
2977
2978 if (read_dfd >= 0) {
2979 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2980 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2981 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2982 * open it. */
2983
2984 if (!filename_is_valid(path)) /* safety check */
2985 return -EINVAL;
2986
2987 missing_ok = true;
2988 source = path;
2989
2990 } else if (path_is_absolute(path)) {
2991 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2992 * sockets */
2993
2994 if (!path_is_valid(path)) /* safety check */
2995 return -EINVAL;
2996
2997 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2998
2999 /* Pass some minimal info about the unit and the credential name we are looking to acquire
3000 * via the source socket address in case we read off an AF_UNIX socket. */
3001 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3002 return -ENOMEM;
3003
3004 missing_ok = false;
3005 source = path;
3006
3007 } else if (credential_name_valid(path)) {
3008 /* If this is a relative path, take it as credential name relative to the credentials
3009 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
3010 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
3011
3012 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
3013 if (!search_path)
3014 return -ENOMEM;
3015
3016 missing_ok = true;
3017 } else
3018 source = NULL;
3019
3020 if (encrypted)
3021 flags |= READ_FULL_FILE_UNBASE64;
3022
3023 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
3024
3025 if (search_path) {
3026 STRV_FOREACH(d, search_path) {
3027 _cleanup_free_ char *j = NULL;
3028
3029 j = path_join(*d, path);
3030 if (!j)
3031 return -ENOMEM;
3032
3033 r = read_full_file_full(
3034 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
3035 UINT64_MAX,
3036 maxsz,
3037 flags,
3038 NULL,
3039 &data, &size);
3040 if (r != -ENOENT)
3041 break;
3042 }
3043 } else if (source)
3044 r = read_full_file_full(
3045 read_dfd, source,
3046 UINT64_MAX,
3047 maxsz,
3048 flags,
3049 bindname,
3050 &data, &size);
3051 else
3052 r = -ENOENT;
3053
3054 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3055 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
3056 * will get clear errors if we don't pass such a missing credential on as they
3057 * themselves will get ENOENT when trying to read them, which should not be much
3058 * worse than when we handle the error here and make it fatal.
3059 *
3060 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
3061 * we are fine, too. */
3062 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3063 return 0;
3064 }
3065 if (r < 0)
3066 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3067
3068 return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, ownership_ok, data, size, left);
3069 }
3070
3071 struct load_cred_args {
3072 const ExecContext *context;
3073 const ExecParameters *params;
3074 bool encrypted;
3075 const char *unit;
3076 int dfd;
3077 uid_t uid;
3078 bool ownership_ok;
3079 uint64_t *left;
3080 };
3081
3082 static int load_cred_recurse_dir_cb(
3083 RecurseDirEvent event,
3084 const char *path,
3085 int dir_fd,
3086 int inode_fd,
3087 const struct dirent *de,
3088 const struct statx *sx,
3089 void *userdata) {
3090
3091 struct load_cred_args *args = ASSERT_PTR(userdata);
3092 _cleanup_free_ char *sub_id = NULL;
3093 int r;
3094
3095 if (event != RECURSE_DIR_ENTRY)
3096 return RECURSE_DIR_CONTINUE;
3097
3098 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
3099 return RECURSE_DIR_CONTINUE;
3100
3101 sub_id = strreplace(path, "/", "_");
3102 if (!sub_id)
3103 return -ENOMEM;
3104
3105 if (!credential_name_valid(sub_id))
3106 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3107
3108 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3109 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
3110 return RECURSE_DIR_CONTINUE;
3111 }
3112 if (errno != ENOENT)
3113 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3114
3115 r = load_credential(
3116 args->context,
3117 args->params,
3118 sub_id,
3119 de->d_name,
3120 args->encrypted,
3121 args->unit,
3122 dir_fd,
3123 args->dfd,
3124 args->uid,
3125 args->ownership_ok,
3126 args->left);
3127 if (r < 0)
3128 return r;
3129
3130 return RECURSE_DIR_CONTINUE;
3131 }
3132
3133 static int acquire_credentials(
3134 const ExecContext *context,
3135 const ExecParameters *params,
3136 const char *unit,
3137 const char *p,
3138 uid_t uid,
3139 bool ownership_ok) {
3140
3141 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
3142 _cleanup_close_ int dfd = -EBADF;
3143 const char *ic;
3144 ExecLoadCredential *lc;
3145 ExecSetCredential *sc;
3146 int r;
3147
3148 assert(context);
3149 assert(p);
3150
3151 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
3152 if (dfd < 0)
3153 return -errno;
3154
3155 r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
3156 if (r < 0)
3157 return r;
3158
3159 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
3160 HASHMAP_FOREACH(lc, context->load_credentials) {
3161 _cleanup_close_ int sub_fd = -EBADF;
3162
3163 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
3164 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
3165 * a regular file. Finally, if it's a relative path we will use it as a credential name to
3166 * propagate a credential passed to us from further up. */
3167
3168 if (path_is_absolute(lc->path)) {
3169 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
3170 if (sub_fd < 0 && !IN_SET(errno,
3171 ENOTDIR, /* Not a directory */
3172 ENOENT)) /* Doesn't exist? */
3173 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
3174 }
3175
3176 if (sub_fd < 0)
3177 /* Regular file (incl. a credential passed in from higher up) */
3178 r = load_credential(
3179 context,
3180 params,
3181 lc->id,
3182 lc->path,
3183 lc->encrypted,
3184 unit,
3185 AT_FDCWD,
3186 dfd,
3187 uid,
3188 ownership_ok,
3189 &left);
3190 else
3191 /* Directory */
3192 r = recurse_dir(
3193 sub_fd,
3194 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3195 /* statx_mask= */ 0,
3196 /* n_depth_max= */ UINT_MAX,
3197 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3198 load_cred_recurse_dir_cb,
3199 &(struct load_cred_args) {
3200 .context = context,
3201 .params = params,
3202 .encrypted = lc->encrypted,
3203 .unit = unit,
3204 .dfd = dfd,
3205 .uid = uid,
3206 .ownership_ok = ownership_ok,
3207 .left = &left,
3208 });
3209 if (r < 0)
3210 return r;
3211 }
3212
3213 /* Next, look for system credentials and credentials in the credentials store. Note that these do not
3214 * override any credentials found earlier. */
3215 SET_FOREACH(ic, context->import_credentials) {
3216 _cleanup_free_ char **search_path = NULL;
3217
3218 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
3219 if (!search_path)
3220 return -ENOMEM;
3221
3222 r = load_credential_glob(
3223 ic,
3224 /* encrypted = */ false,
3225 search_path,
3226 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
3227 dfd,
3228 uid,
3229 ownership_ok,
3230 &left);
3231 if (r < 0)
3232 return r;
3233
3234 search_path = strv_free(search_path);
3235 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
3236 if (!search_path)
3237 return -ENOMEM;
3238
3239 r = load_credential_glob(
3240 ic,
3241 /* encrypted = */ true,
3242 search_path,
3243 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
3244 dfd,
3245 uid,
3246 ownership_ok,
3247 &left);
3248 if (r < 0)
3249 return r;
3250 }
3251
3252 /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
3253 * add them, so that they can act as a "default" if the same credential is specified multiple times. */
3254 HASHMAP_FOREACH(sc, context->set_credentials) {
3255 _cleanup_(erase_and_freep) void *plaintext = NULL;
3256 const char *data;
3257 size_t size, add;
3258
3259 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
3260 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
3261 * slow and involved, hence it's nice to be able to skip that if the credential already
3262 * exists anyway. */
3263 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
3264 continue;
3265 if (errno != ENOENT)
3266 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
3267
3268 if (sc->encrypted) {
3269 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
3270 if (r < 0)
3271 return r;
3272
3273 data = plaintext;
3274 } else {
3275 data = sc->data;
3276 size = sc->size;
3277 }
3278
3279 add = strlen(sc->id) + size;
3280 if (add > left)
3281 return -E2BIG;
3282
3283 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3284 if (r < 0)
3285 return r;
3286
3287 left -= add;
3288 }
3289
3290 r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
3291 if (r < 0)
3292 return r;
3293
3294 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3295 * accessible */
3296
3297 if (uid_is_valid(uid) && uid != getuid()) {
3298 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
3299 if (r < 0) {
3300 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3301 return r;
3302
3303 if (!ownership_ok)
3304 return r;
3305
3306 if (fchown(dfd, uid, GID_INVALID) < 0)
3307 return -errno;
3308 }
3309 }
3310
3311 return 0;
3312 }
3313
3314 static int setup_credentials_internal(
3315 const ExecContext *context,
3316 const ExecParameters *params,
3317 const char *unit,
3318 const char *final, /* This is where the credential store shall eventually end up at */
3319 const char *workspace, /* This is where we can prepare it before moving it to the final place */
3320 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
3321 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3322 uid_t uid) {
3323
3324 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3325 * if we mounted something; false if we definitely can't mount anything */
3326 bool final_mounted;
3327 const char *where;
3328
3329 assert(context);
3330 assert(final);
3331 assert(workspace);
3332
3333 if (reuse_workspace) {
3334 r = path_is_mount_point(workspace, NULL, 0);
3335 if (r < 0)
3336 return r;
3337 if (r > 0)
3338 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3339 else
3340 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3341 } else
3342 workspace_mounted = -1; /* ditto */
3343
3344 r = path_is_mount_point(final, NULL, 0);
3345 if (r < 0)
3346 return r;
3347 if (r > 0) {
3348 /* If the final place already has something mounted, we use that. If the workspace also has
3349 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3350 * different). */
3351 final_mounted = true;
3352
3353 if (workspace_mounted < 0) {
3354 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3355 * the final version to the workspace, and make it writable, so that we can make
3356 * changes */
3357
3358 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3359 if (r < 0)
3360 return r;
3361
3362 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3363 if (r < 0)
3364 return r;
3365
3366 workspace_mounted = true;
3367 }
3368 } else
3369 final_mounted = false;
3370
3371 if (workspace_mounted < 0) {
3372 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3373
3374 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
3375 if (r < 0) {
3376 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
3377 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3378 if (r < 0) {
3379 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3380 return r;
3381
3382 if (must_mount) /* If we it's not OK to use the plain directory
3383 * fallback, propagate all errors too */
3384 return r;
3385
3386 /* If we lack privileges to bind mount stuff, then let's gracefully
3387 * proceed for compat with container envs, and just use the final dir
3388 * as is. */
3389
3390 workspace_mounted = false;
3391 } else {
3392 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
3393 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
3394 if (r < 0)
3395 return r;
3396
3397 workspace_mounted = true;
3398 }
3399 } else
3400 workspace_mounted = true;
3401 }
3402
3403 assert(!must_mount || workspace_mounted > 0);
3404 where = workspace_mounted ? workspace : final;
3405
3406 (void) label_fix_full(AT_FDCWD, where, final, 0);
3407
3408 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
3409 if (r < 0)
3410 return r;
3411
3412 if (workspace_mounted) {
3413 bool install;
3414
3415 /* Determine if we should actually install the prepared mount in the final location by bind
3416 * mounting it there. We do so only if the mount is not established there already, and if the
3417 * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
3418 * case we are doing all this in a mount namespace, thus no one else will see that we
3419 * allocated a file system we are getting rid of again here. */
3420 if (final_mounted)
3421 install = false; /* already installed */
3422 else {
3423 r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
3424 if (r < 0)
3425 return r;
3426
3427 install = r == 0; /* install only if non-empty */
3428 }
3429
3430 if (install) {
3431 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
3432 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
3433 if (r < 0)
3434 return r;
3435
3436 /* And mount it to the final place, read-only */
3437 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3438 } else
3439 /* Otherwise get rid of it */
3440 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3441 if (r < 0)
3442 return r;
3443 } else {
3444 _cleanup_free_ char *parent = NULL;
3445
3446 /* If we do not have our own mount put used the plain directory fallback, then we need to
3447 * open access to the top-level credential directory and the per-service directory now */
3448
3449 r = path_extract_directory(final, &parent);
3450 if (r < 0)
3451 return r;
3452 if (chmod(parent, 0755) < 0)
3453 return -errno;
3454 }
3455
3456 return 0;
3457 }
3458
3459 static int setup_credentials(
3460 const ExecContext *context,
3461 const ExecParameters *params,
3462 const char *unit,
3463 uid_t uid) {
3464
3465 _cleanup_free_ char *p = NULL, *q = NULL;
3466 int r;
3467
3468 assert(context);
3469 assert(params);
3470
3471 if (!exec_context_has_credentials(context))
3472 return 0;
3473
3474 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3475 return -EINVAL;
3476
3477 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3478 * and the subdir we mount over with a read-only file system readable by the service's user */
3479 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3480 if (!q)
3481 return -ENOMEM;
3482
3483 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3484 if (r < 0 && r != -EEXIST)
3485 return r;
3486
3487 p = path_join(q, unit);
3488 if (!p)
3489 return -ENOMEM;
3490
3491 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3492 if (r < 0 && r != -EEXIST)
3493 return r;
3494
3495 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3496 if (r < 0) {
3497 _cleanup_free_ char *t = NULL, *u = NULL;
3498
3499 /* If this is not a privilege or support issue then propagate the error */
3500 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3501 return r;
3502
3503 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3504 * it into place, so that users can't access half-initialized credential stores. */
3505 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3506 if (!t)
3507 return -ENOMEM;
3508
3509 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3510 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3511 * after it is fully set up */
3512 u = path_join(t, unit);
3513 if (!u)
3514 return -ENOMEM;
3515
3516 FOREACH_STRING(i, t, u) {
3517 r = mkdir_label(i, 0700);
3518 if (r < 0 && r != -EEXIST)
3519 return r;
3520 }
3521
3522 r = setup_credentials_internal(
3523 context,
3524 params,
3525 unit,
3526 p, /* final mount point */
3527 u, /* temporary workspace to overmount */
3528 true, /* reuse the workspace if it is already a mount */
3529 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3530 uid);
3531
3532 (void) rmdir(u); /* remove the workspace again if we can. */
3533
3534 if (r < 0)
3535 return r;
3536
3537 } else if (r == 0) {
3538
3539 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3540 * we can use the same directory for all cases, after turning off propagation. Question
3541 * though is: where do we turn off propagation exactly, and where do we place the workspace
3542 * directory? We need some place that is guaranteed to be a mount point in the host, and
3543 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3544 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3545 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3546 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3547 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3548 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3549 * propagation on the former, and then overmount the latter.
3550 *
3551 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3552 * for this purpose, but there are few other candidates that work equally well for us, and
3553 * given that the we do this in a privately namespaced short-lived single-threaded process
3554 * that no one else sees this should be OK to do. */
3555
3556 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3557 if (r < 0)
3558 goto child_fail;
3559
3560 r = setup_credentials_internal(
3561 context,
3562 params,
3563 unit,
3564 p, /* final mount point */
3565 "/dev/shm", /* temporary workspace to overmount */
3566 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3567 true, /* insist that something is mounted, do not allow fallback to plain directory */
3568 uid);
3569 if (r < 0)
3570 goto child_fail;
3571
3572 _exit(EXIT_SUCCESS);
3573
3574 child_fail:
3575 _exit(EXIT_FAILURE);
3576 }
3577
3578 /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
3579 * try to remove it. This matters in particular if we created the dir as mount point but then didn't
3580 * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
3581 * seen by users when trying access this inode. */
3582 (void) rmdir(p);
3583 return 0;
3584 }
3585
3586 #if ENABLE_SMACK
3587 static int setup_smack(
3588 const Manager *manager,
3589 const ExecContext *context,
3590 int executable_fd) {
3591 int r;
3592
3593 assert(context);
3594 assert(executable_fd >= 0);
3595
3596 if (context->smack_process_label) {
3597 r = mac_smack_apply_pid(0, context->smack_process_label);
3598 if (r < 0)
3599 return r;
3600 } else if (manager->default_smack_process_label) {
3601 _cleanup_free_ char *exec_label = NULL;
3602
3603 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
3604 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
3605 return r;
3606
3607 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
3608 if (r < 0)
3609 return r;
3610 }
3611
3612 return 0;
3613 }
3614 #endif
3615
3616 static int compile_bind_mounts(
3617 const ExecContext *context,
3618 const ExecParameters *params,
3619 BindMount **ret_bind_mounts,
3620 size_t *ret_n_bind_mounts,
3621 char ***ret_empty_directories) {
3622
3623 _cleanup_strv_free_ char **empty_directories = NULL;
3624 BindMount *bind_mounts = NULL;
3625 size_t n, h = 0;
3626 int r;
3627
3628 assert(context);
3629 assert(params);
3630 assert(ret_bind_mounts);
3631 assert(ret_n_bind_mounts);
3632 assert(ret_empty_directories);
3633
3634 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
3635
3636 n = context->n_bind_mounts;
3637 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3638 if (!params->prefix[t])
3639 continue;
3640
3641 for (size_t i = 0; i < context->directories[t].n_items; i++)
3642 n += !context->directories[t].items[i].only_create;
3643 }
3644
3645 if (n <= 0) {
3646 *ret_bind_mounts = NULL;
3647 *ret_n_bind_mounts = 0;
3648 *ret_empty_directories = NULL;
3649 return 0;
3650 }
3651
3652 bind_mounts = new(BindMount, n);
3653 if (!bind_mounts)
3654 return -ENOMEM;
3655
3656 for (size_t i = 0; i < context->n_bind_mounts; i++) {
3657 BindMount *item = context->bind_mounts + i;
3658 _cleanup_free_ char *s = NULL, *d = NULL;
3659
3660 s = strdup(item->source);
3661 if (!s)
3662 return -ENOMEM;
3663
3664 d = strdup(item->destination);
3665 if (!d)
3666 return -ENOMEM;
3667
3668 bind_mounts[h++] = (BindMount) {
3669 .source = TAKE_PTR(s),
3670 .destination = TAKE_PTR(d),
3671 .read_only = item->read_only,
3672 .recursive = item->recursive,
3673 .ignore_enoent = item->ignore_enoent,
3674 };
3675 }
3676
3677 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
3678 if (!params->prefix[t])
3679 continue;
3680
3681 if (context->directories[t].n_items == 0)
3682 continue;
3683
3684 if (exec_directory_is_private(context, t) &&
3685 !exec_context_with_rootfs(context)) {
3686 char *private_root;
3687
3688 /* So this is for a dynamic user, and we need to make sure the process can access its own
3689 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3690 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3691
3692 private_root = path_join(params->prefix[t], "private");
3693 if (!private_root)
3694 return -ENOMEM;
3695
3696 r = strv_consume(&empty_directories, private_root);
3697 if (r < 0)
3698 return r;
3699 }
3700
3701 for (size_t i = 0; i < context->directories[t].n_items; i++) {
3702 _cleanup_free_ char *s = NULL, *d = NULL;
3703
3704 /* When one of the parent directories is in the list, we cannot create the symlink
3705 * for the child directory. See also the comments in setup_exec_directory(). */
3706 if (context->directories[t].items[i].only_create)
3707 continue;
3708
3709 if (exec_directory_is_private(context, t))
3710 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
3711 else
3712 s = path_join(params->prefix[t], context->directories[t].items[i].path);
3713 if (!s)
3714 return -ENOMEM;
3715
3716 if (exec_directory_is_private(context, t) &&
3717 exec_context_with_rootfs(context))
3718 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3719 * directory is not created on the root directory. So, let's bind-mount the directory
3720 * on the 'non-private' place. */
3721 d = path_join(params->prefix[t], context->directories[t].items[i].path);
3722 else
3723 d = strdup(s);
3724 if (!d)
3725 return -ENOMEM;
3726
3727 bind_mounts[h++] = (BindMount) {
3728 .source = TAKE_PTR(s),
3729 .destination = TAKE_PTR(d),
3730 .read_only = false,
3731 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
3732 .recursive = true,
3733 .ignore_enoent = false,
3734 };
3735 }
3736 }
3737
3738 assert(h == n);
3739
3740 *ret_bind_mounts = TAKE_PTR(bind_mounts);
3741 *ret_n_bind_mounts = n;
3742 *ret_empty_directories = TAKE_PTR(empty_directories);
3743
3744 return (int) n;
3745 }
3746
3747 /* ret_symlinks will contain a list of pairs src:dest that describes
3748 * the symlinks to create later on. For example, the symlinks needed
3749 * to safely give private directories to DynamicUser=1 users. */
3750 static int compile_symlinks(
3751 const ExecContext *context,
3752 const ExecParameters *params,
3753 char ***ret_symlinks) {
3754
3755 _cleanup_strv_free_ char **symlinks = NULL;
3756 int r;
3757
3758 assert(context);
3759 assert(params);
3760 assert(ret_symlinks);
3761
3762 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3763 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3764 _cleanup_free_ char *private_path = NULL, *path = NULL;
3765
3766 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3767 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
3768
3769 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3770 dst_abs = path_join(params->prefix[dt], *symlink);
3771 if (!src_abs || !dst_abs)
3772 return -ENOMEM;
3773
3774 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3775 if (r < 0)
3776 return r;
3777 }
3778
3779 if (!exec_directory_is_private(context, dt) ||
3780 exec_context_with_rootfs(context) ||
3781 context->directories[dt].items[i].only_create)
3782 continue;
3783
3784 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
3785 if (!private_path)
3786 return -ENOMEM;
3787
3788 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3789 if (!path)
3790 return -ENOMEM;
3791
3792 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3793 if (r < 0)
3794 return r;
3795 }
3796 }
3797
3798 *ret_symlinks = TAKE_PTR(symlinks);
3799
3800 return 0;
3801 }
3802
3803 static bool insist_on_sandboxing(
3804 const ExecContext *context,
3805 const char *root_dir,
3806 const char *root_image,
3807 const BindMount *bind_mounts,
3808 size_t n_bind_mounts) {
3809
3810 assert(context);
3811 assert(n_bind_mounts == 0 || bind_mounts);
3812
3813 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
3814 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
3815 * rearrange stuff in a way we cannot ignore gracefully. */
3816
3817 if (context->n_temporary_filesystems > 0)
3818 return true;
3819
3820 if (root_dir || root_image)
3821 return true;
3822
3823 if (context->n_mount_images > 0)
3824 return true;
3825
3826 if (context->dynamic_user)
3827 return true;
3828
3829 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3830 return true;
3831
3832 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3833 * essential. */
3834 for (size_t i = 0; i < n_bind_mounts; i++)
3835 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3836 return true;
3837
3838 if (context->log_namespace)
3839 return true;
3840
3841 return false;
3842 }
3843
3844 static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3845 _cleanup_close_ int fd = -EBADF;
3846 int r;
3847
3848 if (!runtime || !runtime->ephemeral_copy)
3849 return 0;
3850
3851 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3852 if (r < 0)
3853 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3854
3855 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3856
3857 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3858 if (fd >= 0)
3859 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3860 return 0;
3861
3862 if (fd != -EAGAIN)
3863 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3864
3865 log_debug("Making ephemeral snapshot of %s to %s",
3866 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3867
3868 if (context->root_image)
3869 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3870 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3871 else
3872 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3873 AT_FDCWD, runtime->ephemeral_copy,
3874 BTRFS_SNAPSHOT_FALLBACK_COPY |
3875 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3876 BTRFS_SNAPSHOT_RECURSIVE |
3877 BTRFS_SNAPSHOT_LOCK_BSD);
3878 if (fd < 0)
3879 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3880 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3881
3882 if (context->root_image) {
3883 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3884 * which tends to not perform well in combination with lots of random writes.
3885 *
3886 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3887 * copy, but we at least want to make the intention clear.
3888 */
3889 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3890 if (r < 0)
3891 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3892 }
3893
3894 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3895 if (r < 0)
3896 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3897
3898 return 1;
3899 }
3900
3901 static int verity_settings_prepare(
3902 VeritySettings *verity,
3903 const char *root_image,
3904 const void *root_hash,
3905 size_t root_hash_size,
3906 const char *root_hash_path,
3907 const void *root_hash_sig,
3908 size_t root_hash_sig_size,
3909 const char *root_hash_sig_path,
3910 const char *verity_data_path) {
3911
3912 int r;
3913
3914 assert(verity);
3915
3916 if (root_hash) {
3917 void *d;
3918
3919 d = memdup(root_hash, root_hash_size);
3920 if (!d)
3921 return -ENOMEM;
3922
3923 free_and_replace(verity->root_hash, d);
3924 verity->root_hash_size = root_hash_size;
3925 verity->designator = PARTITION_ROOT;
3926 }
3927
3928 if (root_hash_sig) {
3929 void *d;
3930
3931 d = memdup(root_hash_sig, root_hash_sig_size);
3932 if (!d)
3933 return -ENOMEM;
3934
3935 free_and_replace(verity->root_hash_sig, d);
3936 verity->root_hash_sig_size = root_hash_sig_size;
3937 verity->designator = PARTITION_ROOT;
3938 }
3939
3940 if (verity_data_path) {
3941 r = free_and_strdup(&verity->data_path, verity_data_path);
3942 if (r < 0)
3943 return r;
3944 }
3945
3946 r = verity_settings_load(
3947 verity,
3948 root_image,
3949 root_hash_path,
3950 root_hash_sig_path);
3951 if (r < 0)
3952 return log_debug_errno(r, "Failed to load root hash: %m");
3953
3954 return 0;
3955 }
3956
3957 static int apply_mount_namespace(
3958 const Unit *u,
3959 ExecCommandFlags command_flags,
3960 const ExecContext *context,
3961 const ExecParameters *params,
3962 ExecRuntime *runtime,
3963 const char *memory_pressure_path,
3964 char **error_path) {
3965
3966 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
3967 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3968 **read_write_paths_cleanup = NULL;
3969 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3970 *extension_dir = NULL, *host_os_release = NULL;
3971 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
3972 char **read_write_paths;
3973 NamespaceInfo ns_info;
3974 bool needs_sandboxing;
3975 BindMount *bind_mounts = NULL;
3976 size_t n_bind_mounts = 0;
3977 int r;
3978
3979 assert(context);
3980
3981 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3982
3983 if (params->flags & EXEC_APPLY_CHROOT) {
3984 r = setup_ephemeral(context, runtime);
3985 if (r < 0)
3986 return r;
3987
3988 if (context->root_image)
3989 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3990 else
3991 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
3992 }
3993
3994 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3995 if (r < 0)
3996 return r;
3997
3998 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
3999 r = compile_symlinks(context, params, &symlinks);
4000 if (r < 0)
4001 return r;
4002
4003 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
4004 * service will need to write to it in order to start the notifications. */
4005 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
4006 read_write_paths_cleanup = strv_copy(context->read_write_paths);
4007 if (!read_write_paths_cleanup)
4008 return -ENOMEM;
4009
4010 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
4011 if (r < 0)
4012 return r;
4013
4014 read_write_paths = read_write_paths_cleanup;
4015 } else
4016 read_write_paths = context->read_write_paths;
4017
4018 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
4019 if (needs_sandboxing) {
4020 /* The runtime struct only contains the parent of the private /tmp,
4021 * which is non-accessible to world users. Inside of it there's a /tmp
4022 * that is sticky, and that's the one we want to use here.
4023 * This does not apply when we are using /run/systemd/empty as fallback. */
4024
4025 if (context->private_tmp && runtime && runtime->shared) {
4026 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
4027 tmp_dir = runtime->shared->tmp_dir;
4028 else if (runtime->shared->tmp_dir)
4029 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
4030
4031 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
4032 var_tmp_dir = runtime->shared->var_tmp_dir;
4033 else if (runtime->shared->var_tmp_dir)
4034 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
4035 }
4036
4037 ns_info = (NamespaceInfo) {
4038 .ignore_protect_paths = false,
4039 .private_dev = context->private_devices,
4040 .protect_control_groups = context->protect_control_groups,
4041 .protect_kernel_tunables = context->protect_kernel_tunables,
4042 .protect_kernel_modules = context->protect_kernel_modules,
4043 .protect_kernel_logs = context->protect_kernel_logs,
4044 .protect_hostname = context->protect_hostname,
4045 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
4046 .protect_home = context->protect_home,
4047 .protect_system = context->protect_system,
4048 .protect_proc = context->protect_proc,
4049 .proc_subset = context->proc_subset,
4050 .private_network = exec_needs_network_namespace(context),
4051 .private_ipc = exec_needs_ipc_namespace(context),
4052 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
4053 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
4054 };
4055 } else if (!context->dynamic_user && root_dir)
4056 /*
4057 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
4058 * sandbox info, otherwise enforce it, don't ignore protected paths and
4059 * fail if we are enable to apply the sandbox inside the mount namespace.
4060 */
4061 ns_info = (NamespaceInfo) {
4062 .ignore_protect_paths = true,
4063 };
4064 else
4065 ns_info = (NamespaceInfo) {};
4066
4067 if (context->mount_propagation_flag == MS_SHARED)
4068 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
4069
4070 if (exec_context_has_credentials(context) &&
4071 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
4072 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4073 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
4074 if (!creds_path)
4075 return -ENOMEM;
4076 }
4077
4078 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
4079 propagate_dir = path_join("/run/systemd/propagate/", u->id);
4080 if (!propagate_dir)
4081 return -ENOMEM;
4082
4083 incoming_dir = strdup("/run/systemd/incoming");
4084 if (!incoming_dir)
4085 return -ENOMEM;
4086
4087 extension_dir = strdup("/run/systemd/unit-extensions");
4088 if (!extension_dir)
4089 return -ENOMEM;
4090
4091 /* If running under a different root filesystem, propagate the host's os-release. We make a
4092 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
4093 if (root_dir || root_image) {
4094 host_os_release = strdup("/run/systemd/propagate/os-release");
4095 if (!host_os_release)
4096 return -ENOMEM;
4097 }
4098 } else {
4099 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
4100
4101 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
4102 return -ENOMEM;
4103
4104 if (root_dir || root_image) {
4105 if (asprintf(&host_os_release, "/run/user/" UID_FMT "/systemd/propagate/os-release", geteuid()) < 0)
4106 return -ENOMEM;
4107 }
4108 }
4109
4110 if (root_image) {
4111 r = verity_settings_prepare(
4112 &verity,
4113 root_image,
4114 context->root_hash, context->root_hash_size, context->root_hash_path,
4115 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
4116 context->root_verity);
4117 if (r < 0)
4118 return r;
4119 }
4120
4121 r = setup_namespace(
4122 root_dir,
4123 root_image,
4124 context->root_image_options,
4125 context->root_image_policy ?: &image_policy_service,
4126 &ns_info,
4127 read_write_paths,
4128 needs_sandboxing ? context->read_only_paths : NULL,
4129 needs_sandboxing ? context->inaccessible_paths : NULL,
4130 needs_sandboxing ? context->exec_paths : NULL,
4131 needs_sandboxing ? context->no_exec_paths : NULL,
4132 empty_directories,
4133 symlinks,
4134 bind_mounts,
4135 n_bind_mounts,
4136 context->temporary_filesystems,
4137 context->n_temporary_filesystems,
4138 context->mount_images,
4139 context->n_mount_images,
4140 context->mount_image_policy ?: &image_policy_service,
4141 tmp_dir,
4142 var_tmp_dir,
4143 creds_path,
4144 context->log_namespace,
4145 context->mount_propagation_flag,
4146 &verity,
4147 context->extension_images,
4148 context->n_extension_images,
4149 context->extension_image_policy ?: &image_policy_sysext,
4150 context->extension_directories,
4151 propagate_dir,
4152 incoming_dir,
4153 extension_dir,
4154 root_dir || root_image ? params->notify_socket : NULL,
4155 host_os_release,
4156 error_path);
4157
4158 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
4159 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
4160 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
4161 * completely different execution environment. */
4162 if (r == -ENOANO) {
4163 if (insist_on_sandboxing(
4164 context,
4165 root_dir, root_image,
4166 bind_mounts,
4167 n_bind_mounts))
4168 return log_unit_debug_errno(u,
4169 SYNTHETIC_ERRNO(EOPNOTSUPP),
4170 "Failed to set up namespace, and refusing to continue since "
4171 "the selected namespacing options alter mount environment non-trivially.\n"
4172 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
4173 n_bind_mounts,
4174 context->n_temporary_filesystems,
4175 yes_no(root_dir),
4176 yes_no(root_image),
4177 yes_no(context->dynamic_user));
4178
4179 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4180 return 0;
4181 }
4182
4183 return r;
4184 }
4185
4186 static int apply_working_directory(
4187 const ExecContext *context,
4188 const ExecParameters *params,
4189 ExecRuntime *runtime,
4190 const char *home,
4191 int *exit_status) {
4192
4193 const char *d, *wd;
4194
4195 assert(context);
4196 assert(exit_status);
4197
4198 if (context->working_directory_home) {
4199
4200 if (!home) {
4201 *exit_status = EXIT_CHDIR;
4202 return -ENXIO;
4203 }
4204
4205 wd = home;
4206
4207 } else
4208 wd = empty_to_root(context->working_directory);
4209
4210 if (params->flags & EXEC_APPLY_CHROOT)
4211 d = wd;
4212 else
4213 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
4214
4215 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
4216 *exit_status = EXIT_CHDIR;
4217 return -errno;
4218 }
4219
4220 return 0;
4221 }
4222
4223 static int apply_root_directory(
4224 const ExecContext *context,
4225 const ExecParameters *params,
4226 ExecRuntime *runtime,
4227 const bool needs_mount_ns,
4228 int *exit_status) {
4229
4230 assert(context);
4231 assert(exit_status);
4232
4233 if (params->flags & EXEC_APPLY_CHROOT)
4234 if (!needs_mount_ns && context->root_directory)
4235 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
4236 *exit_status = EXIT_CHROOT;
4237 return -errno;
4238 }
4239
4240 return 0;
4241 }
4242
4243 static int setup_keyring(
4244 const Unit *u,
4245 const ExecContext *context,
4246 const ExecParameters *p,
4247 uid_t uid, gid_t gid) {
4248
4249 key_serial_t keyring;
4250 int r = 0;
4251 uid_t saved_uid;
4252 gid_t saved_gid;
4253
4254 assert(u);
4255 assert(context);
4256 assert(p);
4257
4258 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
4259 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
4260 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
4261 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
4262 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
4263 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
4264
4265 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
4266 return 0;
4267
4268 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
4269 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
4270 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
4271 * & group is just as nasty as acquiring a reference to the user keyring. */
4272
4273 saved_uid = getuid();
4274 saved_gid = getgid();
4275
4276 if (gid_is_valid(gid) && gid != saved_gid) {
4277 if (setregid(gid, -1) < 0)
4278 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
4279 }
4280
4281 if (uid_is_valid(uid) && uid != saved_uid) {
4282 if (setreuid(uid, -1) < 0) {
4283 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
4284 goto out;
4285 }
4286 }
4287
4288 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
4289 if (keyring == -1) {
4290 if (errno == ENOSYS)
4291 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
4292 else if (ERRNO_IS_PRIVILEGE(errno))
4293 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
4294 else if (errno == EDQUOT)
4295 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
4296 else
4297 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
4298
4299 goto out;
4300 }
4301
4302 /* When requested link the user keyring into the session keyring. */
4303 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
4304
4305 if (keyctl(KEYCTL_LINK,
4306 KEY_SPEC_USER_KEYRING,
4307 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
4308 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
4309 goto out;
4310 }
4311 }
4312
4313 /* Restore uid/gid back */
4314 if (uid_is_valid(uid) && uid != saved_uid) {
4315 if (setreuid(saved_uid, -1) < 0) {
4316 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
4317 goto out;
4318 }
4319 }
4320
4321 if (gid_is_valid(gid) && gid != saved_gid) {
4322 if (setregid(saved_gid, -1) < 0)
4323 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
4324 }
4325
4326 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
4327 if (!sd_id128_is_null(u->invocation_id)) {
4328 key_serial_t key;
4329
4330 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
4331 if (key == -1)
4332 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
4333 else {
4334 if (keyctl(KEYCTL_SETPERM, key,
4335 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
4336 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
4337 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
4338 }
4339 }
4340
4341 out:
4342 /* Revert back uid & gid for the last time, and exit */
4343 /* no extra logging, as only the first already reported error matters */
4344 if (getuid() != saved_uid)
4345 (void) setreuid(saved_uid, -1);
4346
4347 if (getgid() != saved_gid)
4348 (void) setregid(saved_gid, -1);
4349
4350 return r;
4351 }
4352
/* Appends each non-negative fd of the two-element 'pair' to 'array', advancing the element counter *n for
 * every fd stored. The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                *n += 1;
        }
}
4363
4364 static int close_remaining_fds(
4365 const ExecParameters *params,
4366 const ExecRuntime *runtime,
4367 int user_lookup_fd,
4368 int socket_fd,
4369 const int *fds, size_t n_fds) {
4370
4371 size_t n_dont_close = 0;
4372 int dont_close[n_fds + 14];
4373
4374 assert(params);
4375
4376 if (params->stdin_fd >= 0)
4377 dont_close[n_dont_close++] = params->stdin_fd;
4378 if (params->stdout_fd >= 0)
4379 dont_close[n_dont_close++] = params->stdout_fd;
4380 if (params->stderr_fd >= 0)
4381 dont_close[n_dont_close++] = params->stderr_fd;
4382
4383 if (socket_fd >= 0)
4384 dont_close[n_dont_close++] = socket_fd;
4385 if (n_fds > 0) {
4386 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
4387 n_dont_close += n_fds;
4388 }
4389
4390 if (runtime)
4391 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
4392
4393 if (runtime && runtime->shared) {
4394 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
4395 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
4396 }
4397
4398 if (runtime && runtime->dynamic_creds) {
4399 if (runtime->dynamic_creds->user)
4400 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
4401 if (runtime->dynamic_creds->group)
4402 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
4403 }
4404
4405 if (user_lookup_fd >= 0)
4406 dont_close[n_dont_close++] = user_lookup_fd;
4407
4408 return close_all_fds(dont_close, n_dont_close);
4409 }
4410
4411 static int send_user_lookup(
4412 Unit *unit,
4413 int user_lookup_fd,
4414 uid_t uid,
4415 gid_t gid) {
4416
4417 assert(unit);
4418
4419 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4420 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4421 * specified. */
4422
4423 if (user_lookup_fd < 0)
4424 return 0;
4425
4426 if (!uid_is_valid(uid) && !gid_is_valid(gid))
4427 return 0;
4428
4429 if (writev(user_lookup_fd,
4430 (struct iovec[]) {
4431 IOVEC_MAKE(&uid, sizeof(uid)),
4432 IOVEC_MAKE(&gid, sizeof(gid)),
4433 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
4434 return -errno;
4435
4436 return 0;
4437 }
4438
4439 static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4440 int r;
4441
4442 assert(c);
4443 assert(home);
4444 assert(buf);
4445
4446 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4447
4448 if (*home)
4449 return 0;
4450
4451 if (!c->working_directory_home)
4452 return 0;
4453
4454 r = get_home_dir(buf);
4455 if (r < 0)
4456 return r;
4457
4458 *home = *buf;
4459 return 1;
4460 }
4461
4462 static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4463 _cleanup_strv_free_ char ** list = NULL;
4464 int r;
4465
4466 assert(c);
4467 assert(p);
4468 assert(ret);
4469
4470 assert(c->dynamic_user);
4471
4472 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4473 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4474 * directories. */
4475
4476 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4477 if (t == EXEC_DIRECTORY_CONFIGURATION)
4478 continue;
4479
4480 if (!p->prefix[t])
4481 continue;
4482
4483 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4484 char *e;
4485
4486 if (exec_directory_is_private(c, t))
4487 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
4488 else
4489 e = path_join(p->prefix[t], c->directories[t].items[i].path);
4490 if (!e)
4491 return -ENOMEM;
4492
4493 r = strv_consume(&list, e);
4494 if (r < 0)
4495 return r;
4496 }
4497 }
4498
4499 *ret = TAKE_PTR(list);
4500
4501 return 0;
4502 }
4503
4504 static int exec_parameters_get_cgroup_path(
4505 const ExecParameters *params,
4506 const CGroupContext *c,
4507 char **ret) {
4508
4509 const char *subgroup = NULL;
4510 char *p;
4511
4512 assert(params);
4513 assert(ret);
4514
4515 if (!params->cgroup_path)
4516 return -EINVAL;
4517
4518 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4519 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4520 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4521 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4522 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4523 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4524 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4525 * flag, which is only passed for the former statements, not for the latter. */
4526
4527 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
4528 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
4529 subgroup = ".control";
4530 else
4531 subgroup = c->delegate_subgroup;
4532 }
4533
4534 if (subgroup)
4535 p = path_join(params->cgroup_path, subgroup);
4536 else
4537 p = strdup(params->cgroup_path);
4538 if (!p)
4539 return -ENOMEM;
4540
4541 *ret = p;
4542 return !!subgroup;
4543 }
4544
4545 static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4546 _cleanup_(cpu_set_reset) CPUSet s = {};
4547 int r;
4548
4549 assert(c);
4550 assert(ret);
4551
4552 if (!c->numa_policy.nodes.set) {
4553 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4554 return 0;
4555 }
4556
4557 r = numa_to_cpu_set(&c->numa_policy, &s);
4558 if (r < 0)
4559 return r;
4560
4561 cpu_set_reset(ret);
4562
4563 return cpu_set_add_all(ret, &s);
4564 }
4565
4566 bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4567 assert(c);
4568
4569 return c->cpu_affinity_from_numa;
4570 }
4571
4572 static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4573 int r;
4574
4575 assert(fds);
4576 assert(n_fds);
4577 assert(*n_fds < fds_size);
4578 assert(ret_fd);
4579
4580 if (fd < 0) {
4581 *ret_fd = -EBADF;
4582 return 0;
4583 }
4584
4585 if (fd < 3 + (int) *n_fds) {
4586 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4587 * the fds we pass to the process (or which are closed only during execve). */
4588
4589 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4590 if (r < 0)
4591 return -errno;
4592
4593 close_and_replace(fd, r);
4594 }
4595
4596 *ret_fd = fds[*n_fds] = fd;
4597 (*n_fds) ++;
4598 return 1;
4599 }
4600
4601 static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4602 union sockaddr_union addr = {
4603 .un.sun_family = AF_UNIX,
4604 };
4605 socklen_t sa_len;
4606 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4607 int r;
4608
4609 assert(u);
4610 assert(of);
4611 assert(ofd >= 0);
4612
4613 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4614 if (r < 0)
4615 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4616
4617 sa_len = r;
4618
4619 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4620 _cleanup_close_ int fd = -EBADF;
4621
4622 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4623 if (fd < 0)
4624 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4625
4626 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4627 if (r == -EPROTOTYPE)
4628 continue;
4629 if (r < 0)
4630 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4631
4632 return TAKE_FD(fd);
4633 }
4634
4635 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4636 }
4637
4638 static int get_open_file_fd(Unit *u, const OpenFile *of) {
4639 struct stat st;
4640 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4641
4642 assert(u);
4643 assert(of);
4644
4645 ofd = open(of->path, O_PATH | O_CLOEXEC);
4646 if (ofd < 0)
4647 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4648
4649 if (fstat(ofd, &st) < 0)
4650 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
4651
4652 if (S_ISSOCK(st.st_mode)) {
4653 fd = connect_unix_harder(u, of, ofd);
4654 if (fd < 0)
4655 return fd;
4656
4657 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4658 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4659 of->path);
4660
4661 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4662 } else {
4663 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4664 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4665 flags |= O_APPEND;
4666 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4667 flags |= O_TRUNC;
4668
4669 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4670 if (fd < 0)
4671 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4672
4673 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4674 }
4675
4676 return TAKE_FD(fd);
4677 }
4678
4679 static int collect_open_file_fds(
4680 Unit *u,
4681 OpenFile* open_files,
4682 int **fds,
4683 char ***fdnames,
4684 size_t *n_fds) {
4685 int r;
4686
4687 assert(u);
4688 assert(fds);
4689 assert(fdnames);
4690 assert(n_fds);
4691
4692 LIST_FOREACH(open_files, of, open_files) {
4693 _cleanup_close_ int fd = -EBADF;
4694
4695 fd = get_open_file_fd(u, of);
4696 if (fd < 0) {
4697 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4698 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4699 continue;
4700 }
4701
4702 return fd;
4703 }
4704
4705 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4706 return -ENOMEM;
4707
4708 r = strv_extend(fdnames, of->fdname);
4709 if (r < 0)
4710 return r;
4711
4712 (*fds)[*n_fds] = TAKE_FD(fd);
4713
4714 (*n_fds)++;
4715 }
4716
4717 return 0;
4718 }
4719
4720 static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
4721 assert(unit);
4722 assert(msg);
4723 assert(executable);
4724
4725 if (!DEBUG_LOGGING)
4726 return;
4727
4728 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
4729
4730 log_unit_struct(unit, LOG_DEBUG,
4731 "EXECUTABLE=%s", executable,
4732 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
4733 LOG_UNIT_INVOCATION_ID(unit));
4734 }
4735
4736 static bool exec_context_need_unprivileged_private_users(
4737 const ExecContext *context,
4738 const ExecParameters *params) {
4739
4740 assert(context);
4741 assert(params);
4742
4743 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
4744 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
4745 * (system manager) then we have privileges and don't need this. */
4746 if (params->runtime_scope != RUNTIME_SCOPE_USER)
4747 return false;
4748
4749 return context->private_users ||
4750 context->private_tmp ||
4751 context->private_devices ||
4752 context->private_network ||
4753 context->network_namespace_path ||
4754 context->private_ipc ||
4755 context->ipc_namespace_path ||
4756 context->private_mounts > 0 ||
4757 context->mount_apivfs ||
4758 context->n_bind_mounts > 0 ||
4759 context->n_temporary_filesystems > 0 ||
4760 context->root_directory ||
4761 !strv_isempty(context->extension_directories) ||
4762 context->protect_system != PROTECT_SYSTEM_NO ||
4763 context->protect_home != PROTECT_HOME_NO ||
4764 context->protect_kernel_tunables ||
4765 context->protect_kernel_modules ||
4766 context->protect_kernel_logs ||
4767 context->protect_control_groups ||
4768 context->protect_clock ||
4769 context->protect_hostname ||
4770 !strv_isempty(context->read_write_paths) ||
4771 !strv_isempty(context->read_only_paths) ||
4772 !strv_isempty(context->inaccessible_paths) ||
4773 !strv_isempty(context->exec_paths) ||
4774 !strv_isempty(context->no_exec_paths);
4775 }
4776
4777 static int exec_child(
4778 Unit *unit,
4779 const ExecCommand *command,
4780 const ExecContext *context,
4781 const ExecParameters *params,
4782 ExecRuntime *runtime,
4783 const CGroupContext *cgroup_context,
4784 int socket_fd,
4785 const int named_iofds[static 3],
4786 int *params_fds,
4787 size_t n_socket_fds,
4788 size_t n_storage_fds,
4789 char **files_env,
4790 int user_lookup_fd,
4791 int *exit_status) {
4792
4793 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
4794 int r, ngids = 0, exec_fd;
4795 _cleanup_free_ gid_t *supplementary_gids = NULL;
4796 const char *username = NULL, *groupname = NULL;
4797 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
4798 const char *home = NULL, *shell = NULL;
4799 char **final_argv = NULL;
4800 dev_t journal_stream_dev = 0;
4801 ino_t journal_stream_ino = 0;
4802 bool userns_set_up = false;
4803 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4804 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4805 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4806 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
4807 #if HAVE_SELINUX
4808 _cleanup_free_ char *mac_selinux_context_net = NULL;
4809 bool use_selinux = false;
4810 #endif
4811 #if ENABLE_SMACK
4812 bool use_smack = false;
4813 #endif
4814 #if HAVE_APPARMOR
4815 bool use_apparmor = false;
4816 #endif
4817 uid_t saved_uid = getuid();
4818 gid_t saved_gid = getgid();
4819 uid_t uid = UID_INVALID;
4820 gid_t gid = GID_INVALID;
4821 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4822 n_keep_fds; /* total number of fds not to close */
4823 int secure_bits;
4824 _cleanup_free_ gid_t *gids_after_pam = NULL;
4825 int ngids_after_pam = 0;
4826 _cleanup_free_ int *fds = NULL;
4827 _cleanup_strv_free_ char **fdnames = NULL;
4828
4829 assert(unit);
4830 assert(command);
4831 assert(context);
4832 assert(params);
4833 assert(exit_status);
4834
4835 /* Explicitly test for CVE-2021-4034 inspired invocations */
4836 assert(command->path);
4837 assert(!strv_isempty(command->argv));
4838
4839 rename_process_from_path(command->path);
4840
4841 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4842 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4843 * both of which will be demoted to SIG_DFL. */
4844 (void) default_signals(SIGNALS_CRASH_HANDLER,
4845 SIGNALS_IGNORE);
4846
4847 if (context->ignore_sigpipe)
4848 (void) ignore_signals(SIGPIPE);
4849
4850 r = reset_signal_mask();
4851 if (r < 0) {
4852 *exit_status = EXIT_SIGNAL_MASK;
4853 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
4854 }
4855
4856 if (params->idle_pipe)
4857 do_idle_pipe_dance(params->idle_pipe);
4858
4859 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4860 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4861 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4862 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
4863
4864 log_forget_fds();
4865 log_set_open_when_needed(true);
4866 log_settle_target();
4867
4868 /* In case anything used libc syslog(), close this here, too */
4869 closelog();
4870
4871 fds = newdup(int, params_fds, n_fds);
4872 if (!fds) {
4873 *exit_status = EXIT_MEMORY;
4874 return log_oom();
4875 }
4876
4877 fdnames = strv_copy((char**) params->fd_names);
4878 if (!fdnames) {
4879 *exit_status = EXIT_MEMORY;
4880 return log_oom();
4881 }
4882
4883 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4884 if (r < 0) {
4885 *exit_status = EXIT_FDS;
4886 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4887 }
4888
4889 int keep_fds[n_fds + 3];
4890 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4891 n_keep_fds = n_fds;
4892
4893 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4894 if (r < 0) {
4895 *exit_status = EXIT_FDS;
4896 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4897 }
4898
4899 #if HAVE_LIBBPF
4900 if (unit->manager->restrict_fs) {
4901 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
4902 if (bpf_map_fd < 0) {
4903 *exit_status = EXIT_FDS;
4904 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
4905 }
4906
4907 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4908 if (r < 0) {
4909 *exit_status = EXIT_FDS;
4910 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4911 }
4912 }
4913 #endif
4914
4915 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
4916 if (r < 0) {
4917 *exit_status = EXIT_FDS;
4918 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
4919 }
4920
4921 if (!context->same_pgrp &&
4922 setsid() < 0) {
4923 *exit_status = EXIT_SETSID;
4924 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4925 }
4926
4927 exec_context_tty_reset(context, params);
4928
4929 if (unit_shall_confirm_spawn(unit)) {
4930 _cleanup_free_ char *cmdline = NULL;
4931
4932 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
4933 if (!cmdline) {
4934 *exit_status = EXIT_MEMORY;
4935 return log_oom();
4936 }
4937
4938 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
4939 if (r != CONFIRM_EXECUTE) {
4940 if (r == CONFIRM_PRETEND_SUCCESS) {
4941 *exit_status = EXIT_SUCCESS;
4942 return 0;
4943 }
4944 *exit_status = EXIT_CONFIRM;
4945 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4946 "Execution cancelled by the user");
4947 }
4948 }
4949
4950 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4951 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4952 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4953 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4954 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4955 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4956 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
4957 *exit_status = EXIT_MEMORY;
4958 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4959 }
4960
4961 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
4962 _cleanup_strv_free_ char **suggested_paths = NULL;
4963
4964 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
4965 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
4966 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4967 *exit_status = EXIT_USER;
4968 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4969 }
4970
4971 r = compile_suggested_paths(context, params, &suggested_paths);
4972 if (r < 0) {
4973 *exit_status = EXIT_MEMORY;
4974 return log_oom();
4975 }
4976
4977 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
4978 if (r < 0) {
4979 *exit_status = EXIT_USER;
4980 if (r == -EILSEQ)
4981 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4982 "Failed to update dynamic user credentials: User or group with specified name already exists.");
4983 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
4984 }
4985
4986 if (!uid_is_valid(uid)) {
4987 *exit_status = EXIT_USER;
4988 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
4989 }
4990
4991 if (!gid_is_valid(gid)) {
4992 *exit_status = EXIT_USER;
4993 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
4994 }
4995
4996 if (runtime->dynamic_creds->user)
4997 username = runtime->dynamic_creds->user->name;
4998
4999 } else {
5000 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
5001 if (r < 0) {
5002 *exit_status = EXIT_USER;
5003 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5004 }
5005
5006 r = get_fixed_group(context, &groupname, &gid);
5007 if (r < 0) {
5008 *exit_status = EXIT_GROUP;
5009 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
5010 }
5011 }
5012
5013 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
5014 r = get_supplementary_groups(context, username, groupname, gid,
5015 &supplementary_gids, &ngids);
5016 if (r < 0) {
5017 *exit_status = EXIT_GROUP;
5018 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
5019 }
5020
5021 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
5022 if (r < 0) {
5023 *exit_status = EXIT_USER;
5024 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
5025 }
5026
5027 user_lookup_fd = safe_close(user_lookup_fd);
5028
5029 r = acquire_home(context, uid, &home, &home_buffer);
5030 if (r < 0) {
5031 *exit_status = EXIT_CHDIR;
5032 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
5033 }
5034
5035 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
5036 if (socket_fd >= 0)
5037 (void) fd_nonblock(socket_fd, false);
5038
5039 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
5040 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
5041 if (params->cgroup_path) {
5042 _cleanup_free_ char *p = NULL;
5043
5044 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5045 if (r < 0) {
5046 *exit_status = EXIT_CGROUP;
5047 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5048 }
5049
5050 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
5051 if (r == -EUCLEAN) {
5052 *exit_status = EXIT_CGROUP;
5053 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
5054 "because the cgroup or one of its parents or "
5055 "siblings is in the threaded mode: %m", p);
5056 }
5057 if (r < 0) {
5058 *exit_status = EXIT_CGROUP;
5059 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
5060 }
5061 }
5062
5063 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5064 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
5065 if (r < 0) {
5066 *exit_status = EXIT_NETWORK;
5067 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
5068 }
5069 }
5070
5071 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5072 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
5073 if (r < 0) {
5074 *exit_status = EXIT_NAMESPACE;
5075 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
5076 }
5077 }
5078
5079 r = setup_input(context, params, socket_fd, named_iofds);
5080 if (r < 0) {
5081 *exit_status = EXIT_STDIN;
5082 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
5083 }
5084
5085 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5086 if (r < 0) {
5087 *exit_status = EXIT_STDOUT;
5088 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
5089 }
5090
5091 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
5092 if (r < 0) {
5093 *exit_status = EXIT_STDERR;
5094 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
5095 }
5096
5097 if (context->oom_score_adjust_set) {
5098 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
5099 * prohibit write access to this file, and we shouldn't trip up over that. */
5100 r = set_oom_score_adjust(context->oom_score_adjust);
5101 if (ERRNO_IS_PRIVILEGE(r))
5102 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
5103 else if (r < 0) {
5104 *exit_status = EXIT_OOM_ADJUST;
5105 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
5106 }
5107 }
5108
5109 if (context->coredump_filter_set) {
5110 r = set_coredump_filter(context->coredump_filter);
5111 if (ERRNO_IS_PRIVILEGE(r))
5112 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5113 else if (r < 0)
5114 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5115 }
5116
5117 if (context->nice_set) {
5118 r = setpriority_closest(context->nice);
5119 if (r < 0)
5120 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5121 }
5122
5123 if (context->cpu_sched_set) {
5124 struct sched_param param = {
5125 .sched_priority = context->cpu_sched_priority,
5126 };
5127
5128 r = sched_setscheduler(0,
5129 context->cpu_sched_policy |
5130 (context->cpu_sched_reset_on_fork ?
5131 SCHED_RESET_ON_FORK : 0),
5132 &param);
5133 if (r < 0) {
5134 *exit_status = EXIT_SETSCHEDULER;
5135 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
5136 }
5137 }
5138
5139 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
5140 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
5141 const CPUSet *cpu_set;
5142
5143 if (context->cpu_affinity_from_numa) {
5144 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
5145 if (r < 0) {
5146 *exit_status = EXIT_CPUAFFINITY;
5147 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
5148 }
5149
5150 cpu_set = &converted_cpu_set;
5151 } else
5152 cpu_set = &context->cpu_set;
5153
5154 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
5155 *exit_status = EXIT_CPUAFFINITY;
5156 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
5157 }
5158 }
5159
5160 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
5161 r = apply_numa_policy(&context->numa_policy);
5162 if (r < 0) {
5163 if (ERRNO_IS_NOT_SUPPORTED(r))
5164 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
5165 else {
5166 *exit_status = EXIT_NUMA_POLICY;
5167 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
5168 }
5169 }
5170 }
5171
5172 if (context->ioprio_set)
5173 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
5174 *exit_status = EXIT_IOPRIO;
5175 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
5176 }
5177
5178 if (context->timer_slack_nsec != NSEC_INFINITY)
5179 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
5180 *exit_status = EXIT_TIMERSLACK;
5181 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
5182 }
5183
5184 if (context->personality != PERSONALITY_INVALID) {
5185 r = safe_personality(context->personality);
5186 if (r < 0) {
5187 *exit_status = EXIT_PERSONALITY;
5188 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
5189 }
5190 }
5191
5192 if (context->utmp_id) {
5193 const char *line = context->tty_path ?
5194 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
5195 NULL;
5196 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
5197 line,
5198 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
5199 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
5200 USER_PROCESS,
5201 username);
5202 }
5203
5204 if (uid_is_valid(uid)) {
5205 r = chown_terminal(STDIN_FILENO, uid);
5206 if (r < 0) {
5207 *exit_status = EXIT_STDIN;
5208 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
5209 }
5210 }
5211
5212 if (params->cgroup_path) {
5213 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
5214 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
5215 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
5216 * touch a single hierarchy too. */
5217
5218 if (params->flags & EXEC_CGROUP_DELEGATE) {
5219 _cleanup_free_ char *p = NULL;
5220
5221 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
5222 if (r < 0) {
5223 *exit_status = EXIT_CGROUP;
5224 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
5225 }
5226
5227 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
5228 if (r < 0) {
5229 *exit_status = EXIT_CGROUP;
5230 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
5231 }
5232 if (r > 0) {
5233 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
5234 if (r < 0) {
5235 *exit_status = EXIT_CGROUP;
5236 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
5237 }
5238 }
5239 }
5240
5241 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
5242 if (cgroup_context_want_memory_pressure(cgroup_context)) {
5243 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
5244 if (r < 0) {
5245 *exit_status = EXIT_MEMORY;
5246 return log_oom();
5247 }
5248
5249 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
5250 if (r < 0) {
5251 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
5252 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
5253 memory_pressure_path = mfree(memory_pressure_path);
5254 }
5255 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
5256 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
5257 if (!memory_pressure_path) {
5258 *exit_status = EXIT_MEMORY;
5259 return log_oom();
5260 }
5261 }
5262 }
5263 }
5264
5265 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
5266
5267 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
5268 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
5269 if (r < 0)
5270 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
5271 }
5272
5273 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
5274 r = setup_credentials(context, params, unit->id, uid);
5275 if (r < 0) {
5276 *exit_status = EXIT_CREDENTIALS;
5277 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
5278 }
5279 }
5280
5281 r = build_environment(
5282 unit,
5283 context,
5284 params,
5285 cgroup_context,
5286 n_fds,
5287 fdnames,
5288 home,
5289 username,
5290 shell,
5291 journal_stream_dev,
5292 journal_stream_ino,
5293 memory_pressure_path,
5294 &our_env);
5295 if (r < 0) {
5296 *exit_status = EXIT_MEMORY;
5297 return log_oom();
5298 }
5299
5300 r = build_pass_environment(context, &pass_env);
5301 if (r < 0) {
5302 *exit_status = EXIT_MEMORY;
5303 return log_oom();
5304 }
5305
5306 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
5307 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
5308 * not specify PATH but the unit has ExecSearchPath. */
5309 if (!strv_isempty(context->exec_search_path)) {
5310 _cleanup_free_ char *joined = NULL;
5311
5312 joined = strv_join(context->exec_search_path, ":");
5313 if (!joined) {
5314 *exit_status = EXIT_MEMORY;
5315 return log_oom();
5316 }
5317
5318 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
5319 if (r < 0) {
5320 *exit_status = EXIT_MEMORY;
5321 return log_oom();
5322 }
5323 }
5324
5325 accum_env = strv_env_merge(params->environment,
5326 our_env,
5327 joined_exec_search_path,
5328 pass_env,
5329 context->environment,
5330 files_env);
5331 if (!accum_env) {
5332 *exit_status = EXIT_MEMORY;
5333 return log_oom();
5334 }
5335 accum_env = strv_env_clean(accum_env);
5336
5337 (void) umask(context->umask);
5338
5339 r = setup_keyring(unit, context, params, uid, gid);
5340 if (r < 0) {
5341 *exit_status = EXIT_KEYRING;
5342 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
5343 }
5344
5345 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
5346 * from it. */
5347 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
5348
5349 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
5350 * for it, and the kernel doesn't actually support ambient caps. */
5351 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
5352
5353 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
5354 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
5355 * desired. */
5356 if (needs_ambient_hack)
5357 needs_setuid = false;
5358 else
5359 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
5360
5361 uint64_t capability_ambient_set = context->capability_ambient_set;
5362
5363 if (needs_sandboxing) {
5364 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
5365 * /sys being present. The actual MAC context application will happen later, as late as
5366 * possible, to avoid impacting our own code paths. */
5367
5368 #if HAVE_SELINUX
5369 use_selinux = mac_selinux_use();
5370 #endif
5371 #if ENABLE_SMACK
5372 use_smack = mac_smack_use();
5373 #endif
5374 #if HAVE_APPARMOR
5375 use_apparmor = mac_apparmor_use();
5376 #endif
5377 }
5378
5379 if (needs_sandboxing) {
5380 int which_failed;
5381
5382 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
5383 * is set here. (See below.) */
5384
5385 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
5386 if (r < 0) {
5387 *exit_status = EXIT_LIMITS;
5388 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
5389 }
5390 }
5391
5392 if (needs_setuid && context->pam_name && username) {
5393 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
5394 * wins here. (See above.) */
5395
5396 /* All fds passed in the fds array will be closed in the pam child process. */
5397 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
5398 if (r < 0) {
5399 *exit_status = EXIT_PAM;
5400 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
5401 }
5402
5403 if (ambient_capabilities_supported()) {
5404 uint64_t ambient_after_pam;
5405
5406 /* PAM modules might have set some ambient caps. Query them here and merge them into
5407 * the caps we want to set in the end, so that we don't end up unsetting them. */
5408 r = capability_get_ambient(&ambient_after_pam);
5409 if (r < 0) {
5410 *exit_status = EXIT_CAPABILITIES;
5411 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
5412 }
5413
5414 capability_ambient_set |= ambient_after_pam;
5415 }
5416
5417 ngids_after_pam = getgroups_alloc(&gids_after_pam);
5418 if (ngids_after_pam < 0) {
5419 *exit_status = EXIT_MEMORY;
5420 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5421 }
5422 }
5423
5424 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5425 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
5426 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
5427 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
5428
5429 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5430 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
5431 * the actual requested operations fail (or silently continue). */
5432 if (r < 0 && context->private_users) {
5433 *exit_status = EXIT_USER;
5434 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5435 }
5436 if (r < 0)
5437 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
5438 else
5439 userns_set_up = true;
5440 }
5441
5442 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
5443
5444 /* Try to enable network namespacing if network namespacing is available and we have
5445 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
5446 * new network namespace. And if we don't have that, then we could only create a network
5447 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
5448 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
5449 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
5450 if (r < 0) {
5451 if (ERRNO_IS_PRIVILEGE(r))
5452 log_unit_notice_errno(unit, r,
5453 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
5454 else {
5455 *exit_status = EXIT_NETWORK;
5456 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
5457 }
5458 }
5459 } else if (context->network_namespace_path) {
5460 *exit_status = EXIT_NETWORK;
5461 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5462 "NetworkNamespacePath= is not supported, refusing.");
5463 } else
5464 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
5465 }
5466
5467 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
5468
5469 if (ns_type_supported(NAMESPACE_IPC)) {
5470 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
5471 if (r == -EPERM)
5472 log_unit_warning_errno(unit, r,
5473 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
5474 else if (r < 0) {
5475 *exit_status = EXIT_NAMESPACE;
5476 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
5477 }
5478 } else if (context->ipc_namespace_path) {
5479 *exit_status = EXIT_NAMESPACE;
5480 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
5481 "IPCNamespacePath= is not supported, refusing.");
5482 } else
5483 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
5484 }
5485
5486 if (needs_mount_namespace) {
5487 _cleanup_free_ char *error_path = NULL;
5488
5489 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
5490 if (r < 0) {
5491 *exit_status = EXIT_NAMESPACE;
5492 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5493 error_path ? ": " : "", strempty(error_path));
5494 }
5495 }
5496
5497 if (needs_sandboxing) {
5498 r = apply_protect_hostname(unit, context, exit_status);
5499 if (r < 0)
5500 return r;
5501 }
5502
5503 if (context->memory_ksm >= 0)
5504 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
5505 if (ERRNO_IS_NOT_SUPPORTED(errno))
5506 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
5507 else {
5508 *exit_status = EXIT_KSM;
5509 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
5510 }
5511 }
5512
5513 /* Drop groups as early as possible.
5514 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5515 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
5516 if (needs_setuid) {
5517 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5518 int ngids_to_enforce = 0;
5519
5520 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5521 ngids,
5522 gids_after_pam,
5523 ngids_after_pam,
5524 &gids_to_enforce);
5525 if (ngids_to_enforce < 0) {
5526 *exit_status = EXIT_MEMORY;
5527 return log_unit_error_errno(unit,
5528 ngids_to_enforce,
5529 "Failed to merge group lists. Group membership might be incorrect: %m");
5530 }
5531
5532 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
5533 if (r < 0) {
5534 *exit_status = EXIT_GROUP;
5535 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
5536 }
5537 }
5538
5539 /* If the user namespace was not set up above, try to do it now.
5540 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
5541 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5542 * case of mount namespaces being less privileged when the mount point list is copied from a
5543 * different user namespace). */
5544
5545 if (needs_sandboxing && context->private_users && !userns_set_up) {
5546 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5547 if (r < 0) {
5548 *exit_status = EXIT_USER;
5549 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
5550 }
5551 }
5552
5553 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5554 * shall execute. */
5555
5556 _cleanup_free_ char *executable = NULL;
5557 _cleanup_close_ int executable_fd = -EBADF;
5558 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
5559 if (r < 0) {
5560 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
5561 log_unit_struct_errno(unit, LOG_INFO, r,
5562 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5563 LOG_UNIT_INVOCATION_ID(unit),
5564 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5565 command->path),
5566 "EXECUTABLE=%s", command->path);
5567 return 0;
5568 }
5569
5570 *exit_status = EXIT_EXEC;
5571
5572 return log_unit_struct_errno(unit, LOG_INFO, r,
5573 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5574 LOG_UNIT_INVOCATION_ID(unit),
5575 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5576 command->path),
5577 "EXECUTABLE=%s", command->path);
5578 }
5579
5580 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5581 if (r < 0) {
5582 *exit_status = EXIT_FDS;
5583 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5584 }
5585
5586 #if HAVE_SELINUX
5587 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
5588 int fd = -EBADF;
5589
5590 if (socket_fd >= 0)
5591 fd = socket_fd;
5592 else if (params->n_socket_fds == 1)
5593 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5594 * use context from that fd to compute the label. */
5595 fd = params->fds[0];
5596
5597 if (fd >= 0) {
5598 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
5599 if (r < 0) {
5600 if (!context->selinux_context_ignore) {
5601 *exit_status = EXIT_SELINUX_CONTEXT;
5602 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5603 }
5604 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
5605 }
5606 }
5607 }
5608 #endif
5609
5610 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
5611 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
5612 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
5613 * execve(). */
5614
5615 r = close_all_fds(keep_fds, n_keep_fds);
5616 if (r >= 0)
5617 r = shift_fds(fds, n_fds);
5618 if (r >= 0)
5619 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
5620 if (r < 0) {
5621 *exit_status = EXIT_FDS;
5622 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
5623 }
5624
5625 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5626 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5627 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5628 * came this far. */
5629
5630 secure_bits = context->secure_bits;
5631
5632 if (needs_sandboxing) {
5633 uint64_t bset;
5634
5635 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
5636 * (Note this is placed after the general resource limit initialization, see above, in order
5637 * to take precedence.) */
5638 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5639 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5640 *exit_status = EXIT_LIMITS;
5641 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
5642 }
5643 }
5644
5645 #if ENABLE_SMACK
5646 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5647 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5648 if (use_smack) {
5649 r = setup_smack(unit->manager, context, executable_fd);
5650 if (r < 0 && !context->smack_process_label_ignore) {
5651 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5652 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5653 }
5654 }
5655 #endif
5656
5657 bset = context->capability_bounding_set;
5658 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5659 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5660 * instead of us doing that */
5661 if (needs_ambient_hack)
5662 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5663 (UINT64_C(1) << CAP_SETUID) |
5664 (UINT64_C(1) << CAP_SETGID);
5665
5666 if (!cap_test_all(bset)) {
5667 r = capability_bounding_set_drop(bset, /* right_now= */ false);
5668 if (r < 0) {
5669 *exit_status = EXIT_CAPABILITIES;
5670 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
5671 }
5672 }
5673
5674 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5675 * keep-caps set.
5676 *
5677 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5678 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5679 * the ambient capabilities can be raised as they are present in the permitted and
5680 * inheritable set. However it is possible that someone wants to set ambient capabilities
5681 * without changing the user, so we also set the ambient capabilities here.
5682 *
5683 * The requested ambient capabilities are raised in the inheritable set if the second
5684 * argument is true. */
5685 if (!needs_ambient_hack) {
5686 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
5687 if (r < 0) {
5688 *exit_status = EXIT_CAPABILITIES;
5689 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
5690 }
5691 }
5692 }
5693
5694 /* chroot to root directory first, before we lose the ability to chroot */
5695 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
5696 if (r < 0)
5697 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5698
5699 if (needs_setuid) {
5700 if (uid_is_valid(uid)) {
5701 r = enforce_user(context, uid, capability_ambient_set);
5702 if (r < 0) {
5703 *exit_status = EXIT_USER;
5704 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5705 }
5706
5707 if (!needs_ambient_hack && capability_ambient_set != 0) {
5708
5709 /* Raise the ambient capabilities after user change. */
5710 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
5711 if (r < 0) {
5712 *exit_status = EXIT_CAPABILITIES;
5713 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
5714 }
5715 }
5716 }
5717 }
5718
5719 /* Apply working directory here, because the working directory might be on NFS and only the user running
5720 * this service might have the correct privilege to change to the working directory */
5721 r = apply_working_directory(context, params, runtime, home, exit_status);
5722 if (r < 0)
5723 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5724
5725 if (needs_sandboxing) {
5726 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5727 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5728 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5729 * are restricted. */
5730
5731 #if HAVE_SELINUX
5732 if (use_selinux) {
5733 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5734
5735 if (exec_context) {
5736 r = setexeccon(exec_context);
5737 if (r < 0) {
5738 if (!context->selinux_context_ignore) {
5739 *exit_status = EXIT_SELINUX_CONTEXT;
5740 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5741 }
5742 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5743 }
5744 }
5745 }
5746 #endif
5747
5748 #if HAVE_APPARMOR
5749 if (use_apparmor && context->apparmor_profile) {
5750 r = aa_change_onexec(context->apparmor_profile);
5751 if (r < 0 && !context->apparmor_profile_ignore) {
5752 *exit_status = EXIT_APPARMOR_PROFILE;
5753 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5754 }
5755 }
5756 #endif
5757
5758 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5759 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5760 * requires CAP_SETPCAP. */
5761 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
5762 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
5763 * effective set here.
5764 *
5765 * The effective set is overwritten during execve() with the following values:
5766 *
5767 * - ambient set (for non-root processes)
5768 *
5769 * - (inheritable | bounding) set (for root processes)
5770 *
5771 * Hence there is no security impact to raise it in the effective set before execve
5772 */
5773 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
5774 if (r < 0) {
5775 *exit_status = EXIT_CAPABILITIES;
5776 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5777 }
5778 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
5779 *exit_status = EXIT_SECUREBITS;
5780 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
5781 }
5782 }
5783
5784 if (context_has_no_new_privileges(context))
5785 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
5786 *exit_status = EXIT_NO_NEW_PRIVILEGES;
5787 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
5788 }
5789
5790 #if HAVE_SECCOMP
5791 r = apply_address_families(unit, context);
5792 if (r < 0) {
5793 *exit_status = EXIT_ADDRESS_FAMILIES;
5794 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
5795 }
5796
5797 r = apply_memory_deny_write_execute(unit, context);
5798 if (r < 0) {
5799 *exit_status = EXIT_SECCOMP;
5800 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
5801 }
5802
5803 r = apply_restrict_realtime(unit, context);
5804 if (r < 0) {
5805 *exit_status = EXIT_SECCOMP;
5806 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
5807 }
5808
5809 r = apply_restrict_suid_sgid(unit, context);
5810 if (r < 0) {
5811 *exit_status = EXIT_SECCOMP;
5812 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5813 }
5814
5815 r = apply_restrict_namespaces(unit, context);
5816 if (r < 0) {
5817 *exit_status = EXIT_SECCOMP;
5818 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
5819 }
5820
5821 r = apply_protect_sysctl(unit, context);
5822 if (r < 0) {
5823 *exit_status = EXIT_SECCOMP;
5824 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
5825 }
5826
5827 r = apply_protect_kernel_modules(unit, context);
5828 if (r < 0) {
5829 *exit_status = EXIT_SECCOMP;
5830 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
5831 }
5832
5833 r = apply_protect_kernel_logs(unit, context);
5834 if (r < 0) {
5835 *exit_status = EXIT_SECCOMP;
5836 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5837 }
5838
5839 r = apply_protect_clock(unit, context);
5840 if (r < 0) {
5841 *exit_status = EXIT_SECCOMP;
5842 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5843 }
5844
5845 r = apply_private_devices(unit, context);
5846 if (r < 0) {
5847 *exit_status = EXIT_SECCOMP;
5848 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
5849 }
5850
5851 r = apply_syscall_archs(unit, context);
5852 if (r < 0) {
5853 *exit_status = EXIT_SECCOMP;
5854 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
5855 }
5856
5857 r = apply_lock_personality(unit, context);
5858 if (r < 0) {
5859 *exit_status = EXIT_SECCOMP;
5860 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
5861 }
5862
5863 r = apply_syscall_log(unit, context);
5864 if (r < 0) {
5865 *exit_status = EXIT_SECCOMP;
5866 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5867 }
5868
5869 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5870 * by the filter as little as possible. */
5871 r = apply_syscall_filter(unit, context, needs_ambient_hack);
5872 if (r < 0) {
5873 *exit_status = EXIT_SECCOMP;
5874 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
5875 }
5876 #endif
5877
5878 #if HAVE_LIBBPF
5879 r = apply_restrict_filesystems(unit, context);
5880 if (r < 0) {
5881 *exit_status = EXIT_BPF;
5882 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5883 }
5884 #endif
5885
5886 }
5887
5888 if (!strv_isempty(context->unset_environment)) {
5889 char **ee = NULL;
5890
5891 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5892 if (!ee) {
5893 *exit_status = EXIT_MEMORY;
5894 return log_oom();
5895 }
5896
5897 strv_free_and_replace(accum_env, ee);
5898 }
5899
5900 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5901 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5902
5903 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5904 if (r < 0) {
5905 *exit_status = EXIT_MEMORY;
5906 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
5907 }
5908 final_argv = replaced_argv;
5909
5910 if (!strv_isempty(unset_variables)) {
5911 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5912 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5913 }
5914
5915 if (!strv_isempty(bad_variables)) {
5916 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5917 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5918 }
5919 } else
5920 final_argv = command->argv;
5921
5922 log_command_line(unit, "Executing", executable, final_argv);
5923
5924 if (exec_fd >= 0) {
5925 uint8_t hot = 1;
5926
5927 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5928 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5929
5930 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5931 *exit_status = EXIT_EXEC;
5932 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5933 }
5934 }
5935
5936 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5937
5938 if (exec_fd >= 0) {
5939 uint8_t hot = 0;
5940
5941 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5942 * that POLLHUP on it no longer means execve() succeeded. */
5943
5944 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5945 *exit_status = EXIT_EXEC;
5946 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5947 }
5948 }
5949
5950 *exit_status = EXIT_EXEC;
5951 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
5952 }
5953
5954 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l); /* Forward declaration: called by exec_spawn() below, before it forks. */
5955 static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]); /* Forward declaration: called by exec_spawn() below, before it forks. */
5956
5957 int exec_spawn(Unit *unit,
5958 ExecCommand *command,
5959 const ExecContext *context,
5960 const ExecParameters *params,
5961 ExecRuntime *runtime,
5962 const CGroupContext *cgroup_context,
5963 pid_t *ret) {
5964
5965 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
5966 _cleanup_free_ char *subcgroup_path = NULL;
5967 _cleanup_strv_free_ char **files_env = NULL;
5968 size_t n_storage_fds = 0, n_socket_fds = 0;
5969 pid_t pid;
5970
5971 assert(unit);
5972 assert(command);
5973 assert(context);
5974 assert(ret);
5975 assert(params);
5976 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
5977
5978 LOG_CONTEXT_PUSH_UNIT(unit);
5979
5980 if (context->std_input == EXEC_INPUT_SOCKET ||
5981 context->std_output == EXEC_OUTPUT_SOCKET ||
5982 context->std_error == EXEC_OUTPUT_SOCKET) {
5983
5984 if (params->n_socket_fds > 1)
5985 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
5986
5987 if (params->n_socket_fds == 0)
5988 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
5989
5990 socket_fd = params->fds[0];
5991 } else {
5992 socket_fd = -EBADF;
5993 fds = params->fds;
5994 n_socket_fds = params->n_socket_fds;
5995 n_storage_fds = params->n_storage_fds;
5996 }
5997
5998 r = exec_context_named_iofds(context, params, named_iofds);
5999 if (r < 0)
6000 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
6001
6002 r = exec_context_load_environment(unit, context, &files_env);
6003 if (r < 0)
6004 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
6005
6006 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
6007 and, until the next SELinux policy changes, we save further reloads in future children. */
6008 mac_selinux_maybe_reload();
6009
6010 /* We won't know the real executable path until we create the mount namespace in the child, but we
6011 want to log from the parent, so we use the possibly inaccurate path here. */
6012 log_command_line(unit, "About to execute", command->path, command->argv);
6013
6014 if (params->cgroup_path) {
6015 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
6016 if (r < 0)
6017 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
6018 if (r > 0) {
6019 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
6020 * realized by the unit logic) */
6021
6022 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
6023 if (r < 0)
6024 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
6025 }
6026 }
6027
6028 pid = fork();
6029 if (pid < 0)
6030 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
6031
6032 if (pid == 0) {
6033 int exit_status = EXIT_SUCCESS;
6034
6035 r = exec_child(unit,
6036 command,
6037 context,
6038 params,
6039 runtime,
6040 cgroup_context,
6041 socket_fd,
6042 named_iofds,
6043 fds,
6044 n_socket_fds,
6045 n_storage_fds,
6046 files_env,
6047 unit->manager->user_lookup_fds[1],
6048 &exit_status);
6049
6050 if (r < 0) {
6051 const char *status =
6052 exit_status_to_string(exit_status,
6053 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
6054
6055 log_unit_struct_errno(unit, LOG_ERR, r,
6056 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
6057 LOG_UNIT_INVOCATION_ID(unit),
6058 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
6059 status, command->path),
6060 "EXECUTABLE=%s", command->path);
6061 }
6062
6063 _exit(exit_status);
6064 }
6065
6066 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
6067
6068 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
6069 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
6070 * process will be killed too). */
6071 if (subcgroup_path)
6072 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
6073
6074 exec_status_start(&command->exec_status, pid);
6075
6076 *ret = pid;
6077 return 0;
6078 }
6079
6080 void exec_context_init(ExecContext *c) {
6081 assert(c);
6082
6083 c->umask = 0022;
6084 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
6085 c->cpu_sched_policy = SCHED_OTHER;
6086 c->syslog_priority = LOG_DAEMON|LOG_INFO;
6087 c->syslog_level_prefix = true;
6088 c->ignore_sigpipe = true;
6089 c->timer_slack_nsec = NSEC_INFINITY;
6090 c->personality = PERSONALITY_INVALID;
6091 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6092 c->directories[t].mode = 0755;
6093 c->timeout_clean_usec = USEC_INFINITY;
6094 c->capability_bounding_set = CAP_MASK_UNSET;
6095 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
6096 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
6097 c->log_level_max = -1;
6098 #if HAVE_SECCOMP
6099 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
6100 #endif
6101 c->tty_rows = UINT_MAX;
6102 c->tty_cols = UINT_MAX;
6103 numa_policy_reset(&c->numa_policy);
6104 c->private_mounts = -1;
6105 c->memory_ksm = -1;
6106 }
6107
6108 void exec_context_done(ExecContext *c) {
6109 assert(c);
6110
6111 c->environment = strv_free(c->environment);
6112 c->environment_files = strv_free(c->environment_files);
6113 c->pass_environment = strv_free(c->pass_environment);
6114 c->unset_environment = strv_free(c->unset_environment);
6115
6116 rlimit_free_all(c->rlimit);
6117
6118 for (size_t l = 0; l < 3; l++) {
6119 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
6120 c->stdio_file[l] = mfree(c->stdio_file[l]);
6121 }
6122
6123 c->working_directory = mfree(c->working_directory);
6124 c->root_directory = mfree(c->root_directory);
6125 c->root_image = mfree(c->root_image);
6126 c->root_image_options = mount_options_free_all(c->root_image_options);
6127 c->root_hash = mfree(c->root_hash);
6128 c->root_hash_size = 0;
6129 c->root_hash_path = mfree(c->root_hash_path);
6130 c->root_hash_sig = mfree(c->root_hash_sig);
6131 c->root_hash_sig_size = 0;
6132 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
6133 c->root_verity = mfree(c->root_verity);
6134 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
6135 c->extension_directories = strv_free(c->extension_directories);
6136 c->tty_path = mfree(c->tty_path);
6137 c->syslog_identifier = mfree(c->syslog_identifier);
6138 c->user = mfree(c->user);
6139 c->group = mfree(c->group);
6140
6141 c->supplementary_groups = strv_free(c->supplementary_groups);
6142
6143 c->pam_name = mfree(c->pam_name);
6144
6145 c->read_only_paths = strv_free(c->read_only_paths);
6146 c->read_write_paths = strv_free(c->read_write_paths);
6147 c->inaccessible_paths = strv_free(c->inaccessible_paths);
6148 c->exec_paths = strv_free(c->exec_paths);
6149 c->no_exec_paths = strv_free(c->no_exec_paths);
6150 c->exec_search_path = strv_free(c->exec_search_path);
6151
6152 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
6153 c->bind_mounts = NULL;
6154 c->n_bind_mounts = 0;
6155 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
6156 c->temporary_filesystems = NULL;
6157 c->n_temporary_filesystems = 0;
6158 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
6159
6160 cpu_set_reset(&c->cpu_set);
6161 numa_policy_reset(&c->numa_policy);
6162
6163 c->utmp_id = mfree(c->utmp_id);
6164 c->selinux_context = mfree(c->selinux_context);
6165 c->apparmor_profile = mfree(c->apparmor_profile);
6166 c->smack_process_label = mfree(c->smack_process_label);
6167
6168 c->restrict_filesystems = set_free(c->restrict_filesystems);
6169
6170 c->syscall_filter = hashmap_free(c->syscall_filter);
6171 c->syscall_archs = set_free(c->syscall_archs);
6172 c->address_families = set_free(c->address_families);
6173
6174 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
6175 exec_directory_done(&c->directories[t]);
6176
6177 c->log_level_max = -1;
6178
6179 exec_context_free_log_extra_fields(c);
6180 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
6181 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
6182
6183 c->log_ratelimit_interval_usec = 0;
6184 c->log_ratelimit_burst = 0;
6185
6186 c->stdin_data = mfree(c->stdin_data);
6187 c->stdin_data_size = 0;
6188
6189 c->network_namespace_path = mfree(c->network_namespace_path);
6190 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
6191
6192 c->log_namespace = mfree(c->log_namespace);
6193
6194 c->load_credentials = hashmap_free(c->load_credentials);
6195 c->set_credentials = hashmap_free(c->set_credentials);
6196 c->import_credentials = set_free(c->import_credentials);
6197
6198 c->root_image_policy = image_policy_free(c->root_image_policy);
6199 c->mount_image_policy = image_policy_free(c->mount_image_policy);
6200 c->extension_image_policy = image_policy_free(c->extension_image_policy);
6201 }
6202
6203 int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
6204 assert(c);
6205
6206 if (!runtime_prefix)
6207 return 0;
6208
6209 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
6210 _cleanup_free_ char *p = NULL;
6211
6212 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6213 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6214 else
6215 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
6216 if (!p)
6217 return -ENOMEM;
6218
6219 /* We execute this synchronously, since we need to be sure this is gone when we start the
6220 * service next. */
6221 (void) rm_rf(p, REMOVE_ROOT);
6222
6223 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
6224 _cleanup_free_ char *symlink_abs = NULL;
6225
6226 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
6227 symlink_abs = path_join(runtime_prefix, "private", *symlink);
6228 else
6229 symlink_abs = path_join(runtime_prefix, *symlink);
6230 if (!symlink_abs)
6231 return -ENOMEM;
6232
6233 (void) unlink(symlink_abs);
6234 }
6235 }
6236
6237 return 0;
6238 }
6239
6240 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
6241 _cleanup_free_ char *p = NULL;
6242
6243 assert(c);
6244
6245 if (!runtime_prefix || !unit)
6246 return 0;
6247
6248 p = path_join(runtime_prefix, "credentials", unit);
6249 if (!p)
6250 return -ENOMEM;
6251
6252 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
6253 * unmount it, and afterwards remove the mount point */
6254 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
6255 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
6256
6257 return 0;
6258 }
6259
6260 int exec_context_destroy_mount_ns_dir(Unit *u) {
6261 _cleanup_free_ char *p = NULL;
6262
6263 if (!u || !MANAGER_IS_SYSTEM(u->manager))
6264 return 0;
6265
6266 p = path_join("/run/systemd/propagate/", u->id);
6267 if (!p)
6268 return -ENOMEM;
6269
6270 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
6271 if (rmdir(p) < 0 && errno != ENOENT)
6272 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
6273
6274 return 0;
6275 }
6276
6277 static void exec_command_done(ExecCommand *c) {
6278 assert(c);
6279
6280 c->path = mfree(c->path);
6281 c->argv = strv_free(c->argv);
6282 }
6283
6284 void exec_command_done_array(ExecCommand *c, size_t n) {
6285 for (size_t i = 0; i < n; i++)
6286 exec_command_done(c+i);
6287 }
6288
6289 ExecCommand* exec_command_free_list(ExecCommand *c) {
6290 ExecCommand *i;
6291
6292 while ((i = c)) {
6293 LIST_REMOVE(command, c, i);
6294 exec_command_done(i);
6295 free(i);
6296 }
6297
6298 return NULL;
6299 }
6300
6301 void exec_command_free_array(ExecCommand **c, size_t n) {
6302 for (size_t i = 0; i < n; i++)
6303 c[i] = exec_command_free_list(c[i]);
6304 }
6305
6306 void exec_command_reset_status_array(ExecCommand *c, size_t n) {
6307 for (size_t i = 0; i < n; i++)
6308 exec_status_reset(&c[i].exec_status);
6309 }
6310
6311 void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
6312 for (size_t i = 0; i < n; i++)
6313 LIST_FOREACH(command, z, c[i])
6314 exec_status_reset(&z->exec_status);
6315 }
6316
6317 typedef struct InvalidEnvInfo {
6318 const Unit *unit;
6319 const char *path;
6320 } InvalidEnvInfo;
6321
6322 static void invalid_env(const char *p, void *userdata) {
6323 InvalidEnvInfo *info = userdata;
6324
6325 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
6326 }
6327
6328 const char* exec_context_fdname(const ExecContext *c, int fd_index) {
6329 assert(c);
6330
6331 switch (fd_index) {
6332
6333 case STDIN_FILENO:
6334 if (c->std_input != EXEC_INPUT_NAMED_FD)
6335 return NULL;
6336
6337 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
6338
6339 case STDOUT_FILENO:
6340 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
6341 return NULL;
6342
6343 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
6344
6345 case STDERR_FILENO:
6346 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
6347 return NULL;
6348
6349 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
6350
6351 default:
6352 return NULL;
6353 }
6354 }
6355
6356 static int exec_context_named_iofds(
6357 const ExecContext *c,
6358 const ExecParameters *p,
6359 int named_iofds[static 3]) {
6360
6361 size_t targets;
6362 const char* stdio_fdname[3];
6363 size_t n_fds;
6364
6365 assert(c);
6366 assert(p);
6367 assert(named_iofds);
6368
6369 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
6370 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
6371 (c->std_error == EXEC_OUTPUT_NAMED_FD);
6372
6373 for (size_t i = 0; i < 3; i++)
6374 stdio_fdname[i] = exec_context_fdname(c, i);
6375
6376 n_fds = p->n_storage_fds + p->n_socket_fds;
6377
6378 for (size_t i = 0; i < n_fds && targets > 0; i++)
6379 if (named_iofds[STDIN_FILENO] < 0 &&
6380 c->std_input == EXEC_INPUT_NAMED_FD &&
6381 stdio_fdname[STDIN_FILENO] &&
6382 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
6383
6384 named_iofds[STDIN_FILENO] = p->fds[i];
6385 targets--;
6386
6387 } else if (named_iofds[STDOUT_FILENO] < 0 &&
6388 c->std_output == EXEC_OUTPUT_NAMED_FD &&
6389 stdio_fdname[STDOUT_FILENO] &&
6390 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
6391
6392 named_iofds[STDOUT_FILENO] = p->fds[i];
6393 targets--;
6394
6395 } else if (named_iofds[STDERR_FILENO] < 0 &&
6396 c->std_error == EXEC_OUTPUT_NAMED_FD &&
6397 stdio_fdname[STDERR_FILENO] &&
6398 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
6399
6400 named_iofds[STDERR_FILENO] = p->fds[i];
6401 targets--;
6402 }
6403
6404 return targets == 0 ? 0 : -ENOENT;
6405 }
6406
6407 static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
6408 _cleanup_strv_free_ char **v = NULL;
6409 int r;
6410
6411 assert(c);
6412 assert(ret);
6413
6414 STRV_FOREACH(i, c->environment_files) {
6415 _cleanup_globfree_ glob_t pglob = {};
6416 bool ignore = false;
6417 char *fn = *i;
6418
6419 if (fn[0] == '-') {
6420 ignore = true;
6421 fn++;
6422 }
6423
6424 if (!path_is_absolute(fn)) {
6425 if (ignore)
6426 continue;
6427 return -EINVAL;
6428 }
6429
6430 /* Filename supports globbing, take all matching files */
6431 r = safe_glob(fn, 0, &pglob);
6432 if (r < 0) {
6433 if (ignore)
6434 continue;
6435 return r;
6436 }
6437
6438 /* When we don't match anything, -ENOENT should be returned */
6439 assert(pglob.gl_pathc > 0);
6440
6441 for (size_t n = 0; n < pglob.gl_pathc; n++) {
6442 _cleanup_strv_free_ char **p = NULL;
6443
6444 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
6445 if (r < 0) {
6446 if (ignore)
6447 continue;
6448 return r;
6449 }
6450
6451 /* Log invalid environment variables with filename */
6452 if (p) {
6453 InvalidEnvInfo info = {
6454 .unit = unit,
6455 .path = pglob.gl_pathv[n]
6456 };
6457
6458 p = strv_env_clean_with_callback(p, invalid_env, &info);
6459 }
6460
6461 if (!v)
6462 v = TAKE_PTR(p);
6463 else {
6464 char **m = strv_env_merge(v, p);
6465 if (!m)
6466 return -ENOMEM;
6467
6468 strv_free_and_replace(v, m);
6469 }
6470 }
6471 }
6472
6473 *ret = TAKE_PTR(v);
6474
6475 return 0;
6476 }
6477
6478 static bool tty_may_match_dev_console(const char *tty) {
6479 _cleanup_free_ char *resolved = NULL;
6480
6481 if (!tty)
6482 return true;
6483
6484 tty = skip_dev_prefix(tty);
6485
6486 /* trivial identity? */
6487 if (streq(tty, "console"))
6488 return true;
6489
6490 if (resolve_dev_console(&resolved) < 0)
6491 return true; /* if we could not resolve, assume it may */
6492
6493 /* "tty0" means the active VC, so it may be the same sometimes */
6494 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6495 }
6496
6497 static bool exec_context_may_touch_tty(const ExecContext *ec) {
6498 assert(ec);
6499
6500 return ec->tty_reset ||
6501 ec->tty_vhangup ||
6502 ec->tty_vt_disallocate ||
6503 is_terminal_input(ec->std_input) ||
6504 is_terminal_output(ec->std_output) ||
6505 is_terminal_output(ec->std_error);
6506 }
6507
6508 bool exec_context_may_touch_console(const ExecContext *ec) {
6509
6510 return exec_context_may_touch_tty(ec) &&
6511 tty_may_match_dev_console(exec_context_tty_path(ec));
6512 }
6513
6514 static void strv_fprintf(FILE *f, char **l) {
6515 assert(f);
6516
6517 STRV_FOREACH(g, l)
6518 fprintf(f, " %s", *g);
6519 }
6520
/* Writes one "<prefix><name>: item item ..." line for the string vector to 'f'. Nothing at all is
 * written if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
6532
6533 void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
6534 int r;
6535
6536 assert(c);
6537 assert(f);
6538
6539 prefix = strempty(prefix);
6540
6541 fprintf(f,
6542 "%sUMask: %04o\n"
6543 "%sWorkingDirectory: %s\n"
6544 "%sRootDirectory: %s\n"
6545 "%sRootEphemeral: %s\n"
6546 "%sNonBlocking: %s\n"
6547 "%sPrivateTmp: %s\n"
6548 "%sPrivateDevices: %s\n"
6549 "%sProtectKernelTunables: %s\n"
6550 "%sProtectKernelModules: %s\n"
6551 "%sProtectKernelLogs: %s\n"
6552 "%sProtectClock: %s\n"
6553 "%sProtectControlGroups: %s\n"
6554 "%sPrivateNetwork: %s\n"
6555 "%sPrivateUsers: %s\n"
6556 "%sProtectHome: %s\n"
6557 "%sProtectSystem: %s\n"
6558 "%sMountAPIVFS: %s\n"
6559 "%sIgnoreSIGPIPE: %s\n"
6560 "%sMemoryDenyWriteExecute: %s\n"
6561 "%sRestrictRealtime: %s\n"
6562 "%sRestrictSUIDSGID: %s\n"
6563 "%sKeyringMode: %s\n"
6564 "%sProtectHostname: %s\n"
6565 "%sProtectProc: %s\n"
6566 "%sProcSubset: %s\n",
6567 prefix, c->umask,
6568 prefix, empty_to_root(c->working_directory),
6569 prefix, empty_to_root(c->root_directory),
6570 prefix, yes_no(c->root_ephemeral),
6571 prefix, yes_no(c->non_blocking),
6572 prefix, yes_no(c->private_tmp),
6573 prefix, yes_no(c->private_devices),
6574 prefix, yes_no(c->protect_kernel_tunables),
6575 prefix, yes_no(c->protect_kernel_modules),
6576 prefix, yes_no(c->protect_kernel_logs),
6577 prefix, yes_no(c->protect_clock),
6578 prefix, yes_no(c->protect_control_groups),
6579 prefix, yes_no(c->private_network),
6580 prefix, yes_no(c->private_users),
6581 prefix, protect_home_to_string(c->protect_home),
6582 prefix, protect_system_to_string(c->protect_system),
6583 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
6584 prefix, yes_no(c->ignore_sigpipe),
6585 prefix, yes_no(c->memory_deny_write_execute),
6586 prefix, yes_no(c->restrict_realtime),
6587 prefix, yes_no(c->restrict_suid_sgid),
6588 prefix, exec_keyring_mode_to_string(c->keyring_mode),
6589 prefix, yes_no(c->protect_hostname),
6590 prefix, protect_proc_to_string(c->protect_proc),
6591 prefix, proc_subset_to_string(c->proc_subset));
6592
6593 if (c->root_image)
6594 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6595
6596 if (c->root_image_options) {
6597 fprintf(f, "%sRootImageOptions:", prefix);
6598 LIST_FOREACH(mount_options, o, c->root_image_options)
6599 if (!isempty(o->options))
6600 fprintf(f, " %s:%s",
6601 partition_designator_to_string(o->partition_designator),
6602 o->options);
6603 fprintf(f, "\n");
6604 }
6605
6606 if (c->root_hash) {
6607 _cleanup_free_ char *encoded = NULL;
6608 encoded = hexmem(c->root_hash, c->root_hash_size);
6609 if (encoded)
6610 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6611 }
6612
6613 if (c->root_hash_path)
6614 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6615
6616 if (c->root_hash_sig) {
6617 _cleanup_free_ char *encoded = NULL;
6618 ssize_t len;
6619 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6620 if (len)
6621 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6622 }
6623
6624 if (c->root_hash_sig_path)
6625 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6626
6627 if (c->root_verity)
6628 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6629
6630 STRV_FOREACH(e, c->environment)
6631 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6632
6633 STRV_FOREACH(e, c->environment_files)
6634 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
6635
6636 STRV_FOREACH(e, c->pass_environment)
6637 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6638
6639 STRV_FOREACH(e, c->unset_environment)
6640 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6641
6642 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6643
6644 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
6645 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6646
6647 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6648 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6649
6650 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6651 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6652 }
6653 }
6654
6655 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
6656
6657 if (c->nice_set)
6658 fprintf(f, "%sNice: %i\n", prefix, c->nice);
6659
6660 if (c->oom_score_adjust_set)
6661 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
6662
6663 if (c->coredump_filter_set)
6664 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
6665
6666 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
6667 if (c->rlimit[i]) {
6668 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
6669 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
6670 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
6671 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6672 }
6673
6674 if (c->ioprio_set) {
6675 _cleanup_free_ char *class_str = NULL;
6676
6677 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
6678 if (r >= 0)
6679 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6680
6681 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
6682 }
6683
6684 if (c->cpu_sched_set) {
6685 _cleanup_free_ char *policy_str = NULL;
6686
6687 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6688 if (r >= 0)
6689 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6690
6691 fprintf(f,
6692 "%sCPUSchedulingPriority: %i\n"
6693 "%sCPUSchedulingResetOnFork: %s\n",
6694 prefix, c->cpu_sched_priority,
6695 prefix, yes_no(c->cpu_sched_reset_on_fork));
6696 }
6697
6698 if (c->cpu_set.set) {
6699 _cleanup_free_ char *affinity = NULL;
6700
6701 affinity = cpu_set_to_range_string(&c->cpu_set);
6702 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
6703 }
6704
6705 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6706 _cleanup_free_ char *nodes = NULL;
6707
6708 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6709 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6710 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6711 }
6712
6713 if (c->timer_slack_nsec != NSEC_INFINITY)
6714 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
6715
6716 fprintf(f,
6717 "%sStandardInput: %s\n"
6718 "%sStandardOutput: %s\n"
6719 "%sStandardError: %s\n",
6720 prefix, exec_input_to_string(c->std_input),
6721 prefix, exec_output_to_string(c->std_output),
6722 prefix, exec_output_to_string(c->std_error));
6723
6724 if (c->std_input == EXEC_INPUT_NAMED_FD)
6725 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6726 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6727 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6728 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6729 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6730
6731 if (c->std_input == EXEC_INPUT_FILE)
6732 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6733 if (c->std_output == EXEC_OUTPUT_FILE)
6734 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6735 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6736 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6737 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6738 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
6739 if (c->std_error == EXEC_OUTPUT_FILE)
6740 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6741 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6742 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6743 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6744 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
6745
6746 if (c->tty_path)
6747 fprintf(f,
6748 "%sTTYPath: %s\n"
6749 "%sTTYReset: %s\n"
6750 "%sTTYVHangup: %s\n"
6751 "%sTTYVTDisallocate: %s\n"
6752 "%sTTYRows: %u\n"
6753 "%sTTYColumns: %u\n",
6754 prefix, c->tty_path,
6755 prefix, yes_no(c->tty_reset),
6756 prefix, yes_no(c->tty_vhangup),
6757 prefix, yes_no(c->tty_vt_disallocate),
6758 prefix, c->tty_rows,
6759 prefix, c->tty_cols);
6760
6761 if (IN_SET(c->std_output,
6762 EXEC_OUTPUT_KMSG,
6763 EXEC_OUTPUT_JOURNAL,
6764 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6765 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6766 IN_SET(c->std_error,
6767 EXEC_OUTPUT_KMSG,
6768 EXEC_OUTPUT_JOURNAL,
6769 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6770 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
6771
6772 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
6773
6774 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6775 if (r >= 0)
6776 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
6777
6778 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6779 if (r >= 0)
6780 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
6781 }
6782
6783 if (c->log_level_max >= 0) {
6784 _cleanup_free_ char *t = NULL;
6785
6786 (void) log_level_to_string_alloc(c->log_level_max, &t);
6787
6788 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6789 }
6790
6791 if (c->log_ratelimit_interval_usec > 0)
6792 fprintf(f,
6793 "%sLogRateLimitIntervalSec: %s\n",
6794 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
6795
6796 if (c->log_ratelimit_burst > 0)
6797 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
6798
6799 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6800 fprintf(f, "%sLogFilterPatterns:", prefix);
6801
6802 char *pattern;
6803 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6804 fprintf(f, " %s", pattern);
6805 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6806 fprintf(f, " ~%s", pattern);
6807 fputc('\n', f);
6808 }
6809
6810 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6811 fprintf(f, "%sLogExtraFields: ", prefix);
6812 fwrite(c->log_extra_fields[j].iov_base,
6813 1, c->log_extra_fields[j].iov_len,
6814 f);
6815 fputc('\n', f);
6816 }
6817
6818 if (c->log_namespace)
6819 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6820
6821 if (c->secure_bits) {
6822 _cleanup_free_ char *str = NULL;
6823
6824 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6825 if (r >= 0)
6826 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6827 }
6828
6829 if (c->capability_bounding_set != CAP_MASK_UNSET) {
6830 _cleanup_free_ char *str = NULL;
6831
6832 r = capability_set_to_string(c->capability_bounding_set, &str);
6833 if (r >= 0)
6834 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
6835 }
6836
6837 if (c->capability_ambient_set != 0) {
6838 _cleanup_free_ char *str = NULL;
6839
6840 r = capability_set_to_string(c->capability_ambient_set, &str);
6841 if (r >= 0)
6842 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
6843 }
6844
6845 if (c->user)
6846 fprintf(f, "%sUser: %s\n", prefix, c->user);
6847 if (c->group)
6848 fprintf(f, "%sGroup: %s\n", prefix, c->group);
6849
6850 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6851
6852 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
6853
6854 if (c->pam_name)
6855 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
6856
6857 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6858 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6859 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6860 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6861 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
6862 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
6863
6864 for (size_t i = 0; i < c->n_bind_mounts; i++)
6865 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6866 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6867 c->bind_mounts[i].ignore_enoent ? "-": "",
6868 c->bind_mounts[i].source,
6869 c->bind_mounts[i].destination,
6870 c->bind_mounts[i].recursive ? "rbind" : "norbind");
6871
6872 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6873 const TemporaryFileSystem *t = c->temporary_filesystems + i;
6874
6875 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6876 t->path,
6877 isempty(t->options) ? "" : ":",
6878 strempty(t->options));
6879 }
6880
6881 if (c->utmp_id)
6882 fprintf(f,
6883 "%sUtmpIdentifier: %s\n",
6884 prefix, c->utmp_id);
6885
6886 if (c->selinux_context)
6887 fprintf(f,
6888 "%sSELinuxContext: %s%s\n",
6889 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
6890
6891 if (c->apparmor_profile)
6892 fprintf(f,
6893 "%sAppArmorProfile: %s%s\n",
6894 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6895
6896 if (c->smack_process_label)
6897 fprintf(f,
6898 "%sSmackProcessLabel: %s%s\n",
6899 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6900
6901 if (c->personality != PERSONALITY_INVALID)
6902 fprintf(f,
6903 "%sPersonality: %s\n",
6904 prefix, strna(personality_to_string(c->personality)));
6905
6906 fprintf(f,
6907 "%sLockPersonality: %s\n",
6908 prefix, yes_no(c->lock_personality));
6909
6910 if (c->syscall_filter) {
6911 fprintf(f,
6912 "%sSystemCallFilter: ",
6913 prefix);
6914
6915 if (!c->syscall_allow_list)
6916 fputc('~', f);
6917
6918 #if HAVE_SECCOMP
6919 void *id, *val;
6920 bool first = true;
6921 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
6922 _cleanup_free_ char *name = NULL;
6923 const char *errno_name = NULL;
6924 int num = PTR_TO_INT(val);
6925
6926 if (first)
6927 first = false;
6928 else
6929 fputc(' ', f);
6930
6931 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
6932 fputs(strna(name), f);
6933
6934 if (num >= 0) {
6935 errno_name = seccomp_errno_or_action_to_string(num);
6936 if (errno_name)
6937 fprintf(f, ":%s", errno_name);
6938 else
6939 fprintf(f, ":%d", num);
6940 }
6941 }
6942 #endif
6943
6944 fputc('\n', f);
6945 }
6946
6947 if (c->syscall_archs) {
6948 fprintf(f,
6949 "%sSystemCallArchitectures:",
6950 prefix);
6951
6952 #if HAVE_SECCOMP
6953 void *id;
6954 SET_FOREACH(id, c->syscall_archs)
6955 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6956 #endif
6957 fputc('\n', f);
6958 }
6959
6960 if (exec_context_restrict_namespaces_set(c)) {
6961 _cleanup_free_ char *s = NULL;
6962
6963 r = namespace_flags_to_string(c->restrict_namespaces, &s);
6964 if (r >= 0)
6965 fprintf(f, "%sRestrictNamespaces: %s\n",
6966 prefix, strna(s));
6967 }
6968
6969 #if HAVE_LIBBPF
6970 if (exec_context_restrict_filesystems_set(c)) {
6971 char *fs;
6972 SET_FOREACH(fs, c->restrict_filesystems)
6973 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6974 }
6975 #endif
6976
6977 if (c->network_namespace_path)
6978 fprintf(f,
6979 "%sNetworkNamespacePath: %s\n",
6980 prefix, c->network_namespace_path);
6981
6982 if (c->syscall_errno > 0) {
6983 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6984
6985 #if HAVE_SECCOMP
6986 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
6987 if (errno_name)
6988 fputs(errno_name, f);
6989 else
6990 fprintf(f, "%d", c->syscall_errno);
6991 #endif
6992 fputc('\n', f);
6993 }
6994
6995 for (size_t i = 0; i < c->n_mount_images; i++) {
6996 fprintf(f, "%sMountImages: %s%s:%s", prefix,
6997 c->mount_images[i].ignore_enoent ? "-": "",
6998 c->mount_images[i].source,
6999 c->mount_images[i].destination);
7000 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
7001 fprintf(f, ":%s:%s",
7002 partition_designator_to_string(o->partition_designator),
7003 strempty(o->options));
7004 fprintf(f, "\n");
7005 }
7006
7007 for (size_t i = 0; i < c->n_extension_images; i++) {
7008 fprintf(f, "%sExtensionImages: %s%s", prefix,
7009 c->extension_images[i].ignore_enoent ? "-": "",
7010 c->extension_images[i].source);
7011 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
7012 fprintf(f, ":%s:%s",
7013 partition_designator_to_string(o->partition_designator),
7014 strempty(o->options));
7015 fprintf(f, "\n");
7016 }
7017
7018 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
7019 }
7020
7021 bool exec_context_maintains_privileges(const ExecContext *c) {
7022 assert(c);
7023
7024 /* Returns true if the process forked off would run under
7025 * an unchanged UID or as root. */
7026
7027 if (!c->user)
7028 return true;
7029
7030 if (streq(c->user, "root") || streq(c->user, "0"))
7031 return true;
7032
7033 return false;
7034 }
7035
7036 int exec_context_get_effective_ioprio(const ExecContext *c) {
7037 int p;
7038
7039 assert(c);
7040
7041 if (c->ioprio_set)
7042 return c->ioprio;
7043
7044 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
7045 if (p < 0)
7046 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7047
7048 return ioprio_normalize(p);
7049 }
7050
7051 bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
7052 assert(c);
7053
7054 /* Explicit setting wins */
7055 if (c->mount_apivfs_set)
7056 return c->mount_apivfs;
7057
7058 /* Default to "yes" if root directory or image are specified */
7059 if (exec_context_with_rootfs(c))
7060 return true;
7061
7062 return false;
7063 }
7064
7065 void exec_context_free_log_extra_fields(ExecContext *c) {
7066 assert(c);
7067
7068 for (size_t l = 0; l < c->n_log_extra_fields; l++)
7069 free(c->log_extra_fields[l].iov_base);
7070 c->log_extra_fields = mfree(c->log_extra_fields);
7071 c->n_log_extra_fields = 0;
7072 }
7073
7074 void exec_context_revert_tty(ExecContext *c) {
7075 _cleanup_close_ int fd = -EBADF;
7076 const char *path;
7077 struct stat st;
7078 int r;
7079
7080 assert(c);
7081
7082 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
7083 exec_context_tty_reset(c, NULL);
7084
7085 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
7086 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
7087 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
7088 if (!exec_context_may_touch_tty(c))
7089 return;
7090
7091 path = exec_context_tty_path(c);
7092 if (!path)
7093 return;
7094
7095 fd = open(path, O_PATH|O_CLOEXEC);
7096 if (fd < 0)
7097 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
7098 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
7099 path);
7100
7101 if (fstat(fd, &st) < 0)
7102 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
7103
7104 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
7105 * if things are a character device, since a proper check either means we'd have to open the TTY and
7106 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
7107 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
7108 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
7109 if (!S_ISCHR(st.st_mode))
7110 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
7111
7112 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
7113 if (r < 0)
7114 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
7115 }
7116
7117 int exec_context_get_clean_directories(
7118 ExecContext *c,
7119 char **prefix,
7120 ExecCleanMask mask,
7121 char ***ret) {
7122
7123 _cleanup_strv_free_ char **l = NULL;
7124 int r;
7125
7126 assert(c);
7127 assert(prefix);
7128 assert(ret);
7129
7130 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
7131 if (!FLAGS_SET(mask, 1U << t))
7132 continue;
7133
7134 if (!prefix[t])
7135 continue;
7136
7137 for (size_t i = 0; i < c->directories[t].n_items; i++) {
7138 char *j;
7139
7140 j = path_join(prefix[t], c->directories[t].items[i].path);
7141 if (!j)
7142 return -ENOMEM;
7143
7144 r = strv_consume(&l, j);
7145 if (r < 0)
7146 return r;
7147
7148 /* Also remove private directories unconditionally. */
7149 if (t != EXEC_DIRECTORY_CONFIGURATION) {
7150 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
7151 if (!j)
7152 return -ENOMEM;
7153
7154 r = strv_consume(&l, j);
7155 if (r < 0)
7156 return r;
7157 }
7158
7159 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
7160 j = path_join(prefix[t], *symlink);
7161 if (!j)
7162 return -ENOMEM;
7163
7164 r = strv_consume(&l, j);
7165 if (r < 0)
7166 return r;
7167 }
7168 }
7169 }
7170
7171 *ret = TAKE_PTR(l);
7172 return 0;
7173 }
7174
7175 int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
7176 ExecCleanMask mask = 0;
7177
7178 assert(c);
7179 assert(ret);
7180
7181 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
7182 if (c->directories[t].n_items > 0)
7183 mask |= 1U << t;
7184
7185 *ret = mask;
7186 return 0;
7187 }
7188
7189 bool exec_context_has_encrypted_credentials(ExecContext *c) {
7190 ExecLoadCredential *load_cred;
7191 ExecSetCredential *set_cred;
7192
7193 assert(c);
7194
7195 HASHMAP_FOREACH(load_cred, c->load_credentials)
7196 if (load_cred->encrypted)
7197 return true;
7198
7199 HASHMAP_FOREACH(set_cred, c->set_credentials)
7200 if (set_cred->encrypted)
7201 return true;
7202
7203 return false;
7204 }
7205
7206 void exec_status_start(ExecStatus *s, pid_t pid) {
7207 assert(s);
7208
7209 *s = (ExecStatus) {
7210 .pid = pid,
7211 };
7212
7213 dual_timestamp_get(&s->start_timestamp);
7214 }
7215
7216 void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
7217 assert(s);
7218
7219 if (s->pid != pid)
7220 *s = (ExecStatus) {
7221 .pid = pid,
7222 };
7223
7224 dual_timestamp_get(&s->exit_timestamp);
7225
7226 s->code = code;
7227 s->status = status;
7228
7229 if (context && context->utmp_id)
7230 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
7231 }
7232
7233 void exec_status_reset(ExecStatus *s) {
7234 assert(s);
7235
7236 *s = (ExecStatus) {};
7237 }
7238
7239 void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
7240 assert(s);
7241 assert(f);
7242
7243 if (s->pid <= 0)
7244 return;
7245
7246 prefix = strempty(prefix);
7247
7248 fprintf(f,
7249 "%sPID: "PID_FMT"\n",
7250 prefix, s->pid);
7251
7252 if (dual_timestamp_is_set(&s->start_timestamp))
7253 fprintf(f,
7254 "%sStart Timestamp: %s\n",
7255 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
7256
7257 if (dual_timestamp_is_set(&s->exit_timestamp))
7258 fprintf(f,
7259 "%sExit Timestamp: %s\n"
7260 "%sExit Code: %s\n"
7261 "%sExit Status: %i\n",
7262 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
7263 prefix, sigchld_code_to_string(s->code),
7264 prefix, s->status);
7265 }
7266
7267 static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
7268 _cleanup_free_ char *cmd = NULL;
7269 const char *prefix2;
7270
7271 assert(c);
7272 assert(f);
7273
7274 prefix = strempty(prefix);
7275 prefix2 = strjoina(prefix, "\t");
7276
7277 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
7278
7279 fprintf(f,
7280 "%sCommand Line: %s\n",
7281 prefix, strnull(cmd));
7282
7283 exec_status_dump(&c->exec_status, f, prefix2);
7284 }
7285
7286 void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
7287 assert(f);
7288
7289 prefix = strempty(prefix);
7290
7291 LIST_FOREACH(command, i, c)
7292 exec_command_dump(i, f, prefix);
7293 }
7294
7295 void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
7296 ExecCommand *end;
7297
7298 assert(l);
7299 assert(e);
7300
7301 if (*l) {
7302 /* It's kind of important, that we keep the order here */
7303 end = LIST_FIND_TAIL(command, *l);
7304 LIST_INSERT_AFTER(command, *l, end, e);
7305 } else
7306 *l = e;
7307 }
7308
7309 int exec_command_set(ExecCommand *c, const char *path, ...) {
7310 va_list ap;
7311 char **l, *p;
7312
7313 assert(c);
7314 assert(path);
7315
7316 va_start(ap, path);
7317 l = strv_new_ap(path, ap);
7318 va_end(ap);
7319
7320 if (!l)
7321 return -ENOMEM;
7322
7323 p = strdup(path);
7324 if (!p) {
7325 strv_free(l);
7326 return -ENOMEM;
7327 }
7328
7329 free_and_replace(c->path, p);
7330
7331 return strv_free_and_replace(c->argv, l);
7332 }
7333
7334 int exec_command_append(ExecCommand *c, const char *path, ...) {
7335 _cleanup_strv_free_ char **l = NULL;
7336 va_list ap;
7337 int r;
7338
7339 assert(c);
7340 assert(path);
7341
7342 va_start(ap, path);
7343 l = strv_new_ap(path, ap);
7344 va_end(ap);
7345
7346 if (!l)
7347 return -ENOMEM;
7348
7349 r = strv_extend_strv(&c->argv, l, false);
7350 if (r < 0)
7351 return r;
7352
7353 return 0;
7354 }
7355
7356 static char *destroy_tree(char *path) {
7357 if (!path)
7358 return NULL;
7359
7360 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
7361 log_debug("Spawning process to nuke '%s'", path);
7362
7363 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
7364 }
7365
7366 return mfree(path);
7367 }
7368
7369 static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
7370 if (!rt)
7371 return NULL;
7372
7373 if (rt->manager)
7374 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
7375
7376 rt->id = mfree(rt->id);
7377 rt->tmp_dir = mfree(rt->tmp_dir);
7378 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
7379 safe_close_pair(rt->netns_storage_socket);
7380 safe_close_pair(rt->ipcns_storage_socket);
7381 return mfree(rt);
7382 }
7383
7384 DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
7385 DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
7386
7387 ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
7388 if (!rt)
7389 return NULL;
7390
7391 assert(rt->n_ref > 0);
7392 rt->n_ref--;
7393
7394 if (rt->n_ref > 0)
7395 return NULL;
7396
7397 rt->tmp_dir = destroy_tree(rt->tmp_dir);
7398 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
7399
7400 return exec_shared_runtime_free(rt);
7401 }
7402
7403 static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
7404 _cleanup_free_ char *id_copy = NULL;
7405 ExecSharedRuntime *n;
7406
7407 assert(ret);
7408
7409 id_copy = strdup(id);
7410 if (!id_copy)
7411 return -ENOMEM;
7412
7413 n = new(ExecSharedRuntime, 1);
7414 if (!n)
7415 return -ENOMEM;
7416
7417 *n = (ExecSharedRuntime) {
7418 .id = TAKE_PTR(id_copy),
7419 .netns_storage_socket = PIPE_EBADF,
7420 .ipcns_storage_socket = PIPE_EBADF,
7421 };
7422
7423 *ret = n;
7424 return 0;
7425 }
7426
7427 static int exec_shared_runtime_add(
7428 Manager *m,
7429 const char *id,
7430 char **tmp_dir,
7431 char **var_tmp_dir,
7432 int netns_storage_socket[2],
7433 int ipcns_storage_socket[2],
7434 ExecSharedRuntime **ret) {
7435
7436 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
7437 int r;
7438
7439 assert(m);
7440 assert(id);
7441
7442 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
7443
7444 r = exec_shared_runtime_allocate(&rt, id);
7445 if (r < 0)
7446 return r;
7447
7448 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
7449 if (r < 0)
7450 return r;
7451
7452 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
7453 rt->tmp_dir = TAKE_PTR(*tmp_dir);
7454 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
7455
7456 if (netns_storage_socket) {
7457 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
7458 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
7459 }
7460
7461 if (ipcns_storage_socket) {
7462 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
7463 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
7464 }
7465
7466 rt->manager = m;
7467
7468 if (ret)
7469 *ret = rt;
7470 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
7471 TAKE_PTR(rt);
7472 return 0;
7473 }
7474
7475 static int exec_shared_runtime_make(
7476 Manager *m,
7477 const ExecContext *c,
7478 const char *id,
7479 ExecSharedRuntime **ret) {
7480
7481 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
7482 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
7483 int r;
7484
7485 assert(m);
7486 assert(c);
7487 assert(id);
7488
7489 /* It is not necessary to create ExecSharedRuntime object. */
7490 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
7491 *ret = NULL;
7492 return 0;
7493 }
7494
7495 if (c->private_tmp &&
7496 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
7497 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
7498 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
7499 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
7500 if (r < 0)
7501 return r;
7502 }
7503
7504 if (exec_needs_network_namespace(c)) {
7505 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
7506 return -errno;
7507 }
7508
7509 if (exec_needs_ipc_namespace(c)) {
7510 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
7511 return -errno;
7512 }
7513
7514 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
7515 if (r < 0)
7516 return r;
7517
7518 return 1;
7519 }
7520
7521 int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
7522 ExecSharedRuntime *rt;
7523 int r;
7524
7525 assert(m);
7526 assert(id);
7527 assert(ret);
7528
7529 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
7530 if (rt)
7531 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
7532 goto ref;
7533
7534 if (!create) {
7535 *ret = NULL;
7536 return 0;
7537 }
7538
7539 /* If not found, then create a new object. */
7540 r = exec_shared_runtime_make(m, c, id, &rt);
7541 if (r < 0)
7542 return r;
7543 if (r == 0) {
7544 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
7545 *ret = NULL;
7546 return 0;
7547 }
7548
7549 ref:
7550 /* increment reference counter. */
7551 rt->n_ref++;
7552 *ret = rt;
7553 return 1;
7554 }
7555
7556 int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7557 ExecSharedRuntime *rt;
7558
7559 assert(m);
7560 assert(f);
7561 assert(fds);
7562
7563 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7564 fprintf(f, "exec-runtime=%s", rt->id);
7565
7566 if (rt->tmp_dir)
7567 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
7568
7569 if (rt->var_tmp_dir)
7570 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
7571
7572 if (rt->netns_storage_socket[0] >= 0) {
7573 int copy;
7574
7575 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7576 if (copy < 0)
7577 return copy;
7578
7579 fprintf(f, " netns-socket-0=%i", copy);
7580 }
7581
7582 if (rt->netns_storage_socket[1] >= 0) {
7583 int copy;
7584
7585 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7586 if (copy < 0)
7587 return copy;
7588
7589 fprintf(f, " netns-socket-1=%i", copy);
7590 }
7591
7592 if (rt->ipcns_storage_socket[0] >= 0) {
7593 int copy;
7594
7595 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7596 if (copy < 0)
7597 return copy;
7598
7599 fprintf(f, " ipcns-socket-0=%i", copy);
7600 }
7601
7602 if (rt->ipcns_storage_socket[1] >= 0) {
7603 int copy;
7604
7605 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7606 if (copy < 0)
7607 return copy;
7608
7609 fprintf(f, " ipcns-socket-1=%i", copy);
7610 }
7611
7612 fputc('\n', f);
7613 }
7614
7615 return 0;
7616 }
7617
7618 int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7619 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
7620 ExecSharedRuntime *rt;
7621 int r;
7622
7623 /* This is for the migration from old (v237 or earlier) deserialization text.
7624 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7625 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
7626 * so or not from the serialized text, then we always creates a new object owned by this. */
7627
7628 assert(u);
7629 assert(key);
7630 assert(value);
7631
7632 /* Manager manages ExecSharedRuntime objects by the unit id.
7633 * So, we omit the serialized text when the unit does not have id (yet?)... */
7634 if (isempty(u->id)) {
7635 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7636 return 0;
7637 }
7638
7639 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
7640 return log_oom();
7641
7642 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
7643 if (!rt) {
7644 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
7645 return log_oom();
7646
7647 rt = rt_create;
7648 }
7649
7650 if (streq(key, "tmp-dir")) {
7651 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7652 return -ENOMEM;
7653
7654 } else if (streq(key, "var-tmp-dir")) {
7655 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7656 return -ENOMEM;
7657
7658 } else if (streq(key, "netns-socket-0")) {
7659 int fd;
7660
7661 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7662 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7663 return 0;
7664 }
7665
7666 safe_close(rt->netns_storage_socket[0]);
7667 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7668
7669 } else if (streq(key, "netns-socket-1")) {
7670 int fd;
7671
7672 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
7673 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
7674 return 0;
7675 }
7676
7677 safe_close(rt->netns_storage_socket[1]);
7678 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
7679
7680 } else
7681 return 0;
7682
7683 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
7684 if (rt_create) {
7685 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
7686 if (r < 0) {
7687 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
7688 return 0;
7689 }
7690
7691 rt_create->manager = u->manager;
7692
7693 /* Avoid cleanup */
7694 TAKE_PTR(rt_create);
7695 }
7696
7697 return 1;
7698 }
7699
7700 int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7701 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7702 char *id = NULL;
7703 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
7704 const char *p, *v = ASSERT_PTR(value);
7705 size_t n;
7706
7707 assert(m);
7708 assert(fds);
7709
7710 n = strcspn(v, " ");
7711 id = strndupa_safe(v, n);
7712 if (v[n] != ' ')
7713 goto finalize;
7714 p = v + n + 1;
7715
7716 v = startswith(p, "tmp-dir=");
7717 if (v) {
7718 n = strcspn(v, " ");
7719 tmp_dir = strndup(v, n);
7720 if (!tmp_dir)
7721 return log_oom();
7722 if (v[n] != ' ')
7723 goto finalize;
7724 p = v + n + 1;
7725 }
7726
7727 v = startswith(p, "var-tmp-dir=");
7728 if (v) {
7729 n = strcspn(v, " ");
7730 var_tmp_dir = strndup(v, n);
7731 if (!var_tmp_dir)
7732 return log_oom();
7733 if (v[n] != ' ')
7734 goto finalize;
7735 p = v + n + 1;
7736 }
7737
7738 v = startswith(p, "netns-socket-0=");
7739 if (v) {
7740 char *buf;
7741
7742 n = strcspn(v, " ");
7743 buf = strndupa_safe(v, n);
7744
7745 netns_fdpair[0] = parse_fd(buf);
7746 if (netns_fdpair[0] < 0)
7747 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
7748 if (!fdset_contains(fds, netns_fdpair[0]))
7749 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7750 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7751 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
7752 if (v[n] != ' ')
7753 goto finalize;
7754 p = v + n + 1;
7755 }
7756
7757 v = startswith(p, "netns-socket-1=");
7758 if (v) {
7759 char *buf;
7760
7761 n = strcspn(v, " ");
7762 buf = strndupa_safe(v, n);
7763
7764 netns_fdpair[1] = parse_fd(buf);
7765 if (netns_fdpair[1] < 0)
7766 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
7767 if (!fdset_contains(fds, netns_fdpair[1]))
7768 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7769 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7770 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7771 if (v[n] != ' ')
7772 goto finalize;
7773 p = v + n + 1;
7774 }
7775
7776 v = startswith(p, "ipcns-socket-0=");
7777 if (v) {
7778 char *buf;
7779
7780 n = strcspn(v, " ");
7781 buf = strndupa_safe(v, n);
7782
7783 ipcns_fdpair[0] = parse_fd(buf);
7784 if (ipcns_fdpair[0] < 0)
7785 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7786 if (!fdset_contains(fds, ipcns_fdpair[0]))
7787 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7788 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7789 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7790 if (v[n] != ' ')
7791 goto finalize;
7792 p = v + n + 1;
7793 }
7794
7795 v = startswith(p, "ipcns-socket-1=");
7796 if (v) {
7797 char *buf;
7798
7799 n = strcspn(v, " ");
7800 buf = strndupa_safe(v, n);
7801
7802 ipcns_fdpair[1] = parse_fd(buf);
7803 if (ipcns_fdpair[1] < 0)
7804 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7805 if (!fdset_contains(fds, ipcns_fdpair[1]))
7806 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7807 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7808 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
7809 }
7810
7811 finalize:
7812 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7813 if (r < 0)
7814 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7815 return 0;
7816 }
7817
7818 void exec_shared_runtime_vacuum(Manager *m) {
7819 ExecSharedRuntime *rt;
7820
7821 assert(m);
7822
7823 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
7824
7825 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
7826 if (rt->n_ref > 0)
7827 continue;
7828
7829 (void) exec_shared_runtime_free(rt);
7830 }
7831 }
7832
7833 int exec_runtime_make(
7834 const Unit *unit,
7835 const ExecContext *context,
7836 ExecSharedRuntime *shared,
7837 DynamicCreds *creds,
7838 ExecRuntime **ret) {
7839 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7840 _cleanup_free_ char *ephemeral = NULL;
7841 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
7842 int r;
7843
7844 assert(unit);
7845 assert(context);
7846 assert(ret);
7847
7848 if (!shared && !creds && !exec_needs_ephemeral(context)) {
7849 *ret = NULL;
7850 return 0;
7851 }
7852
7853 if (exec_needs_ephemeral(context)) {
7854 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7855 if (r < 0)
7856 return r;
7857
7858 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7859 if (r < 0)
7860 return r;
7861
7862 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7863 return -errno;
7864 }
7865
7866 rt = new(ExecRuntime, 1);
7867 if (!rt)
7868 return -ENOMEM;
7869
7870 *rt = (ExecRuntime) {
7871 .shared = shared,
7872 .dynamic_creds = creds,
7873 .ephemeral_copy = TAKE_PTR(ephemeral),
7874 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7875 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
7876 };
7877
7878 *ret = TAKE_PTR(rt);
7879 return 1;
7880 }
7881
7882 ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7883 if (!rt)
7884 return NULL;
7885
7886 exec_shared_runtime_unref(rt->shared);
7887 dynamic_creds_unref(rt->dynamic_creds);
7888
7889 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
7890
7891 safe_close_pair(rt->ephemeral_storage_socket);
7892 return mfree(rt);
7893 }
7894
7895 ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7896 if (!rt)
7897 return NULL;
7898
7899 rt->shared = exec_shared_runtime_destroy(rt->shared);
7900 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
7901 return exec_runtime_free(rt);
7902 }
7903
7904 void exec_params_clear(ExecParameters *p) {
7905 if (!p)
7906 return;
7907
7908 p->environment = strv_free(p->environment);
7909 p->fd_names = strv_free(p->fd_names);
7910 p->fds = mfree(p->fds);
7911 p->exec_fd = safe_close(p->exec_fd);
7912 }
7913
7914 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7915 if (!sc)
7916 return NULL;
7917
7918 free(sc->id);
7919 free(sc->data);
7920 return mfree(sc);
7921 }
7922
7923 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7924 if (!lc)
7925 return NULL;
7926
7927 free(lc->id);
7928 free(lc->path);
7929 return mfree(lc);
7930 }
7931
7932 void exec_directory_done(ExecDirectory *d) {
7933 if (!d)
7934 return;
7935
7936 for (size_t i = 0; i < d->n_items; i++) {
7937 free(d->items[i].path);
7938 strv_free(d->items[i].symlinks);
7939 }
7940
7941 d->items = mfree(d->items);
7942 d->n_items = 0;
7943 d->mode = 0755;
7944 }
7945
7946 static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7947 assert(d);
7948 assert(path);
7949
7950 for (size_t i = 0; i < d->n_items; i++)
7951 if (path_equal(d->items[i].path, path))
7952 return &d->items[i];
7953
7954 return NULL;
7955 }
7956
7957 int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
7958 _cleanup_strv_free_ char **s = NULL;
7959 _cleanup_free_ char *p = NULL;
7960 ExecDirectoryItem *existing;
7961 int r;
7962
7963 assert(d);
7964 assert(path);
7965
7966 existing = exec_directory_find(d, path);
7967 if (existing) {
7968 r = strv_extend(&existing->symlinks, symlink);
7969 if (r < 0)
7970 return r;
7971
7972 return 0; /* existing item is updated */
7973 }
7974
7975 p = strdup(path);
7976 if (!p)
7977 return -ENOMEM;
7978
7979 if (symlink) {
7980 s = strv_new(symlink);
7981 if (!s)
7982 return -ENOMEM;
7983 }
7984
7985 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
7986 return -ENOMEM;
7987
7988 d->items[d->n_items++] = (ExecDirectoryItem) {
7989 .path = TAKE_PTR(p),
7990 .symlinks = TAKE_PTR(s),
7991 };
7992
7993 return 1; /* new item is added */
7994 }
7995
7996 static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7997 assert(a);
7998 assert(b);
7999
8000 return path_compare(a->path, b->path);
8001 }
8002
8003 void exec_directory_sort(ExecDirectory *d) {
8004 assert(d);
8005
8006 /* Sort the exec directories to make always parent directories processed at first in
8007 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
8008 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
8009 * list. See also comments in setup_exec_directory() and issue #24783. */
8010
8011 if (d->n_items <= 1)
8012 return;
8013
8014 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
8015
8016 for (size_t i = 1; i < d->n_items; i++)
8017 for (size_t j = 0; j < i; j++)
8018 if (path_startswith(d->items[i].path, d->items[j].path)) {
8019 d->items[i].only_create = true;
8020 break;
8021 }
8022 }
8023
8024 ExecCleanMask exec_clean_mask_from_string(const char *s) {
8025 ExecDirectoryType t;
8026
8027 assert(s);
8028
8029 if (streq(s, "all"))
8030 return EXEC_CLEAN_ALL;
8031 if (streq(s, "fdstore"))
8032 return EXEC_CLEAN_FDSTORE;
8033
8034 t = exec_resource_type_from_string(s);
8035 if (t < 0)
8036 return (ExecCleanMask) t;
8037
8038 return 1U << t;
8039 }
8040
8041 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
8042 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
8043
8044 static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
8045 [EXEC_INPUT_NULL] = "null",
8046 [EXEC_INPUT_TTY] = "tty",
8047 [EXEC_INPUT_TTY_FORCE] = "tty-force",
8048 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
8049 [EXEC_INPUT_SOCKET] = "socket",
8050 [EXEC_INPUT_NAMED_FD] = "fd",
8051 [EXEC_INPUT_DATA] = "data",
8052 [EXEC_INPUT_FILE] = "file",
8053 };
8054
8055 DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
8056
8057 static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
8058 [EXEC_OUTPUT_INHERIT] = "inherit",
8059 [EXEC_OUTPUT_NULL] = "null",
8060 [EXEC_OUTPUT_TTY] = "tty",
8061 [EXEC_OUTPUT_KMSG] = "kmsg",
8062 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
8063 [EXEC_OUTPUT_JOURNAL] = "journal",
8064 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
8065 [EXEC_OUTPUT_SOCKET] = "socket",
8066 [EXEC_OUTPUT_NAMED_FD] = "fd",
8067 [EXEC_OUTPUT_FILE] = "file",
8068 [EXEC_OUTPUT_FILE_APPEND] = "append",
8069 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
8070 };
8071
8072 DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
8073
8074 static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
8075 [EXEC_UTMP_INIT] = "init",
8076 [EXEC_UTMP_LOGIN] = "login",
8077 [EXEC_UTMP_USER] = "user",
8078 };
8079
8080 DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
8081
8082 static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
8083 [EXEC_PRESERVE_NO] = "no",
8084 [EXEC_PRESERVE_YES] = "yes",
8085 [EXEC_PRESERVE_RESTART] = "restart",
8086 };
8087
8088 DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
8089
8090 /* This table maps ExecDirectoryType to the setting it is configured with in the unit */
8091 static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8092 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
8093 [EXEC_DIRECTORY_STATE] = "StateDirectory",
8094 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
8095 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
8096 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
8097 };
8098
8099 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
8100
8101 /* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
8102 static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8103 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
8104 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
8105 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
8106 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
8107 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
8108 };
8109
8110 DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
8111
8112 /* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
8113 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
8114 * directories, specifically .timer units with their timestamp touch file. */
8115 static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8116 [EXEC_DIRECTORY_RUNTIME] = "runtime",
8117 [EXEC_DIRECTORY_STATE] = "state",
8118 [EXEC_DIRECTORY_CACHE] = "cache",
8119 [EXEC_DIRECTORY_LOGS] = "logs",
8120 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
8121 };
8122
8123 DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
8124
8125 /* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
8126 * the service payload in. */
8127 static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
8128 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
8129 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
8130 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
8131 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
8132 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
8133 };
8134
8135 DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
8136
8137 static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
8138 [EXEC_KEYRING_INHERIT] = "inherit",
8139 [EXEC_KEYRING_PRIVATE] = "private",
8140 [EXEC_KEYRING_SHARED] = "shared",
8141 };
8142
8143 DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);